from ._base import BaseTokenizer
class WordTokenizer(BaseTokenizer):
    """
    Whitespace-based word tokenization.
    """

    tokens_frequency = None
    def train(self, file_path):
        """Train using words' frequency.

        Args:
            file_path (str): path of the file to train on
        """
        print("Training WordTokenizer ...")
        # Count word frequencies in the file, then truncate the dictionary
        # to the most frequent words to form the vocabulary.
        self.vocab = self._truncate_dict(self._get_tokens_frequency(file_path))
        self.vocab_size = len(self.vocab)
    def tokenize(self, text):
        """Tokenize using the frequency dictionary.

        Args:
            text (str): input string

        Returns:
            list: generated tokens
        """
        assert self.vocab, "tokenizer must be trained before tokenizing"
        output_tokens = []
        for word in text.split():
            if word in self.vocab:
                output_tokens.append(word)
            else:
                # Out-of-vocabulary words map to the unknown token.
                output_tokens.append(self.unk_token)
        return output_tokens
    def detokenize(self, tokens):
        """Convert tokens to a string.

        Args:
            tokens (list): list of tokens

        Returns:
            str: detokenized string
        """
        detokenized = " ".join(tokens)
        return detokenized
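

# A minimal usage sketch, not part of the library itself. It assumes a
# plain-text training corpus at "corpus.txt" (hypothetical path) and that
# BaseTokenizer supplies unk_token, _truncate_dict, and
# _get_tokens_frequency as used above. Because this module uses a relative
# import, run it as part of its package (e.g. `python -m <package>.word`).
if __name__ == "__main__":
    tokenizer = WordTokenizer()
    tokenizer.train("corpus.txt")  # hypothetical corpus path
    tokens = tokenizer.tokenize("a sample sentence")
    print(tokens)  # in-vocab words pass through; others become unk_token
    print(tokenizer.detokenize(tokens))  # joins tokens back with spaces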