Source code for tkseem.character_tokenizer

import pickle
import re
from collections import defaultdict

from ._base import BaseTokenizer


class CharacterTokenizer(BaseTokenizer):
    """Character-based tokenization."""
    def train(self, file_path):
        """Train a character-level vocabulary from a text file.

        Args:
            file_path (str): path to the training file
        """
        print("Training CharacterTokenizer ...")
        # `\B(.)` matches every character that does not start a word, so the
        # substitution prefixes each non-initial character with " ##",
        # splitting words into WordPiece-style character tokens.
        rx = re.compile(r"\B(.)")
        with open(file_path, "r") as f:
            text = f.read()
        text = rx.sub(r" ##\1", text)
        # Count how often each character token occurs in the corpus.
        tokens_frequency = defaultdict(int)
        for word in text.split(" "):
            tokens_frequency[word] += 1
        # Truncate to the configured vocabulary size
        # (helper inherited from BaseTokenizer).
        self.vocab = self._truncate_dict(dict(tokens_frequency))
        self.vocab_size = len(self.vocab)
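    # A hedged illustration of the substitution above (not part of the
    # library source): for example,
    #     re.sub(r"\B(.)", r" ##\1", "hello world")
    # returns "h ##e ##l ##l ##o w ##o ##r ##l ##d", marking every
    # non-initial character with the "##" continuation prefix.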
    def tokenize(self, text):
        """Tokenize using the frequency dictionary.

        Args:
            text (str): input string

        Returns:
            list: generated tokens
        """
        # Apply the same " ##" character-splitting used during training.
        rx = re.compile(r"\B(.)")
        text = rx.sub(r" ##\1", text)
        output_tokens = []
        for token in text.split():
            if token in self.vocab:
                output_tokens.append(token)
            else:
                # Characters unseen during training map to the unknown token.
                output_tokens.append(self.unk_token)
        return output_tokens
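
A minimal usage sketch, not part of the module above. It assumes the package exposes CharacterTokenizer at the top level (as other tkseem tokenizers are) and that samples/data.txt is a hypothetical training file:

import tkseem as tk

tokenizer = tk.CharacterTokenizer()
tokenizer.train("samples/data.txt")   # hypothetical corpus path
print(tokenizer.tokenize("hello"))    # e.g. ['h', '##e', '##l', '##l', '##o']

The exact output depends on the training corpus: any character token not seen during training is returned as the tokenizer's unk_token.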