feat(tokens): turn the tokenizer into a class that configures punctuation filtering, case preservation, and an oov_token

parent de44f92a
...@@ -3,29 +3,35 @@ from keras_preprocessing.text import Tokenizer ...@@ -3,29 +3,35 @@ from keras_preprocessing.text import Tokenizer
from KoozDawa.dawa.loader import load_kawa from KoozDawa.dawa.loader import load_kawa
def get_sequence_of_tokens(corpus, tokenizer): class PoemTokenizer(Tokenizer):
# TODO Tokenize while keeping apostrophes like j'aime def __init__(self, **kwargs) -> None:
tokenizer.fit_on_texts(corpus) super().__init__(lower=False, filters='"#$%&()*+,-/<=>@[\\]^_`{|}~\t\n', oov_token="😢", **kwargs)
total_words = len(tokenizer.word_index) + 1
# convert data to sequence of tokens def get_sequence_of_tokens(self, corpus):
input_sequences = [] self.fit_on_texts(corpus)
total_words = len(self.word_index) + 1
for line in corpus: # convert data to sequence of tokens
token_list = tokenizer.texts_to_sequences([line])[0] input_sequences = []
for i in range(1, len(token_list)):
n_gram_sequence = token_list[:i + 1]
input_sequences.append(n_gram_sequence)
texts = tokenizer.sequences_to_texts(input_sequences) for line in corpus:
print("Tokenized:", texts[:5]) token_list = self.texts_to_sequences([line])[0]
for i in range(1, len(token_list)):
n_gram_sequence = token_list[:i + 1]
input_sequences.append(n_gram_sequence)
return input_sequences, total_words texts = self.sequences_to_texts(input_sequences)
print("Tokenized:", texts[:5])
return input_sequences, total_words
def get_text(self, sequence):
return self.sequences_to_texts(sequence)
if __name__ == '__main__': if __name__ == '__main__':
kawa = load_kawa("../") kawa = load_kawa("../")
tokenizer = Tokenizer() tokenizer = PoemTokenizer()
seqs, words = get_sequence_of_tokens(kawa, tokenizer) seqs, words = tokenizer.get_sequence_of_tokens(kawa)
texts = tokenizer.sequences_to_texts(seqs) texts = tokenizer.get_text(seqs)
print("%i words." % words) print("%i words." % words)
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment