chore(tokens): Better main

parent 425b8bf6
@@ -3,8 +3,8 @@ from keras_preprocessing.text import Tokenizer
 from KoozDawa.dawa.loader import load_kawa

-def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
-    # TODO Tokenize while keeping accents
+def get_sequence_of_tokens(corpus, tokenizer):
+    # TODO Tokenize while keeping apostrophes like j'aime
     tokenizer.fit_on_texts(corpus)
     total_words = len(tokenizer.word_index) + 1
@@ -18,12 +18,14 @@ def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
             input_sequences.append(n_gram_sequence)

     texts = tokenizer.sequences_to_texts(input_sequences)
-    print("Tokenized:", texts)
+    print("Tokenized:", texts[:5])
     return input_sequences, total_words

 if __name__ == '__main__':
     kawa = load_kawa("../")
-    seqs, words = get_sequence_of_tokens(kawa)
+    tokenizer = Tokenizer()
+    seqs, words = get_sequence_of_tokens(kawa, tokenizer)
+    texts = tokenizer.sequences_to_texts(seqs)
     print("%i words." % words)