chore(tokens): Better main

parent 425b8bf6
......@@ -3,8 +3,8 @@ from keras_preprocessing.text import Tokenizer
from KoozDawa.dawa.loader import load_kawa
def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
# TODO Tokenize while keeping accents
def get_sequence_of_tokens(corpus, tokenizer):
# TODO Tokenize while keeping apostrophes like j'aime
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
......@@ -18,12 +18,14 @@ def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
input_sequences.append(n_gram_sequence)
texts = tokenizer.sequences_to_texts(input_sequences)
print("Tokenized:", texts)
print("Tokenized:", texts[:5])
return input_sequences, total_words
if __name__ == '__main__':
    # Smoke-test driver: load the corpus, tokenize it, and report the
    # vocabulary size. (The diff residue had both the old one-argument call
    # and the new two-argument call; only the post-change call is kept.)
    kawa = load_kawa("../")
    tokenizer = Tokenizer()
    seqs, words = get_sequence_of_tokens(kawa, tokenizer)
    # Round-trip the sequences back to text; currently computed only for
    # manual inspection — TODO consider printing or removing `texts`.
    texts = tokenizer.sequences_to_texts(seqs)
    print("%i words." % words)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment