refactor(dawa): Extract loader & tokens

parent 5338bfc8
import os
import string

from numpy.random import seed
from tensorflow_core.python.framework.random_seed import set_random_seed


def load_kawa(root="./"):
    # set seeds for reproducibility
    set_random_seed(2)
    seed(1)

    data_dir = root + 'data/'
    all_lines = []
    for filename in os.listdir(data_dir):
        with open(data_dir + filename) as f:
            content = f.readlines()
        all_lines.extend(content)

    # keep only lines that do not start with "[" or "#"
    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]]
    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
    return all_lines
def clean_text(lines):
    """
    In the dataset preparation step, we first clean the text,
    which includes removing punctuation and lowercasing all words.
    """
    lines = " ".join(v for v in lines if v not in string.punctuation).lower()
    lines = lines.encode("utf8").decode("ascii", 'ignore')
    return lines


def main():
    lines = load_kawa("../")
    clean = clean_text(lines)
    print(clean)


if __name__ == '__main__':
    main()
import os
import string
import warnings

import numpy as np
from keras import Sequential
from keras.engine.saving import load_model
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from numpy.random import seed
from tensorflow_core.python.framework.random_seed import set_random_seed

from KoozDawa.dawa.loader import load_kawa, clean_text
from KoozDawa.dawa.tokens import get_sequence_of_tokens

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
def load():
    # set seeds for reproducibility
    set_random_seed(2)
    seed(1)

    data_dir = 'data/'
    all_lines = []
    for filename in os.listdir(data_dir):
        with open(data_dir + filename) as f:
            content = f.readlines()
        all_lines.extend(content)

    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]]
    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
    return all_lines
# 3.1 Dataset cleaning
# In the dataset preparation step, we first clean the text, which includes
# removing punctuation and lowercasing all words.
def clean_text(txt):
    txt = " ".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii", 'ignore')
    return txt
# 3.2 Generating Sequence of N-gram Tokens
#
# Language modelling requires sequence input data: given a sequence of
# words/tokens, the aim is to predict the next word/token.
#
# The next step is tokenization: the process of extracting tokens (terms / words)
# from a corpus. Keras has a built-in Tokenizer which can be used to obtain the
# tokens and their indices in the corpus. After this step, every text document in
# the dataset is converted into a sequence of tokens. A toy example follows
# get_sequence_of_tokens below.
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    # TODO Tokenize while keeping accents
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # convert data to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words
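# A toy illustration (hypothetical one-line corpus, not from the repository's data)
# of the n-gram expansion above:
#
#   corpus = ["le soleil brille fort"]
#   tokenizer.fit_on_texts(corpus)        # word_index: {'le': 1, 'soleil': 2, 'brille': 3, 'fort': 4}
#   tokenizer.texts_to_sequences(corpus)  # [[1, 2, 3, 4]]
#   input_sequences                       # [[1, 2], [1, 2, 3], [1, 2, 3, 4]]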
# 3.3 Padding the Sequences and obtaining the Variables: Predictors and Target
def generate_padded_sequences(input_sequences, total_words):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    print("Max len:", max_sequence_len)

    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len
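# A toy illustration (hypothetical values) of the padding and split above:
# with max_sequence_len = 4, pre-padding and slicing give
#
#   [1, 2]       -> padded [0, 0, 1, 2] -> predictors [0, 0, 1], label 2
#   [1, 2, 3, 4] -> padded [1, 2, 3, 4] -> predictors [1, 2, 3], label 4
#
# `label` is then one-hot encoded over the total_words-sized vocabulary.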
@@ -98,7 +45,7 @@ def create_model(max_sequence_len, total_words, layers=100, dropout=0.1): # TOD
    return model
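# create_model's body is collapsed in this diff. The following is only a hypothetical
# sketch consistent with the imports above and the signature in the hunk header
# (an Embedding -> LSTM -> Dropout -> Dense stack); the repository's actual
# implementation may differ.
def create_model_sketch(max_sequence_len, total_words, layers=100, dropout=0.1):
    model = Sequential()
    # input length is max_sequence_len - 1: the last token of each sequence is the label
    model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
    model.add(LSTM(layers))
    model.add(Dropout(dropout))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model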
-def generate_text(seed_text, nb_words, model, max_sequence_len):
+def generate_text(model, tokenizer, seed_text="", nb_words=5, max_sequence_len=0):
    for _ in range(nb_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
@@ -115,17 +62,18 @@ def generate_text(seed_text, nb_words, model, max_sequence_len):
def main():
    should_train = True
-    nb_epoch = 20
+    nb_epoch = 100
    model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
    max_sequence_len = 5  # TODO: Test different default
+    tokenizer = Tokenizer()

    if should_train:
-        lines = load()
+        lines = load_kawa()
        corpus = [clean_text(x) for x in lines]
        print(corpus[:10])

-        inp_sequences, total_words = get_sequence_of_tokens(corpus)
+        inp_sequences, total_words = get_sequence_of_tokens(corpus, tokenizer)
        print(inp_sequences[:10])

        predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
@@ -139,12 +87,12 @@ def main():
    else:
        model = load_model(model_file)

-    print(generate_text("", 10, model, max_sequence_len))
-    print(generate_text("L'étoile", 10, model, max_sequence_len))
+    for sample in ["", "L'étoile ", "Elle ", "Les punchlines "]:
+        print(generate_text(model, tokenizer, sample, 100, max_sequence_len))

    while True:
        input_text = input("> ")
-        print(generate_text(input_text, 10, model, max_sequence_len))
+        print(generate_text(model, tokenizer, input_text, 100, max_sequence_len))
if __name__ == '__main__':
    main()
from keras_preprocessing.text import Tokenizer

from KoozDawa.dawa.loader import load_kawa


def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
    # TODO Tokenize while keeping accents
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # convert data to sequence of tokens
    input_sequences = []

    # FIXME Debug: truncate corpus
    corpus = corpus[:50]

    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)

    texts = tokenizer.sequences_to_texts(input_sequences)
    print("Tokenized:", texts)
    return input_sequences, total_words


if __name__ == '__main__':
    kawa = load_kawa("../")
    seqs, words = get_sequence_of_tokens(kawa)
    print("%i words." % words)