BabelZoo, commit 42b38e3e (unverified)
Authored Nov 26, 2019 by PLN (Algolia)
refactor(dawa): Generalize LSTM/Tweeper
parent fbbea615
Showing 4 changed files with 128 additions and 59 deletions:
  KoozDawa/dawa.py         +27  -37
  KoozDawa/tweet.py        +24   -0
  glossolalia/lstm.py      +63   -4
  glossolalia/tweeper.py   +14  -18
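The gist of the refactor, condensed into a minimal usage sketch (this is not code from the commit itself; parameter values are illustrative, and the calls are taken from the KoozDawa/dawa.py and KoozDawa/tweet.py diffs below):

    # Condensed sketch of the generalized API this commit introduces.
    from glossolalia.loader import load_text
    from glossolalia.lstm import LisSansTaMaman
    from glossolalia.tweeper import Tweeper

    # LSTM side: tokenizing, padding, model creation and text generation
    # now live behind one wrapper class instead of free functions.
    lstm = LisSansTaMaman(nb_layers=100, dropout=.3, validation_split=0.2, debug=True)
    lstm.create_model(load_text()[:1000])
    lstm.fit(epochs=10)
    print(lstm.predict(seed="Je", nb_words=20))

    # Tweeper side: the account name becomes a constructor argument.
    Tweeper("KoozDawa").tweet("tassepés en panel")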
KoozDawa/dawa.py

  from datetime import datetime
  from keras.callbacks import ModelCheckpoint, EarlyStopping
  from glossolalia.loader import load_seeds, load_text
- from glossolalia.lstm import generate_padded_sequences, create_model, generate_text
- from glossolalia.tokens import PoemTokenizer
+ from glossolalia.lstm import LisSansTaMaman


- def main():
+ def train():
      # should_train = True
      # model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
      nb_words = 20
-     nb_epoch = 50
-     nb_layers = 64
-     dropout = .2
-     tokenizer = PoemTokenizer()
+     nb_epoch = 100
+     nb_layers = 100
+     dropout = .3  # TODO finetune layers/dropout
+     validation_split = 0.2
+     lstm = LisSansTaMaman(nb_layers, dropout, validation_split, debug=True)
+     filename_model = "../models/dawa/dawa_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (
+         nb_layers, dropout, nb_epoch)
      filename_output = "./output/dawa_%i-d%.1f_%s.txt" % (
          nb_layers, dropout, datetime.now().strftime("%y%m%d_%H%M"))
+     callbacks_list = [ModelCheckpoint(filename_model, monitor='val_accuracy', period=10, save_best_only=True),
+                       EarlyStopping(monitor='val_accuracy', patience=5)]

      # if should_train:
      corpus = load_text()
      print("Corpus:", corpus[:10])
-     inp_sequences, total_words = tokenizer.get_sequence_of_tokens(corpus)
-     predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
-     model = create_model(max_sequence_len, total_words, layers=nb_layers, dropout=dropout)
-     model.summary()
-     file_path = "../models/dawa/dawa_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (
-         nb_layers, dropout, nb_epoch)
-     checkpoint = ModelCheckpoint(file_path, monitor='accuracy', period=10, save_best_only=True)
-     # print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
-     early_stopping = EarlyStopping(monitor='accuracy', patience=5)
-     callbacks_list = [checkpoint, early_stopping]
+     lstm.create_model(corpus[:1000])

      with open(filename_output, "a+") as f:
          for i in range(0, nb_epoch, 10):
-             model.fit(predictors, label, initial_epoch=i, epochs=min(i + 10, nb_epoch),
-                       verbose=2, callbacks=callbacks_list)
+             lstm.fit(epochs=min(i + 10, nb_epoch), initial_epoch=i,
+                      callbacks=callbacks_list, validation_split=validation_split)
              for seed in ["", "Je", "Tu", "Le", "La", "Les", "Un", "On", "Nous"]:
-                 print(generate_text(model, tokenizer, seed, nb_words, max_sequence_len))
+                 print(lstm.predict(seed, nb_words))

          # model.save(model_file)
          # else: # FIXME: Load and predict, maybe reuse checkpoints?
          # model = load_model(model_file)
          for i, seed in enumerate(load_seeds(corpus, 5)):
-             output = generate_text(model, tokenizer, seed, nb_words, max_sequence_len)
+             output = lstm.predict(seed, nb_words)
              print("%i %s -> %s" % (i, seed, output))
              f.writelines(output)

      with open("./output/dawa.txt", "a+") as f:
          while True:
              input_text = input("> ")
-             text = generate_text(model, tokenizer, input_text, nb_words, max_sequence_len)
+             text = lstm.predict(input_text, nb_words)
              print(text)
              f.writelines("%s\n" % text)


- def debug_unrandomize():
-     from numpy.random import seed
-     from tensorflow_core.python.framework.random_seed import set_random_seed
-
-     # set seeds for reproducibility
-     set_random_seed(2)
-     seed(1)


  if __name__ == '__main__':
-     debug_unrandomize()
-     main()
+     train()
KoozDawa/tweet.py (new file, mode 100644)

from glossolalia.tweeper import Tweeper


def tweet():
    # La nuit est belle, ma chérie salue sur la capuche
    # grands brûlés de la chine
    # Femme qui crame strasbourg
    # le soleil est triste
    # on a pas un martyr parce qu't'es la
    # des neiges d'insuline
    # une hypothèse qu'engendre la haine n'est qu'une prison vide
    # Un jour de l'an commencé sur les autres
    # Relater l'passionnel dans les casseroles d'eau de marécages
    # sniff de Caravage rapide
    # La nuit c'est le soleil
    # Les rues d'ma vie se terminent par des partouzes de ciel
    # des glaçons pour les yeux brisées
    Tweeper("KoozDawa").tweet("tassepés en panel")


if __name__ == '__main__':
    tweet()
glossolalia/lstm.py

  import warnings
  from typing import List

  import numpy as np
- from keras import Sequential
+ from keras import Sequential, Model
+ from keras.callbacks import Callback, History
  from keras.layers import Embedding, LSTM, Dropout, Dense
  from keras.utils import to_categorical
  from keras_preprocessing.sequence import pad_sequences
  from keras_preprocessing.text import Tokenizer
+
+ from glossolalia.tokens import PoemTokenizer

  warnings.filterwarnings("ignore")
  warnings.simplefilter(action='ignore', category=FutureWarning)


- # 3.3 Padding the Sequences and obtain Variables : Predictors and Target
- def generate_padded_sequences(input_sequences, total_words):
+ def debug_unrandomize():
+     from numpy.random import seed
+     from tensorflow_core.python.framework.random_seed import set_random_seed
+
+     # set seeds for reproducibility
+     set_random_seed(2)
+     seed(1)
+
+
+ class LisSansTaMaman(object):
+     """ A LSTM model adapted for french lyrical texts."""
+
+     def __init__(self, nb_layers: int = 100,
+                  dropout: float = 0.1,
+                  validation_split: float = 0.0,
+                  tokenizer=PoemTokenizer(),
+                  debug: bool = False):
+         self.validation_split = validation_split
+         self.dropout = dropout
+         self.nb_layers = nb_layers
+         self.tokenizer = tokenizer
+
+         # Model state
+         self.model: Model = None
+         self.predictors = None
+         self.label = None
+         self.max_sequence_len = None
+
+         if debug:
+             debug_unrandomize()
+
+     def create_model(self, corpus: List[str]):
+         inp_sequences, total_words = self.tokenizer.get_sequence_of_tokens(corpus)
+         self.predictors, self.label, self.max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
+         model = create_model(self.max_sequence_len, total_words,
+                              layers=self.nb_layers, dropout=self.dropout)
+         model.summary()
+         self.model = model
+
+     # TODO: Batch fit? splitting nb_epoch into N step
+     def fit(self, epochs: int, initial_epoch: int = 0,
+             callbacks: List[Callback] = None,
+             validation_split: float = 0) -> History:
+         return self.model.fit(self.predictors, self.label,
+                               verbose=2,
+                               callbacks=callbacks,
+                               validation_split=validation_split,
+                               epochs=epochs, initial_epoch=initial_epoch)
+
+     def predict(self, seed="", nb_words=None):
+         if nb_words is None:
+             nb_words = 20  # TODO: Guess based on model a good number of words
+         return generate_text(self.model, self.tokenizer, seed, nb_words, self.max_sequence_len)
+
+
+ def generate_padded_sequences(input_sequences, total_words: int):
      max_sequence_len = max([len(x) for x in input_sequences])
      input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
      predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
  ...
@@ -20,7 +79,7 @@ def generate_padded_sequences(input_sequences, total_words):
      return predictors, label, max_sequence_len


- def create_model(max_sequence_len, total_words, layers=128, dropout=0.3):  # TODO finetune layers/dropout
+ def create_model(max_sequence_len: int, total_words: int, layers: int, dropout: float):
      print("Creating model across %i words for %i-long seqs (%i layers, %.2f dropout):" % (
          total_words, max_sequence_len, layers, dropout))
      input_len = max_sequence_len - 1
  ...
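The wrapper leaves epoch scheduling to its caller (note the "TODO: Batch fit?" comment above): KoozDawa/dawa.py drives fit() in 10-epoch slices and samples the model between slices. A condensed, self-contained sketch of that pattern; the corpus slice and parameter values are illustrative rather than the exact ones in the diff:

    from keras.callbacks import EarlyStopping
    from glossolalia.loader import load_text
    from glossolalia.lstm import LisSansTaMaman

    lstm = LisSansTaMaman(nb_layers=100, dropout=.3, validation_split=0.2)
    lstm.create_model(load_text()[:1000])

    nb_epoch = 100
    callbacks = [EarlyStopping(monitor='val_accuracy', patience=5)]
    for i in range(0, nb_epoch, 10):
        # Train in 10-epoch slices so intermediate samples can be printed between slices.
        lstm.fit(epochs=min(i + 10, nb_epoch), initial_epoch=i,
                 callbacks=callbacks, validation_split=0.2)
        print(lstm.predict(seed="Je", nb_words=20))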
KoozDawa/tweeper.py → glossolalia/tweeper.py (renamed)

  #! /usr/bin/env python
  import os
  import time

  import tweepy
  from didyoumean3.didyoumean import did_you_mean
  from tweepy import Cursor


  class Tweeper(object):
-     def __init__(self):
+     def __init__(self, name: str):
          auth = tweepy.OAuthHandler(os.environ["ZOO_DAWA_KEY"],
                                     os.environ["ZOO_DAWA_KEY_SECRET"])
  ...
@@ -15,24 +16,18 @@ class Tweeper(object):
                                 os.environ["ZOO_DAWA_TOKEN"],
                                 os.environ["ZOO_DAWA_TOKEN_SECRET"])
          self.api = tweepy.API(auth)
+         self.name = name
+
+     @property
+     def all_tweets(self):
+         return [t.text for t in Cursor(self.api.user_timeline, id=self.name).items()]

-     def tweet(self, message):
+     def tweet(self, message, wait_delay=5, prevent_duplicate=True):
+         """Tweets a message after spellchecking it."""
+         if prevent_duplicate and message in self.all_tweets:
+             print("Was already tweeted: %s." % message)
+         else:
              message = did_you_mean(message)
              print("About to tweet:", message)
-             time.sleep(5)
+             time.sleep(wait_delay)
              self.api.update_status(message)
  \ No newline at end of file
-
-
- def main():
-     Tweeper().tweet("le business réel de la saint-valentin")
-     # Nous la nuit de la renaissance j’étais la tête
-     # Authenticate to Twitter
-     # tassepés en panel
-     # grands brûlés de la chine
-     # La nuit est belle, ma chérie salue sur la capuche
-     # Je suis pas étonné de dire pétrin
-     # Femme qui crame strasbourg
-
-
- if __name__ == '__main__':
-     main()
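Tweeper reads its credentials from the four ZOO_DAWA_* environment variables shown in the diff above. A minimal sketch of wiring them up before posting; the credential values are placeholders, and the call mirrors the one moved into KoozDawa/tweet.py:

    import os

    from glossolalia.tweeper import Tweeper

    # Placeholder credentials: a real Twitter API key/token pair goes here.
    os.environ["ZOO_DAWA_KEY"] = "consumer-key"
    os.environ["ZOO_DAWA_KEY_SECRET"] = "consumer-secret"
    os.environ["ZOO_DAWA_TOKEN"] = "access-token"
    os.environ["ZOO_DAWA_TOKEN_SECRET"] = "access-token-secret"

    # prevent_duplicate=True (the default) checks the account timeline first,
    # so re-running with the same text only logs "Was already tweeted".
    Tweeper("KoozDawa").tweet("tassepés en panel", wait_delay=0)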