feat(cleaner): MVP on Lucio

7a0407d8 · PLN (Algolia) · 8210fef0 · 7a0407d8 · 7a0407d8 · 7a0407d8
Unverified Commit 7a0407d8 authored Dec 01, 2019 by PLN (Algolia)
Hide whitespace changes
Inline Side-by-side

Showing 4 changed files with 77 additions and 4 deletions

buko.py Buko/buko.py +57 -0

genius.txt Buko/genius.txt +0 -0

tweet.py KoozDawa/tweet.py +0 -1

cleaner.py glossolalia/cleaner.py +20 -3

No files found.
--- a/Buko/buko.py
+++ b/Buko/buko.py
+import os
+from datetime import datetime
+from keras.callbacks import ModelCheckpoint, EarlyStopping
+from glossolalia.loader import load_seeds, load_text
+from glossolalia.lstm import LisSansTaMaman
+def train():
+    """Train the Buko LSTM on ./genius.txt and log the text it generates.
+
+    The model is fitted in rounds of at most 10 epochs. After each round the
+    output of ``predict_seeds`` is printed and appended, one entry per line,
+    to a timestamped file under ./output/. When training is over, predictions
+    are made for the seeds returned by ``load_seeds(corpus, 5)``, then an
+    interactive prompt reads seed text from stdin until EOF or Ctrl-C.
+    """
+    nb_words = 20
+    nb_epoch = 50
+    nb_layers = 100
+    dropout = .2  # TODO finetune layers/dropout
+    validation_split = 0.2
+    epochs_per_round = 10
+    lstm = LisSansTaMaman(nb_layers, dropout, validation_split, debug=True)
+
+    # The {epoch}/{accuracy} placeholders are left untouched by the %-format
+    # so that ModelCheckpoint can fill them in when it saves.
+    filename_model = "../models/buko/buko_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (
+        nb_layers, dropout, nb_epoch)
+    output_dir = "./output/"  # renamed from `dir`, which shadowed the builtin
+    filename_output = output_dir + "buko_%i-d%.1f_%s.txt" % (
+        nb_layers, dropout, datetime.now().strftime("%y%m%d_%H%M"))
+    callbacks_list = [
+        ModelCheckpoint(filename_model, monitor='val_accuracy', period=10, save_best_only=True),
+        EarlyStopping(monitor='val_accuracy', patience=5)]
+
+    corpus = load_text("./genius.txt")
+    print("Corpus:", corpus[:10])
+    # Only the first 1000 corpus entries are used to build the model.
+    lstm.create_model(corpus[:1000])
+
+    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Explicit UTF-8: the generated text should not depend on the platform's
+    # default encoding.
+    with open(filename_output, "a+", encoding="utf-8") as f:
+        for i in range(0, nb_epoch, epochs_per_round):
+            lstm.fit(epochs=min(i + epochs_per_round, nb_epoch), initial_epoch=i,
+                     callbacks=callbacks_list,
+                     validation_split=validation_split)
+            # NOTE(review): assumes predict_seeds yields strings, like
+            # predict() does below -- confirm against glossolalia.lstm.
+            for output in lstm.predict_seeds(nb_words):
+                print(output)
+                # write + "\n" instead of writelines(str), which emitted the
+                # text with no separator between consecutive predictions.
+                f.write("%s\n" % output)
+
+        for i, seed in enumerate(load_seeds(corpus, 5)):
+            output = lstm.predict(seed, nb_words)
+            print("%i %s -> %s" % (i, seed, output))
+            f.write("%s\n" % output)
+
+        # Interactive mode: stop cleanly on EOF (Ctrl-D) or Ctrl-C instead of
+        # dying with a traceback; the `with` block then closes the log file.
+        while True:
+            try:
+                input_text = input("> ")
+            except (EOFError, KeyboardInterrupt):
+                break
+            text = lstm.predict(input_text, nb_words)
+            print(text)
+            f.write("%s\n" % text)
+if __name__ == '__main__':  # train only when executed as a script, not on import
+    train()
--- a/Buko/genius.txt
+++ b/Buko/genius.txt
--- a/KoozDawa/tweet.py
+++ b/KoozDawa/tweet.py
@@ -10,7 +10,6 @@ def tweet():
    # des neiges d'insuline
    # Un jour de l'an commencé sur les autres
    # une hypothèse qu'engendre la haine n'est qu'une prison vide
-    # sniff de Caravage rapide
    # Relater l'passionnel dans les casseroles d'eau de marécages
    # La nuit c'est le soleil

--- a/glossolalia/cleaner.py
+++ b/glossolalia/cleaner.py
-from glossolalia import loader
+import json
 def clean(text):
@@ -10,5 +10,22 @@ def clean(text):
 if __name__ == '__main__':
-    text = loader.load_text("../LeBoulbiNet/data/lyrics.txt")
-    print(text)
+    # Build a plain-text corpus from the Genius JSON export: one lyric line
+    # per output line, with blank lines and "RapGenius France" lines dropped.
+    # Explicit UTF-8 so the typographic apostrophes in the lyrics decode the
+    # same way on every platform.
+    with open("../data/Lyrics_LucioBukowski.json", encoding="utf-8") as f:
+        songs = json.load(f)["songs"]
+
+    # Songs whose "lyrics" value is None are skipped.
+    all_lyrics = "\n".join(song["lyrics"] for song in songs
+                           if song["lyrics"] is not None)
+
+    # Normalise the typographic apostrophe to a plain ASCII one.
+    corpus = "\n".join(line.replace("’", "'")
+                       for line in all_lyrics.split("\n")
+                       if line != ""
+                       and "RapGenius France" not in line)
+    print(corpus)
+
+    # write(), not writelines(): corpus is a single string, and writelines
+    # would iterate over it character by character.
+    with open("../data/genius.txt", "w", encoding="utf-8") as o:
+        o.write(corpus)