feat(LeVerbe): WIP

6b5c072b · PLN (Algolia) · d208707c · 6b5c072b · 6b5c072b · 6b5c072b
Unverified Commit 6b5c072b authored Feb 01, 2020 by PLN (Algolia)
7 changed files
--- a/LeVerbe/1996.zip
+++ b/LeVerbe/1996.zip
--- a/LeVerbe/bible_fr.txt
+++ b/LeVerbe/bible_fr.txt
--- a/LeVerbe/cleanup.py
+++ b/LeVerbe/cleanup.py
+import re
+
+from ftfy import fix_encoding
+
+
+def try_fix_encoding():
+    """Print each line of ``./bible_fr.txt`` after running it through ftfy.
+
+    The file is decoded as UTF-8 and read completely before anything is
+    printed.  Every line still carries its own line terminator, so ``print``
+    emits an additional newline after it.
+    """
+    with open("./bible_fr.txt", encoding='utf-8') as source:
+        raw_lines = list(source)
+    for raw in raw_lines:
+        print(fix_encoding(raw))
+
+
+def extract_lines():
+    """Copy the keyword-bearing lines of ``./james.txt`` into ``./data.txt``.
+
+    A line is kept when it contains "word", "voice" or "breath" as a whole,
+    case-sensitive word.  Blank (empty or whitespace-only) lines are skipped.
+    Every line read, and every match, is echoed to stdout.  ``./data.txt`` is
+    overwritten on each run.
+    """
+    keywords = [
+        "word",
+        "voice",
+        "breath",
+    ]
+    # One compiled alternation instead of a pattern rebuilt per keyword on
+    # every line; re.escape keeps the keywords literal if the list grows.
+    pattern = re.compile(r"\b(?:%s)\b" % "|".join(map(re.escape, keywords)))
+    # Explicit UTF-8 so the result does not depend on the platform locale.
+    with open("./james.txt", "r", encoding="utf-8") as fin, \
+            open("./data.txt", "w", encoding="utf-8") as fout:
+        # Iterate the file object directly: streams line by line rather than
+        # loading the whole text into memory first.
+        for line in fin:
+            if not line.strip():
+                print("Skipping empty line")
+                continue
+            print("Line:", line)
+            if pattern.search(line):
+                print("Match: %s" % line)
+                fout.write(line)
+
+
+if __name__ == '__main__':
+    # Script entry point.  Only try_fix_encoding() runs; the extract_lines()
+    # call below is currently disabled.
+    # extract_lines()
+    try_fix_encoding()
--- a/LeVerbe/data.txt
+++ b/LeVerbe/data.txt
--- a/LeVerbe/james.txt
+++ b/LeVerbe/james.txt
--- a/LeVerbe/verbe.py
+++ b/LeVerbe/verbe.py
+from datetime import datetime
+
+from glossolalia.loader import load_seeds, load_text
+from glossolalia.lstm import LisSansTaMaman
+
+"""
+he a mighty hand
+"""
+
+
+def train():
+    """Train the model on ``./data.txt`` and log generated text to a file.
+
+    Steps:
+      1. Build a ``LisSansTaMaman`` model from the first 100 items of the
+         corpus returned by ``load_text``.
+      2. Fit in steps of 10 epochs up to ``nb_epoch``; after each step, print
+         and append the predictions for a fixed list of seeds.
+      3. Print and append one prediction for each seed returned by
+         ``load_seeds(corpus, 5)``.
+      4. Prompt on stdin for seed text and print/append a prediction for each
+         entry until end-of-file (Ctrl-D) or Ctrl-C at the prompt.
+
+    Output goes to ``./output/verbe_<nb_layers>-d<dropout>_<timestamp>.txt``;
+    the directory is created if it does not exist.
+    """
+    import os  # local import: only needed to create the output directory
+
+    nb_words = 60
+    nb_epoch = 60
+    nb_layers = 128  # NOTE(review): meaning is defined by LisSansTaMaman - confirm
+    dropout = .3
+    validation_split = .3
+    lstm = LisSansTaMaman(nb_layers, dropout, validation_split, debug=True)
+    filename_output = "./output/verbe_%i-d%.1f_%s.txt" % (
+        nb_layers, dropout, datetime.now().strftime("%y%m%d_%H%M"))
+    seeds = ["Thus", "And", "He", "Our", "I", "Thou", "But"]
+
+    # open() does not create missing directories; do it up front so a fresh
+    # checkout fails neither here nor after the model has been built.
+    os.makedirs(os.path.dirname(filename_output), exist_ok=True)
+
+    corpus = load_text("./data.txt")
+    print("Corpus:", corpus[:10])
+    lstm.create_model(corpus[:100])  # FIXME: Scale up to whole corpus
+
+    # Explicit UTF-8 so generated text can always be written, whatever the
+    # platform's default encoding is.
+    with open(filename_output, "a+", encoding="utf-8") as f:
+        # Each call passes initial_epoch=i and epochs=min(i + 10, nb_epoch),
+        # so the seeds are sampled after every 10-epoch step.
+        for i in range(0, nb_epoch, 10):
+            lstm.fit(epochs=min(i + 10, nb_epoch), initial_epoch=i,
+                     validation_split=validation_split)
+
+            for output in lstm.predict_seeds(nb_words, seeds):
+                print(output)
+                f.writelines(output)
+
+        for i, seed in enumerate(load_seeds(corpus, 5)):
+            output = lstm.predict(seed, nb_words)
+            print("%i %s -> %s" % (i, seed, output))
+            f.writelines(output)
+
+        # Interactive session: one prediction per line typed by the user.
+        while True:
+            try:
+                input_text = input("> ")
+            except (EOFError, KeyboardInterrupt):
+                # Ctrl-D / Ctrl-C at the prompt ends the session cleanly
+                # instead of terminating with a traceback.
+                break
+            text = lstm.predict(input_text, nb_words)
+            print(text)
+            f.write("%s\n" % text)
+
+
+if __name__ == '__main__':
+    # Run the training/generation session when executed as a script.
+    train()
--- a/data/Lyrics_Lucio.json
+++ b/data/Lyrics_Lucio.json