refactor: Extract glossolalia

parent 7dfd5a5e
This source diff could not be displayed because it is too large. You can view the blob instead.
from keras.callbacks import ModelCheckpoint, EarlyStopping
from glossolalia.loader import load_kawa, clean_text, load_seeds
from glossolalia.loader import load_seeds, load_text
from glossolalia.lstm import generate_padded_sequences, create_model, generate_text
from glossolalia.tokens import PoemTokenizer
......@@ -9,15 +9,13 @@ def main():
# should_train = True
# model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
nb_words = 20
nb_epoch = 100
nb_layers = 128
nb_epoch = 50
nb_layers = 64
dropout = .2
tokenizer = PoemTokenizer()
# if should_train:
lines = load_kawa()
corpus = [clean_text(x) for x in lines]
corpus = load_text()
print("Corpus:", corpus[:10])
inp_sequences, total_words = tokenizer.get_sequence_of_tokens(corpus)
......@@ -33,13 +31,14 @@ def main():
for i in range(0, nb_epoch, 10):
model.fit(predictors, label, initial_epoch=i, epochs=min(i + 10, nb_epoch), verbose=2, callbacks=callbacks_list)
print(generate_text(model, tokenizer, "", nb_words, max_sequence_len))
for seed in ["", "Je", "Tu", "Le", "La", "Les", "Un", "On", "Nous"]:
print(generate_text(model, tokenizer, seed, nb_words, max_sequence_len))
# model.save(model_file)
# else: # FIXME: Load and predict, maybe reuse checkpoints?
# model = load_model(model_file)
for i, seed in enumerate(load_seeds(lines, 5)):
for i, seed in enumerate(load_seeds(corpus, 5)):
output = generate_text(model, tokenizer, seed, nb_words, max_sequence_len)
print("%i %s -> %s" % (i, seed, output))
......
......@@ -25,11 +25,14 @@ class Tweeper(object):
def main():
    """Post the hard-coded sample lines, each through a freshly built Tweeper."""
    statuses = (
        "les anges se sont fichés",
        "le business réel de la saint-valentin",
    )
    for status in statuses:
        Tweeper().tweet(status)
    # Nous la nuit de la renaissance j’étais la tête
    # Authenticate to Twitter
    # tassepés en panel
    # grands brûlés de la chine
    # La nuit est belle, ma chérie salue sur la capuche
    # Je suis pas étonné de dire pétrin
    # Femme qui crame strasbourg
# Only post the sample tweets when this module is executed as a script.
if __name__ == '__main__':
    main()
from keras.callbacks import ModelCheckpoint, EarlyStopping
from glossolalia.loader import load_seeds, load_text
from glossolalia.lstm import generate_padded_sequences, create_model, generate_text
from glossolalia.tokens import PoemTokenizer
def main():
    """Train the LSTM on the lyrics corpus, print samples, then generate on demand.

    The corpus comes from ``load_text()`` and is tokenized with
    ``PoemTokenizer``. Training runs in chunks of 10 epochs up to
    ``nb_epoch``; after each chunk a line is generated for a fixed list of
    seeds. Five seeds drawn from the corpus are then expanded, and finally
    prompts are read from stdin until EOF or Ctrl-C, each generated line
    being printed and appended to ``./output/boulbi.txt``.
    """
    # Local import: os is only needed here to create the output directory.
    import os

    # should_train = True
    # model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
    nb_words = 20
    nb_epoch = 50
    nb_layers = 64
    dropout = .2
    tokenizer = PoemTokenizer()

    # if should_train:
    corpus = load_text()
    print("Corpus:", corpus[:10])

    inp_sequences, total_words = tokenizer.get_sequence_of_tokens(corpus)
    predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
    model = create_model(max_sequence_len, total_words, layers=nb_layers, dropout=dropout)
    model.summary()

    # The {epoch} and {accuracy} placeholders are left for ModelCheckpoint to fill in.
    file_path = "../models/boulbi/boulbi_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (nb_layers, dropout, nb_epoch)
    checkpoint = ModelCheckpoint(file_path, monitor='accuracy', period=10, save_best_only=True)
    # print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
    early_stopping = EarlyStopping(monitor='accuracy', patience=5)
    callbacks_list = [checkpoint, early_stopping]

    # NOTE(review): EarlyStopping presumably ends only the current 10-epoch
    # chunk; the next fit() call still runs. Confirm this is intended.
    for i in range(0, nb_epoch, 10):
        model.fit(predictors, label, initial_epoch=i, epochs=min(i + 10, nb_epoch), verbose=2, callbacks=callbacks_list)
        # Sample a line per seed after each chunk to follow training progress.
        for seed in ["", "Je", "Tu", "Le", "La", "Les", "Un", "On", "Nous"]:
            print(generate_text(model, tokenizer, seed, nb_words, max_sequence_len))

    # model.save(model_file)
    # else: # FIXME: Load and predict, maybe reuse checkpoints?
    # model = load_model(model_file)

    for i, seed in enumerate(load_seeds(corpus, 5)):
        output = generate_text(model, tokenizer, seed, nb_words, max_sequence_len)
        print("%i %s -> %s" % (i, seed, output))

    # Create the output directory up front so a missing folder cannot make
    # open() fail after the whole training run.
    os.makedirs("./output", exist_ok=True)
    with open("./output/boulbi.txt", "a+") as f:
        while True:
            try:
                input_text = input("> ")
            except (EOFError, KeyboardInterrupt):
                # End of input or Ctrl-C: leave the prompt loop without a traceback.
                print()
                break
            text = generate_text(model, tokenizer, input_text, nb_words, max_sequence_len)
            print(text)
            # write(), not writelines(): the argument is one string, not a list of lines.
            f.write("%s\n" % text)
def debug_unrandomize():
    """Fix the TensorFlow and NumPy random seeds so runs are reproducible."""
    from numpy.random import seed as seed_numpy
    from tensorflow_core.python.framework.random_seed import set_random_seed as seed_tensorflow

    # Same order and values as before: TensorFlow gets 2, then NumPy gets 1.
    seed_tensorflow(2)
    seed_numpy(1)
# Script entry point: pin the random seeds first, then train and generate.
if __name__ == '__main__':
    debug_unrandomize()
    main()
from glossolalia import loader
def clean(text):
    """Placeholder for text cleaning; currently ignores *text* and returns None."""
    return None
# Manual check: load the LeBoulbiNet lyrics file and print what was read.
if __name__ == '__main__':
    text = loader.load_text("../LeBoulbiNet/data/lyrics.txt")
    print(text)
......@@ -3,57 +3,55 @@ import string
from pprint import pprint
from random import choice, randint
from numpy.random import seed
from tensorflow_core.python.framework.random_seed import set_random_seed
def load_text(filename="./data/genius.txt"):
    """Read one lyrics file and return its lines after ``filter_lines``.

    Args:
        filename: Path of the text file to read.

    Returns:
        The list of lines returned by ``filter_lines(get_lines(filename))``.
    """
    lines = filter_lines(get_lines(filename))
    if lines:
        print("Loaded %i lines of data: %s." % (len(lines), lines[0]))
    else:
        # An empty or fully filtered file used to raise IndexError on lines[0].
        print("Loaded 0 lines of data.")
    return lines
def load_kawa(root="./"):
# set seeds for reproducibility
set_random_seed(2)
seed(1)
def load_texts(root="./"):
    """Load and filter the lines of every file in ``<root>/data``.

    Args:
        root: Directory that contains the ``data`` folder.

    Returns:
        The filtered lines of all files, concatenated in sorted filename order.
    """
    data_dir = os.path.join(root, "data")
    # Sorted so the corpus order does not depend on the OS listing order.
    files = sorted(os.listdir(data_dir))
    print("%i files in data folder." % len(files))

    all_lines = []
    for filename in files:
        path = os.path.join(data_dir, filename)
        if not os.path.isfile(path):
            # Sub-directories cannot be read as text files; skip them.
            continue
        # get_lines opens the file itself, so it needs the full path,
        # not the bare name returned by os.listdir.
        all_lines.extend(get_lines(path))

    all_lines = filter_lines(all_lines)
    if all_lines:
        print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
    else:
        # An empty data folder used to raise IndexError on all_lines[0].
        print("Loaded 0 lines of data.")
    return all_lines
def filter_lines(all_lines):
    """Return the lines that do not start with ``[`` or ``#``.

    Args:
        all_lines: Iterable of strings.

    Returns:
        A new list in the original order. Empty strings are kept; they used
        to raise IndexError when their first character was read.
    """
    return [line for line in all_lines if not line.startswith(("[", "#"))]
def get_lines(filename):
    """Read *filename* and return its lines, minus those starting with ``[`` or ``#``.

    Args:
        filename: Path of the text file to read.

    Returns:
        The remaining lines, with their line endings kept as read.
    """
    with open(filename) as f:
        content = f.readlines()
    # startswith() is safe on empty strings, unlike indexing the first character.
    all_lines = [line for line in content if not line.startswith(("[", "#"))]
    if all_lines:
        print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
    else:
        # An empty file used to raise IndexError on all_lines[0].
        print("Loaded 0 lines of data.")
    return all_lines
def load_seeds(corpus=None, nb_seeds=10):
    """Build random seed phrases from the beginnings of corpus lines.

    Each seed is the first ``k`` space-separated words of a randomly chosen
    line, with ``k`` drawn uniformly between 1 and the line's word count.
    When the whole line is taken, its trailing newline is kept.

    Args:
        corpus: Lines to draw from; loaded with ``load_texts()`` when None.
        nb_seeds: Number of seeds to produce.

    Returns:
        A list of ``nb_seeds`` strings, or an empty list when the corpus has
        no line other than bare newlines.
    """
    if corpus is None:
        corpus = load_texts()

    # The filtering does not depend on the loop variable, so do it once.
    plain_lines = [line for line in corpus if line != "\n"]
    if not plain_lines:
        # choice() raises IndexError on an empty sequence.
        return []

    seeds = []
    for _ in range(nb_seeds):
        words = choice(plain_lines).split(" ")
        nb_words = randint(1, len(words))
        seeds.append(" ".join(words[:nb_words]))
    return seeds
def clean_text(lines):
    """Strip punctuation from *lines*.

    Every item of *lines* that is found in ``string.punctuation`` is dropped
    and the remaining items are concatenated. Letter case is left unchanged.
    """
    kept = [char for char in lines if char not in string.punctuation]
    return "".join(kept)
def main():
lines = load_kawa("../")
clean = clean_text(lines)
print(clean)
lines = load_texts("../")
print("Some seeds:")
pprint(load_seeds(lines))
......
import os
import pickle
import lyricsgenius
from lyricsgenius.artist import Artist
def fetch(artist_name: str = "Dooz-kawa"):
    """Search Genius for an artist and save their lyrics to disk.

    Args:
        artist_name: Name passed to ``Genius.search_artist``. Defaults to
            "Dooz-kawa", the value that was hard-coded before this became a
            parameter, so a bare ``fetch()`` call keeps working.
    """
    # SECURITY: this API token is committed in source control. It should be
    # revoked and supplied only through the GENIUS_ACCESS_TOKEN environment
    # variable; the literal remains solely as a backward-compatible fallback.
    token = os.environ.get(
        "GENIUS_ACCESS_TOKEN",
        "zUSpjfQ9ELXDqOjx9hGfAlJGYQFrNvHh3rlDV298_QSr5ScKf3qlHZtOO2KsXspQ",
    )
    genius = lyricsgenius.Genius(token)
    artist = genius.search_artist(artist_name)
    if artist is None:
        # NOTE(review): search_artist presumably returns None when nothing
        # matches; guard so the failure is reported instead of raising
        # AttributeError on save_lyrics.
        print("No artist found for %r." % artist_name)
        return
    artist.save_lyrics(overwrite=True)
    print(artist)
    # TODO: optionally pickle the artist to ./dooz.dat (binary mode, "wb").
def load() -> Artist:
    """Unpickle and return the object stored in ``./dooz.dat``."""
    # SECURITY: pickle.load can execute arbitrary code; only use this on a
    # file this project wrote itself.
    # Pickle data is bytes, so the file must be opened in binary mode.
    with open("./dooz.dat", "rb") as f:
        return pickle.load(f)
def main():
    """Call ``fetch()`` with no arguments; the pickle reload below is disabled."""
    fetch()
    # dooz = load()
    # print(dooz)
# When run as a script: call main(), then fetch the lyrics for "Booba".
if __name__ == '__main__':
    main()
    fetch("Booba")
from keras_preprocessing.text import Tokenizer
from glossolalia.loader import load_kawa
from glossolalia.loader import load_texts
class PoemTokenizer(Tokenizer):
def __init__(self, **kwargs) -> None:
    """Initialise the underlying ``Tokenizer`` with this project's defaults.

    Defaults: lower-casing on, the symbol filter below (which does not
    include ``#``), and "😢" as the out-of-vocabulary token.

    Args:
        **kwargs: Forwarded to ``Tokenizer.__init__``. A value given here for
            ``lower``, ``filters`` or ``oov_token`` replaces the default
            instead of raising a "multiple values" TypeError.
    """
    kwargs.setdefault("lower", True)  # TODO: Better generalization without?
    # Only one filters value may be passed; a repeated keyword is a SyntaxError.
    kwargs.setdefault("filters", '$%&()*+/<=>@[\\]^_`{|}~\t\n')
    kwargs.setdefault("oov_token", "😢")
    super().__init__(**kwargs)
def get_sequence_of_tokens(self, corpus):
......@@ -32,7 +32,7 @@ class PoemTokenizer(Tokenizer):
if __name__ == '__main__':
kawa = load_kawa("../")
kawa = load_texts("../")
tokenizer = PoemTokenizer()
seqs, words = tokenizer.get_sequence_of_tokens(kawa)
texts = tokenizer.get_text(seqs)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment