refactor: Extract glossolalia

parent 7dfd5a5e
This source diff could not be displayed because it is too large.
 from keras.callbacks import ModelCheckpoint, EarlyStopping
-from glossolalia.loader import load_kawa, clean_text, load_seeds
+from glossolalia.loader import load_seeds, load_text
 from glossolalia.lstm import generate_padded_sequences, create_model, generate_text
 from glossolalia.tokens import PoemTokenizer
@@ -9,15 +9,13 @@ def main():
     # should_train = True
     # model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
     nb_words = 20
-    nb_epoch = 100
-    nb_layers = 128
+    nb_epoch = 50
+    nb_layers = 64
     dropout = .2
     tokenizer = PoemTokenizer()
     # if should_train:
-    lines = load_kawa()
-    corpus = [clean_text(x) for x in lines]
+    corpus = load_text()
     print("Corpus:", corpus[:10])
     inp_sequences, total_words = tokenizer.get_sequence_of_tokens(corpus)
@@ -33,13 +31,14 @@ def main():
     for i in range(0, nb_epoch, 10):
         model.fit(predictors, label, initial_epoch=i, epochs=min(i + 10, nb_epoch), verbose=2, callbacks=callbacks_list)
-        print(generate_text(model, tokenizer, "", nb_words, max_sequence_len))
+        for seed in ["", "Je", "Tu", "Le", "La", "Les", "Un", "On", "Nous"]:
+            print(generate_text(model, tokenizer, seed, nb_words, max_sequence_len))
     # model.save(model_file)
     # else: # FIXME: Load and predict, maybe reuse checkpoints?
     #     model = load_model(model_file)
-    for i, seed in enumerate(load_seeds(lines, 5)):
+    for i, seed in enumerate(load_seeds(corpus, 5)):
         output = generate_text(model, tokenizer, seed, nb_words, max_sequence_len)
         print("%i %s -> %s" % (i, seed, output))
@@ -25,11 +25,14 @@ class Tweeper(object):
 def main():
-    Tweeper().tweet("les anges se sont fichés")
+    Tweeper().tweet("le business réel de la saint-valentin")
+    # Nous la nuit de la renaissance j’étais la tête
     # Authenticate to Twitter
+    # tassepés en panel
+    # grands brûlés de la chine
+    # La nuit est belle, ma chérie salue sur la capuche
+    # Je suis pas étonné de dire pétrin
+    # Femme qui crame strasbourg
 if __name__ == '__main__':
     main()
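The Tweeper class itself sits outside this hunk, so only its call site is visible here. Assuming it wraps a tweepy client (the credential placeholders and structure below are illustrative, not taken from the repository), a minimal sketch could look like:

# Hypothetical sketch only -- the real Tweeper class is not shown in this diff.
import tweepy

class Tweeper(object):
    def __init__(self):
        # Authenticate to Twitter (keys are placeholders)
        auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
        auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
        self.api = tweepy.API(auth)

    def tweet(self, text):
        self.api.update_status(text)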
from keras.callbacks import ModelCheckpoint, EarlyStopping

from glossolalia.loader import load_seeds, load_text
from glossolalia.lstm import generate_padded_sequences, create_model, generate_text
from glossolalia.tokens import PoemTokenizer


def main():
    # should_train = True
    # model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
    nb_words = 20
    nb_epoch = 50
    nb_layers = 64
    dropout = .2
    tokenizer = PoemTokenizer()

    # if should_train:
    corpus = load_text()
    print("Corpus:", corpus[:10])

    inp_sequences, total_words = tokenizer.get_sequence_of_tokens(corpus)
    predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
    model = create_model(max_sequence_len, total_words, layers=nb_layers, dropout=dropout)
    model.summary()

    file_path = "../models/boulbi/boulbi_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (nb_layers, dropout, nb_epoch)
    checkpoint = ModelCheckpoint(file_path, monitor='accuracy', period=10, save_best_only=True)
    # print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
    early_stopping = EarlyStopping(monitor='accuracy', patience=5)
    callbacks_list = [checkpoint, early_stopping]

    for i in range(0, nb_epoch, 10):
        model.fit(predictors, label, initial_epoch=i, epochs=min(i + 10, nb_epoch), verbose=2, callbacks=callbacks_list)
        for seed in ["", "Je", "Tu", "Le", "La", "Les", "Un", "On", "Nous"]:
            print(generate_text(model, tokenizer, seed, nb_words, max_sequence_len))
    # model.save(model_file)
    # else: # FIXME: Load and predict, maybe reuse checkpoints?
    #     model = load_model(model_file)

    for i, seed in enumerate(load_seeds(corpus, 5)):
        output = generate_text(model, tokenizer, seed, nb_words, max_sequence_len)
        print("%i %s -> %s" % (i, seed, output))

    with open("./output/boulbi.txt", "a+") as f:
        while True:
            input_text = input("> ")
            text = generate_text(model, tokenizer, input_text, nb_words, max_sequence_len)
            print(text)
            f.writelines("%s\n" % text)


def debug_unrandomize():
    from numpy.random import seed
    from tensorflow_core.python.framework.random_seed import set_random_seed

    # set seeds for reproducibility
    set_random_seed(2)
    seed(1)


if __name__ == '__main__':
    debug_unrandomize()
    main()
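For context, create_model is imported from glossolalia.lstm and its body does not appear in this commit. A minimal sketch of what such a builder might look like follows; the embedding size and the reading of layers as the LSTM unit count are assumptions, not taken from the repository.

# Hypothetical sketch only -- glossolalia.lstm.create_model is not part of this diff.
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.models import Sequential


def create_model(max_sequence_len, total_words, layers=64, dropout=0.2):
    input_len = max_sequence_len - 1  # the last token of each padded sequence is the label
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=input_len))  # embedding size assumed
    model.add(LSTM(layers))  # `layers` assumed to mean the LSTM unit count
    model.add(Dropout(dropout))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model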
from glossolalia import loader


def clean(text):
    pass


if __name__ == '__main__':
    text = loader.load_text("../LeBoulbiNet/data/lyrics.txt")
    print(text)
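The clean(text) helper above is still an empty stub. The clean_text function removed from the loader in the diff below suggests the intended behaviour; a possible implementation along those lines, offered only as a sketch:

import string

def clean(text):
    # Strip punctuation, mirroring the clean_text helper removed from the loader (assumption).
    return "".join(c for c in text if c not in string.punctuation)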
@@ -3,57 +3,55 @@ import string
 from pprint import pprint
 from random import choice, randint
-from numpy.random import seed
-from tensorflow_core.python.framework.random_seed import set_random_seed
+def load_text(filename="./data/genius.txt"):
+    lines = filter_lines(get_lines(filename))
+    print("Loaded %i lines of data: %s." % (len(lines), lines[0]))
+    return lines
-def load_kawa(root="./"):
-    # set seeds for reproducibility
-    set_random_seed(2)
-    seed(1)
+def load_texts(root="./"):
     data_dir = root + 'data/'
     all_lines = []
     files = os.listdir(data_dir)
     print("%i files in data folder." % len(files))
     for filename in files:
-        with open(data_dir + filename) as f:
-            content = f.readlines()
-            all_lines.extend(content)
+        all_lines.extend(get_lines(filename))
-    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]
-                 ]
-    len(all_lines)
+    all_lines = filter_lines(all_lines)
     print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
     return all_lines
-def load_seeds(kawa=None, nb_seeds=10):
-    if kawa is None:
-        kawa = load_kawa()
+def filter_lines(all_lines):
+    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]]
+    return all_lines
+def get_lines(filename):
+    all_lines = []
+    with open(filename) as f:
+        content = f.readlines()
+        all_lines.extend(content)
+    return all_lines
+def load_seeds(corpus=None, nb_seeds=10):
+    if corpus is None:
+        corpus = load_texts()
     seeds = []
     for i in range(nb_seeds):
-        plain_kawa = filter(lambda k: k != "\n", kawa)
-        chosen = choice(list(plain_kawa))
+        plain_lines = filter(lambda k: k != "\n", corpus)
+        chosen = choice(list(plain_lines))
         split = chosen.split(" ")
         nb_words = randint(1, len(split))
         seeds.append(" ".join(split[:nb_words]))
     return seeds
-def clean_text(lines):
-    """
-    In dataset preparation step, we will first perform text cleaning of the data
-    which includes removal of punctuations and lower casing all the words.
-    """
-    lines = "".join(v for v in lines if v not in string.punctuation)
-    # lines = lines.encode("utf8").decode("ascii", 'ignore')
-    return lines
 def main():
-    lines = load_kawa("../")
-    clean = clean_text(lines)
-    print(clean)
+    lines = load_texts("../")
     print("Some seeds:")
     pprint(load_seeds(lines))
@@ -55,4 +55,4 @@ def generate_text(model: Sequential, tokenizer: Tokenizer, seed_text="", nb_words
         token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
         predicted = model.predict_classes(token_list, verbose=2)[0]
         output += " " + word_indices[predicted]
-    return output.capitalize()
\ No newline at end of file
+    return output.capitalize()
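Only the tail of generate_text appears in the hunk above. For context, the full loop presumably resembles the sketch below; the word_indices mapping and the signature defaults are assumptions rather than part of the diff.

# Hypothetical reconstruction -- only the last lines of generate_text are shown above.
from keras_preprocessing.sequence import pad_sequences


def generate_text(model, tokenizer, seed_text="", nb_words=5, max_sequence_len=20):
    # Map token indices back to words for decoding predictions.
    word_indices = {index: word for word, index in tokenizer.word_index.items()}
    output = seed_text
    for _ in range(nb_words):
        token_list = tokenizer.texts_to_sequences([output])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=2)[0]
        output += " " + word_indices[predicted]
    return output.capitalize()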
-import os
-import pickle
 import lyricsgenius
 from lyricsgenius.artist import Artist
-def fetch():
+def fetch(artist_name: str):
     genius = lyricsgenius.Genius("zUSpjfQ9ELXDqOjx9hGfAlJGYQFrNvHh3rlDV298_QSr5ScKf3qlHZtOO2KsXspQ")
-    dooz: Artist = genius.search_artist("Dooz-kawa")
+    dooz: Artist = genius.search_artist(artist_name)
     dooz.save_lyrics(overwrite=True)
-    print(dooz)
-    # with open("./dooz.dat", "a+") as f:
-    #     pickle.dump(dooz, f)
-def load() -> Artist:
-    with open("./dooz.dat", "r") as f:
-        return pickle.load(f)
-def main():
-    fetch()
-    # dooz = load()
-    # print(dooz)
 if __name__ == '__main__':
-    main()
+    fetch("Booba")
 from keras_preprocessing.text import Tokenizer
-from glossolalia.loader import load_kawa
+from glossolalia.loader import load_texts
 class PoemTokenizer(Tokenizer):
     def __init__(self, **kwargs) -> None:
         super().__init__(lower=True,  # TODO: Better generalization without?
-                         filters='#$%&()*+/<=>@[\\]^_`{|}~\t\n', oov_token="😢",
+                         filters='$%&()*+/<=>@[\\]^_`{|}~\t\n', oov_token="😢",
                          **kwargs)
     def get_sequence_of_tokens(self, corpus):
@@ -32,7 +32,7 @@ class PoemTokenizer(Tokenizer):
 if __name__ == '__main__':
-    kawa = load_kawa("../")
+    kawa = load_texts("../")
     tokenizer = PoemTokenizer()
     seqs, words = tokenizer.get_sequence_of_tokens(kawa)
     texts = tokenizer.get_text(seqs)
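get_sequence_of_tokens is referenced throughout but its body is collapsed in the hunk above. A common implementation on top of the Keras Tokenizer, given here only as an assumption of what it might do, fits on the corpus and emits every n-gram prefix of each line:

# Hypothetical sketch -- the body of get_sequence_of_tokens is collapsed in the diff above.
def get_sequence_of_tokens(self, corpus):
    self.fit_on_texts(corpus)
    total_words = len(self.word_index) + 1
    input_sequences = []
    for line in corpus:
        token_list = self.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            # Every prefix of the line becomes one training sequence.
            input_sequences.append(token_list[:i + 1])
    return input_sequences, total_words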