Unverified Commit 4c3009e8 by PLN

feat: AnarchoWiz, Verne output, glossolalia updates

parent 6b5c072b
@@ -132,6 +132,6 @@ dmypy.json
 .idea/
 # Outputs
-output/*.txt
-output/
+#output/*.txt
+#output/
 models/
from datetime import datetime

from keras.callbacks import ModelCheckpoint, EarlyStopping

from glossolalia.loader import load_seeds, load_text
from glossolalia.lstm import LisSansTaMaman


def train():
    # Hyperparameters for the word-level LSTM
    nb_words = 50
    nb_epoch = 200
    nb_layers = 64
    dropout = .4  # TODO: finetune layers/dropout
    validation_split = 0.3
    lstm = LisSansTaMaman(nb_layers, dropout, validation_split, bidirectional=True, debug=True)

    filename_model = "../models/anar/anar_lstm%i-d%.1f-{epoch:02d}_%i-{accuracy:.4f}.hdf5" % (
        nb_layers, dropout, nb_epoch)
    filename_output = "./output/anarchowiz_%i-d%.1f_%s.txt" % (
        nb_layers, dropout, datetime.now().strftime("%y%m%d_%H%M"))
    callbacks_list = [
        ModelCheckpoint(filename_model, monitor='val_accuracy', period=10, save_best_only=True),
        EarlyStopping(monitor='val_accuracy', patience=5)]

    # Build the corpus from the two source texts (sorcery + anarchy)
    corpus = load_text("sorcellerie.txt")[:100] \
             + load_text("anarchie.txt")[:200]
    print("Corpus:", corpus[:10])
    lstm.create_model(corpus)

    with open(filename_output, "a+") as f:
        # Train in 10-epoch increments, sampling generated text after each increment
        for i in range(0, nb_epoch, 10):
            lstm.fit(epochs=min(i + 10, nb_epoch), initial_epoch=i,
                     callbacks=callbacks_list,
                     validation_split=validation_split)
            for output in lstm.predict_seeds(nb_words):
                print(output)
                f.writelines(output)

        # Generate from seeds drawn from the corpus
        for i, seed in enumerate(load_seeds(corpus, 5)):
            output = lstm.predict(seed, nb_words)
            print("%i %s -> %s" % (i, seed, output))
            f.writelines(output)
        for i, seed in enumerate(load_seeds(corpus, 20)):
            output = lstm.predict(seed, nb_words)
            print("%i %s -> %s" % (i, seed, output))
            f.writelines(output)

        # Interactive mode: generate from user-provided prompts until interrupted
        while True:
            input_text = input("> ")
            text = lstm.predict(input_text, nb_words)
            print(text)
            f.writelines("%s\n" % text)


if __name__ == '__main__':
    train()
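The ModelCheckpoint callback above snapshots the network to HDF5 every 10 epochs. As a rough illustration of how such a snapshot could be reused outside LisSansTaMaman (this is not part of the commit, and the filename below is only a hypothetical instance of the "anar_lstm..." pattern), plain Keras can reload it directly:

```python
# Sketch only: reload a checkpoint written by the ModelCheckpoint callback above.
# The filename is a hypothetical example of the naming pattern, not a real artifact.
from keras.models import load_model

model = load_model("../models/anar/anar_lstm64-d0.4-50_200-0.9123.hdf5")
model.summary()  # same architecture that LisSansTaMaman.create_model built
```

Reloading this way only recovers the raw Keras model; seeding and detokenization would still have to go through the glossolalia tokenizer.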
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel


def main():
    example_gpt2()


def example_gpt2():
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # add the EOS token as PAD token to avoid warnings
    model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

    # encode the context the generation is conditioned on
    input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='tf')

    # set seed to reproduce results; feel free to change the seed to get different results
    tf.random.set_seed(0)

    # generate text until the output length (which includes the context length) reaches 50
    greedy_output = model.generate(input_ids, max_length=50)
    print("Output:\n" + 100 * '-')
    print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

    # activate beam search and set no_repeat_ngram_size to 2
    beam_output = model.generate(
        input_ids,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    print("Output:\n" + 100 * '-')
    print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

    # set num_return_sequences > 1
    beam_outputs = model.generate(
        input_ids,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        num_return_sequences=5,
        early_stopping=True
    )
    # now we have 5 output sequences
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))
# activate sampling and deactivate top_k by setting top_k sampling to 0
sample_output = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_k=0
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# use temperature to decrease the sensitivity to low probability candidates
sample_output = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_k=0,
temperature=0.7
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# set top_k to 50
sample_output = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_k=50
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# deactivate top_k sampling and sample only from 92% most likely words
sample_output = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_p=0.92,
top_k=0
)
print("Output:\n" + 100 * '-')
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = model.generate(
input_ids,
do_sample=True,
max_length=50,
top_k=50,
top_p=0.95,
num_return_sequences=3
)
print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
if __name__ == '__main__':
main()
# la pureté de l'émancipation
> Étude automatique sur la monstruosité
Le problème de l'afrique, c'est le développement de la sagesse.
La réalité de l'afrique, c'est d'apprendre de la compagnie de transports.
Les passagers sous-marins, puis les esprits.
Les tentacules peuvent enlacer les esprits.
Les incrédules sont les africains, dans les théâtres.
La france européenne vous appartient pas
souvenir de la fatigue du voyage
les passagers terrestres, éléphants et les chamanes.
Vous voulez le règlement pacifique de sa vengeance du 30 avril.
Fantastique animal endossa la responsabilité de ses mille chevaux-vapeur.
Le problème de l'afrique, c'est d'apprendre de formidables engins.
## Nicolas Sarkozy & Jules Verne
## Discours de Dakar / 20000 Lieues sous les mers
\ No newline at end of file
@@ -4,7 +4,7 @@ from typing import List
 import numpy as np
 from keras import Sequential, Model
 from keras.callbacks import Callback, History
-from keras.layers import Embedding, LSTM, Dropout, Dense
+from keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional
 from keras.utils import to_categorical
 from keras_preprocessing.sequence import pad_sequences
 from keras_preprocessing.text import Tokenizer
@@ -22,7 +22,9 @@ class LisSansTaMaman(object):
     def __init__(self, nb_layers: int = 100,
                  dropout: float = 0.1, validation_split: float = 0.0,
                  tokenizer=PoemTokenizer(),
+                 bidirectional: bool = False,
                  debug: bool = False):
+        self.bidirectional = bidirectional
         self.validation_split = validation_split
         self.dropout = dropout
         self.nb_layers = nb_layers
@@ -41,7 +43,9 @@ class LisSansTaMaman(object):
         inp_sequences, total_words = self.tokenizer.get_sequence_of_tokens(corpus)
         self.predictors, self.label, self.max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
-        model = create_model(self.max_sequence_len, total_words, layers=self.nb_layers, dropout=self.dropout)
+        model = create_model(self.max_sequence_len, total_words,
+                             layers=self.nb_layers, dropout=self.dropout,
+                             bidirectional=self.bidirectional)
         model.summary()
         self.model = model
@@ -60,10 +64,12 @@ class LisSansTaMaman(object):
     def predict_seeds(self, nb_words=None, seeds: List[str] = None):
         if seeds is None:
-            seeds = ["", "Je", "Tu", "Le", "La", "Les", "Un", "On", "Nous"]
+            seeds = ["", "Je", "Tu", "Le", "La", "Les", "Un", "On", "Nous", "L'", "Leur"]
         return [self.predict(seed, nb_words) for seed in seeds]

     def predict(self, seed="", nb_words=None):
+        if self.tokenizer.lower:
+            seed = seed.lower()
         if nb_words is None:
             nb_words = 20  # TODO: Guess based on model a good number of words
         return generate_text(self.model, self.tokenizer, seed, nb_words, self.max_sequence_len)
@@ -77,7 +83,8 @@ def generate_padded_sequences(input_sequences, total_words: int):
     return predictors, label, max_sequence_len

-def create_model(max_sequence_len: int, total_words: int, layers: int, dropout: float):
+def create_model(max_sequence_len: int, total_words: int, layers: int, dropout: float,
+                 bidirectional: bool = False):
     print("Creating model across %i words for %i-long seqs (%i layers, %.2f dropout):" %
           (total_words, max_sequence_len, layers, dropout))
     input_len = max_sequence_len - 1
@@ -86,9 +93,11 @@ def create_model(max_sequence_len: int, total_words: int, layers: int, dropout:
     # Add Input Embedding Layer
     model.add(Embedding(total_words, 10, input_length=input_len))
-    # Add Hidden Layer 1 - LSTM Layer
-    model.add(LSTM(layers))
-    # model.add(Bidirectional(LSTM(layers), input_shape=(max_sequence_len, total_words)))
+    if bidirectional:
+        model.add(Bidirectional(LSTM(layers), input_shape=(max_sequence_len, total_words)))
+    else:
+        # Add Hidden Layer 1 - LSTM Layer
+        model.add(LSTM(layers))
     model.add(Dropout(dropout))
     # Add Output Layer
...
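For orientation, here is a minimal standalone sketch of the network create_model assembles when bidirectional=True. The Dense softmax output layer and the compile settings are assumptions (that part of create_model is collapsed in the diff above); note also that the Bidirectional wrapper does not strictly need an input_shape here, since the preceding Embedding layer already fixes the input dimensions.

```python
# Hedged sketch: the output layer and compile arguments are assumed, not shown in this commit.
from keras import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Dropout, Dense


def build_bidirectional_model(max_sequence_len: int, total_words: int,
                              layers: int = 64, dropout: float = 0.4) -> Sequential:
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=input_len))
    # One LSTM reads the sequence left-to-right, a twin reads it right-to-left;
    # their outputs are concatenated, so 2 * layers features flow into Dropout.
    model.add(Bidirectional(LSTM(layers)))
    model.add(Dropout(dropout))
    model.add(Dense(total_words, activation='softmax'))  # assumed output layer
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])  # assumed, to match the accuracy-based callbacks
    return model
```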
@@ -10,6 +10,7 @@ class PoemTokenizer(Tokenizer):
                          **kwargs)  # TODO: keep newlines

     def get_sequence_of_tokens(self, corpus):
+        corpus = self.preprocess(corpus)
         self.fit_on_texts(corpus)
         total_words = len(self.word_index) + 1
@@ -30,6 +31,10 @@ class PoemTokenizer(Tokenizer):
     def get_text(self, sequence):
         return self.sequences_to_texts(sequence)

+    def preprocess(self, corpus):
+        # TODO: Preprocess lines
+        return corpus
+
 if __name__ == '__main__':
     kawa = load_texts("../")
...
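The new preprocess hook is still a pass-through (its body is just the # TODO: Preprocess lines comment). Purely as an illustration of the kind of line-level cleanup that hook could host later, and assuming the corpus is a list of text lines as the loader slicing above suggests, one possibility is:

```python
# Hypothetical sketch of the TODO; the committed preprocess() returns the corpus unchanged.
def preprocess(self, corpus):
    cleaned = []
    for line in corpus:
        line = line.strip()  # drop surrounding whitespace
        if line:             # skip empty lines
            cleaned.append(line)
    return cleaned
```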