refactor(dawa): Extract loader & tokens

parent 5338bfc8
import os
import string

from numpy.random import seed
from tensorflow_core.python.framework.random_seed import set_random_seed


def load_kawa(root="./"):
    # set seeds for reproducibility
    set_random_seed(2)
    seed(1)

    data_dir = root + 'data/'
    all_lines = []
    for filename in os.listdir(data_dir):
        with open(data_dir + filename) as f:
            content = f.readlines()
        all_lines.extend(content)

    # keep only lines that do not start with "[" or "#"
    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]]
    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
    return all_lines
def clean_text(lines):
    """
    In the dataset preparation step, we first clean the text,
    which includes removing punctuation and lowercasing all words.
    """
    lines = " ".join(v for v in lines if v not in string.punctuation).lower()
    lines = lines.encode("utf8").decode("ascii", 'ignore')
    return lines


def main():
    lines = load_kawa("../")
    clean = clean_text(lines)
    print(clean)


if __name__ == '__main__':
    main()
import os
import string
import warnings

import numpy as np
from keras import Sequential
from keras.engine.saving import load_model
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from numpy.random import seed
from tensorflow_core.python.framework.random_seed import set_random_seed

from KoozDawa.dawa.loader import load_kawa, clean_text
from KoozDawa.dawa.tokens import get_sequence_of_tokens

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
def load():
    # set seeds for reproducibility
    set_random_seed(2)
    seed(1)

    data_dir = 'data/'
    all_lines = []
    for filename in os.listdir(data_dir):
        with open(data_dir + filename) as f:
            content = f.readlines()
        all_lines.extend(content)

    all_lines = [h for h in all_lines if h[0] not in ["[", "#"]]
    print("Loaded %i lines of data: %s." % (len(all_lines), all_lines[0]))
    return all_lines
# 3.1 Dataset cleaning
# In the dataset preparation step, we first clean the text, which includes
# removing punctuation and lowercasing all words.
def clean_text(txt):
    txt = " ".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii", 'ignore')
    return txt
# 3.2 Generating Sequence of N-gram Tokens
#
# Language modelling requires sequence input data: given a sequence of
# words/tokens, the aim is to predict the next word/token.
#
# The next step is tokenization: the process of extracting tokens (terms / words)
# from a corpus. Keras has a built-in Tokenizer which can be used to obtain the
# tokens and their indices in the corpus. After this step, every text document in
# the dataset is converted into a sequence of tokens. A toy example follows
# get_sequence_of_tokens below.
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    # TODO Tokenize while keeping accents
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # convert data to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words
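# A toy illustration (hypothetical one-line corpus, not from the repository's data)
# of the n-gram expansion above:
#
#   corpus = ["le soleil brille fort"]
#   tokenizer.fit_on_texts(corpus)        # word_index: {'le': 1, 'soleil': 2, 'brille': 3, 'fort': 4}
#   tokenizer.texts_to_sequences(corpus)  # [[1, 2, 3, 4]]
#   input_sequences                       # [[1, 2], [1, 2, 3], [1, 2, 3, 4]]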
# 3.3 Padding the Sequences and obtaining the Variables: Predictors and Target
def generate_padded_sequences(input_sequences, total_words):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    print("Max len:", max_sequence_len)

    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len
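# A toy illustration (hypothetical values) of the padding and split above:
# with max_sequence_len = 4, pre-padding and slicing give
#
#   [1, 2]       -> padded [0, 0, 1, 2] -> predictors [0, 0, 1], label 2
#   [1, 2, 3, 4] -> padded [1, 2, 3, 4] -> predictors [1, 2, 3], label 4
#
# `label` is then one-hot encoded over the total_words-sized vocabulary.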
@@ -98,7 +45,7 @@ def create_model(max_sequence_len, total_words, layers=100, dropout=0.1): # TOD
    return model
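# create_model's body is collapsed in this diff. The following is only a hypothetical
# sketch consistent with the imports above and the signature in the hunk header
# (an Embedding -> LSTM -> Dropout -> Dense stack); the repository's actual
# implementation may differ.
def create_model_sketch(max_sequence_len, total_words, layers=100, dropout=0.1):
    model = Sequential()
    # input length is max_sequence_len - 1: the last token of each sequence is the label
    model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
    model.add(LSTM(layers))
    model.add(Dropout(dropout))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model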
-def generate_text(seed_text, nb_words, model, max_sequence_len):
+def generate_text(model, tokenizer, seed_text="", nb_words=5, max_sequence_len=0):
    for _ in range(nb_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
@@ -115,17 +62,18 @@ def generate_text(seed_text, nb_words, model, max_sequence_len):
def main():
    should_train = True
-    nb_epoch = 20
+    nb_epoch = 100
    model_file = "../models/dawa_lstm_%i.hd5" % nb_epoch
    max_sequence_len = 5  # TODO: Test different default
+    tokenizer = Tokenizer()

    if should_train:
-        lines = load()
+        lines = load_kawa()
        corpus = [clean_text(x) for x in lines]
        print(corpus[:10])

-        inp_sequences, total_words = get_sequence_of_tokens(corpus)
+        inp_sequences, total_words = get_sequence_of_tokens(corpus, tokenizer)
        print(inp_sequences[:10])

        predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
@@ -139,12 +87,12 @@ def main():
    else:
        model = load_model(model_file)

-    print(generate_text("", 10, model, max_sequence_len))
-    print(generate_text("L'étoile", 10, model, max_sequence_len))
+    for sample in ["", "L'étoile ", "Elle ", "Les punchlines "]:
+        print(generate_text(model, tokenizer, sample, 100, max_sequence_len))

    while True:
        input_text = input("> ")
-        print(generate_text(input_text, 10, model, max_sequence_len))
+        print(generate_text(model, tokenizer, input_text, 100, max_sequence_len))
if __name__ == '__main__':
    main()
from keras_preprocessing.text import Tokenizer

from KoozDawa.dawa.loader import load_kawa


def get_sequence_of_tokens(corpus, tokenizer=Tokenizer()):
    # TODO Tokenize while keeping accents
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # convert data to sequence of tokens
    input_sequences = []

    # FIXME Debug: truncate corpus
    corpus = corpus[:50]

    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)

    texts = tokenizer.sequences_to_texts(input_sequences)
    print("Tokenized:", texts)
    return input_sequences, total_words


if __name__ == '__main__':
    kawa = load_kawa("../")
    seqs, words = get_sequence_of_tokens(kawa)
    print("%i words." % words)