feat(LSTM): MVP

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
import os
import string
import warnings

import numpy as np
from keras import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from numpy.random import seed
# Public TF 1.x entry point (tf.set_random_seed) instead of the private tensorflow_core path.
from tensorflow import set_random_seed

warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

def load():
    # set seeds for reproducibility
    set_random_seed(2)
    seed(1)
    data_dir = 'data/'
    all_lines = []
    for filename in os.listdir(data_dir):
        with open(os.path.join(data_dir, filename)) as f:
            all_lines.extend(f.readlines())
    # drop metadata lines that start with "["
    all_lines = [h for h in all_lines if h[0] != "["]
    print("Loaded", len(all_lines), "lines; first line:", all_lines[0])
    return all_lines

# 3.1 Dataset cleaning
# In the dataset preparation step, we first clean the text: punctuation is removed
# and all words are lower-cased.
def clean_text(txt):
    # "".join (not " ".join) so characters within a word stay glued together
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    # crude ASCII round-trip: accented characters are dropped entirely
    txt = txt.encode("utf8").decode("ascii", 'ignore')
    return txt
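
# Illustration on hypothetical inputs (not from the real dataset): punctuation is
# dropped, text is lower-cased, and the ASCII round-trip strips accents, e.g.
#   clean_text("Sous le ciel de Paris!")  ->  "sous le ciel de paris"
#   clean_text("L'étoile")                ->  "ltoile"   (hence the accents TODO below)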

# 3.2 Generating sequences of n-gram tokens
#
# Language modelling requires sequential input data: given a sequence (of words/tokens),
# the aim is to predict the next word/token.
#
# The next step is tokenization, the process of extracting tokens (terms/words) from a
# corpus. Keras has a built-in Tokenizer that can be used to obtain the tokens and their
# indices in the corpus. After this step, every text document in the dataset is converted
# into a sequence of tokens.
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    # TODO Tokenize while keeping accents
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    # convert data to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words
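
# Sketch of what get_sequence_of_tokens produces on a toy corpus (indices shown are
# illustrative; they depend on how the Tokenizer is fitted):
#   corpus = ["le ciel de paris"]   with word_index  le=1, ciel=2, de=3, paris=4
#   input_sequences = [[1, 2], [1, 2, 3], [1, 2, 3, 4]]
# i.e. every n-gram prefix of each line, later split into (context -> next word) pairs.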

# 3.3 Padding the sequences and obtaining the variables: predictors and target
def generate_padded_sequences(input_sequences, total_words):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    # the last token of each padded sequence is the target; the rest are the predictors
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len
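
# Continuing the toy example above (illustrative only): with
#   input_sequences = [[1, 2], [1, 2, 3], [1, 2, 3, 4]]  and  max_sequence_len = 4,
# pre-padding yields
#   [[0, 0, 1, 2],
#    [0, 1, 2, 3],
#    [1, 2, 3, 4]]
# so predictors = [[0, 0, 1], [0, 1, 2], [1, 2, 3]] and label holds the one-hot
# encodings of [2, 3, 4].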

def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))  # TODO finetune
    model.add(Dropout(0.1))  # TODO finetune
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model
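
# Tensor shapes through the network, for reference (batch dimension omitted):
#   Embedding: (input_len,) integer indices -> (input_len, 10) dense vectors
#   LSTM:      (input_len, 10)              -> (100,) final hidden state
#   Dense:     (100,)                       -> (total_words,) softmax distribution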

def generate_text(seed_text, nb_words, model, max_sequence_len):
    for _ in range(nb_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # argmax over the softmax output (predict_classes is deprecated in newer Keras)
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()
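
# Hypothetical usage (the continuation depends entirely on the trained weights):
#   generate_text("sous le ciel", 3, model, max_sequence_len)
# returns the seed plus three greedily chosen words, title-cased by .title().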

def main():
    lines = load()
    corpus = [clean_text(x) for x in lines]
    print(corpus[:10])
    inp_sequences, total_words = get_sequence_of_tokens(corpus[:10])  # FIXME: corpus capped at 10 lines for debugging
    print(inp_sequences[:10])
    predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences, total_words)
    print(predictors, label, max_sequence_len)
    model = create_model(max_sequence_len, total_words)
    model.summary()
    model.fit(predictors, label, epochs=10, verbose=2)  # verbose=2: one line per epoch
    print(generate_text("", 10, model, max_sequence_len))
    print(generate_text("L'étoile", 10, model, max_sequence_len))


if __name__ == '__main__':
    main()