From be8f1bc238a15321fef5798e91e99e78a208ed47 Mon Sep 17 00:00:00 2001 From: pm3310 Date: Tue, 24 Apr 2018 19:31:41 +0100 Subject: [PATCH] Deep Learning addition commands --- .flake8 | 2 + .gitignore | 55 +++++----------- Makefile | 114 ++++++++++++++++++++++++++++++++ README.md | 19 +++++- models/.gitkeep | 0 notebooks/.gitkeep | 0 references/.gitkeep | 0 reports/.gitkeep | 0 reports/figures/.gitkeep | 0 requirements.txt | 51 +++++++++++++++ src/__init__.py | 0 src/character_encoder.py | 36 ++++++++++ src/data/.gitkeep | 0 src/data/__init__.py | 0 src/data/make_dataset.py | 105 +++++++++++++++++++++++++++++ src/encoding_utils.py | 21 ++++++ src/models/.gitkeep | 0 src/models/__init__.py | 0 src/models/train_model.py | 116 +++++++++++++++++++++++++++++++++ src/settings.py | 21 ++++++ src/visualization/.gitkeep | 0 src/visualization/visualize.py | 0 test_environment.py | 25 +++++++ tox.ini | 3 + 24 files changed, 530 insertions(+), 38 deletions(-) create mode 100644 .flake8 create mode 100644 Makefile create mode 100644 models/.gitkeep create mode 100644 notebooks/.gitkeep create mode 100644 references/.gitkeep create mode 100644 reports/.gitkeep create mode 100644 reports/figures/.gitkeep create mode 100644 requirements.txt create mode 100644 src/__init__.py create mode 100644 src/character_encoder.py create mode 100644 src/data/.gitkeep create mode 100644 src/data/__init__.py create mode 100644 src/data/make_dataset.py create mode 100644 src/encoding_utils.py create mode 100644 src/models/.gitkeep create mode 100644 src/models/__init__.py create mode 100644 src/models/train_model.py create mode 100644 src/settings.py create mode 100644 src/visualization/.gitkeep create mode 100644 src/visualization/visualize.py create mode 100644 test_environment.py create mode 100644 tox.ini diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..bebbaeb --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length=100 \ No newline at end of file diff --git 
a/.gitignore b/.gitignore index 7bbc71c..ad7ed72 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] -*$py.class # C extensions *.so @@ -20,7 +19,6 @@ lib64/ parts/ sdist/ var/ -wheels/ *.egg-info/ .installed.cfg *.egg @@ -43,8 +41,7 @@ htmlcov/ .cache nosetests.xml coverage.xml -*.cover -.hypothesis/ +*,cover # Translations *.mo @@ -52,14 +49,6 @@ coverage.xml # Django stuff: *.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy # Sphinx documentation docs/_build/ @@ -67,35 +56,27 @@ docs/_build/ # PyBuilder target/ -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule +# DotEnv configuration +.env -# SageMath parsed files -*.sage.py +# Database +*.db +*.rdb -# dotenv -.env +# Pycharm +.idea -# virtualenv -.venv -venv/ -ENV/ +# VS Code +.vscode/ -# Spyder project settings -.spyderproject -.spyproject +# Spyder +.spyproject/ -# Rope project settings -.ropeproject +# Jupyter NB Checkpoints +.ipynb_checkpoints/ -# mkdocs documentation -/site +# exclude data from source control by default +/data/ -# mypy -.mypy_cache/ +# Mac OS-specific storage files +.DS_Store diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5e7fc02 --- /dev/null +++ b/Makefile @@ -0,0 +1,114 @@ +.PHONY: clean data lint requirements train + +################################################################################# +# GLOBALS # +################################################################################# + +PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +PROFILE = default +PROJECT_NAME = deep-learning-addition +PYTHON_INTERPRETER = python3 + +################################################################################# +# COMMANDS # +################################################################################# + +## Install Python Dependencies +requirements: 
test_environment + pip install -r requirements.txt + +## Make Dataset +data: requirements + PYTHONPATH='.' $(PYTHON_INTERPRETER) src/data/make_dataset.py ./data/processed/ + +## Train Deep Learning Model +train: requirements + PYTHONPATH='.' $(PYTHON_INTERPRETER) src/models/train_model.py ./data/processed/ ./models/ + +## Delete all compiled Python files +clean: + find . -type f -name "*.py[co]" -delete + find . -type d -name "__pycache__" -delete + +## Lint using flake8 +lint: + flake8 src + +## Set up python interpreter environment +create_environment: + @pip install -q virtualenv virtualenvwrapper + @echo ">>> Installing virtualenvwrapper if not already intalled.\nMake sure the following lines are in shell startup file\n\ + export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" + @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" + @echo ">>> New virtualenv created. 
Activate with:\nworkon $(PROJECT_NAME)" + +## Test python environment is setup correctly +test_environment: + $(PYTHON_INTERPRETER) test_environment.py + +################################################################################# +# PROJECT RULES # +################################################################################# + + + +################################################################################# +# Self Documenting Commands # +################################################################################# + +.DEFAULT_GOAL := show-help + +# Inspired by +# sed script explained: +# /^##/: +# * save line in hold space +# * purge line +# * Loop: +# * append newline + line to hold space +# * go to next line +# * if line starts with doc comment, strip comment character off and loop +# * remove target prerequisites +# * append hold space (+ newline) to line +# * replace newline plus comments by `---` +# * print line +# Separate expressions are necessary because labels cannot be delimited by +# semicolon; see +.PHONY: show-help +show-help: + @echo "$$(tput bold)Available rules:$$(tput sgr0)" + @echo + @sed -n -e "/^## / { \ + h; \ + s/.*//; \ + :doc" \ + -e "H; \ + n; \ + s/^## //; \ + t doc" \ + -e "s/:.*//; \ + G; \ + s/\\n## /---/; \ + s/\\n/ /g; \ + p; \ + }" ${MAKEFILE_LIST} \ + | LC_ALL='C' sort --ignore-case \ + | awk -F '---' \ + -v ncol=$$(tput cols) \ + -v indent=19 \ + -v col_on="$$(tput setaf 6)" \ + -v col_off="$$(tput sgr0)" \ + '{ \ + printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ + n = split($$2, words, " "); \ + line_length = ncol - indent; \ + for (i = 1; i <= n; i++) { \ + line_length -= length(words[i]) + 1; \ + if (line_length <= 0) { \ + line_length = ncol - indent - length(words[i]) - 1; \ + printf "\n%*s ", -indent, " "; \ + } \ + printf "%s ", words[i]; \ + } \ + printf "\n"; \ + }' \ + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/README.md b/README.md index 
0f9e118..4009e28 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,19 @@ # deep-learning-addition -An implementation of sequence to sequence learning for performing addition +An LSTM model that learns to perform arithmetic addition of integers. It's a simple Deep Learning calculator. + +## Development + +- Python 3.6 and 2.7 +- For Python 2.7, set `REQUIRED_PYTHON` in `test_environment.py` and `PYTHON_INTERPRETER` in `Makefile` to `python2` +- [awscli](https://pypi.python.org/pypi/awscli) installed and configured + +## Commands +``` +clean Delete all compiled Python files +create_environment Set up python interpreter environment +data Make Dataset +lint Lint using flake8 +requirements Install Python Dependencies +test_environment Test python environment is setup correctly +train Train Deep Learning Model +``` diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/references/.gitkeep b/references/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/.gitkeep b/reports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8d5574a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,51 @@ +absl-py==0.1.13 +alabaster==0.7.10 +astor==0.6.2 +awscli==1.15.4 +Babel==2.5.3 +bleach==1.5.0 +botocore==1.10.4 +certifi==2018.4.16 +chardet==3.0.4 +click==6.7 +colorama==0.3.7 +coverage==4.5.1 +docutils==0.14 +flake8==3.5.0 +gast==0.2.0 +grpcio==1.11.0 +h5py==2.7.1 +html5lib==0.9999999 +idna==2.6 +imagesize==1.0.0 +Jinja2==2.10 +jmespath==0.9.3 +Keras==2.1.5 +Markdown==2.6.11 +MarkupSafe==1.0 +mccabe==0.6.1 +numpy==1.14.2 +packaging==17.1 +protobuf==3.5.2.post1 +pyasn1==0.4.2 +pycodestyle==2.3.1 
+pyflakes==1.6.0 +Pygments==2.2.0 +pyparsing==2.2.0 +python-dateutil==2.6.1 +python-dotenv==0.8.2 +pytz==2018.4 +PyYAML==3.12 +requests==2.18.4 +rsa==3.4.2 +s3transfer==0.1.13 +scipy==1.0.1 +six==1.11.0 +snowballstemmer==1.2.1 +Sphinx==1.7.2 +sphinxcontrib-websupport==1.0.1 +tensorboard==1.7.0 +tensorflow==1.7.0 +termcolor==1.1.0 +urllib3==1.22 +Werkzeug==0.14.1 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/character_encoder.py b/src/character_encoder.py new file mode 100644 index 0000000..afac276 --- /dev/null +++ b/src/character_encoder.py @@ -0,0 +1,36 @@ +import numpy as np + + +class CharacterTable(object): + """Given a set of characters: + + Encode them to a one hot integer representation + + Decode the one hot integer representation to their character output + + Decode a vector of probabilities to their character output + """ + + def __init__(self, chars): + """Initialize character table. + + # Arguments + chars: Characters that can appear in the input. + """ + self.chars = sorted(set(chars)) + self.char_indices = dict((c, i) for i, c in enumerate(self.chars)) + self.indices_char = dict((i, c) for i, c in enumerate(self.chars)) + + def encode(self, input_string, num_rows): + """One hot encode given string input_string. + + # Arguments + num_rows: Number of rows in the returned one hot encoding. This is + used to keep the # of rows for each data the same. 
+ """ + x = np.zeros((num_rows, len(self.chars))) + for i, c in enumerate(input_string): + x[i, self.char_indices[c]] = 1 + return x + + def decode(self, x, calc_argmax=True): + if calc_argmax: + x = x.argmax(axis=-1) + return ''.join(self.indices_char[x] for x in x) diff --git a/src/data/.gitkeep b/src/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/data/__init__.py b/src/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py new file mode 100644 index 0000000..a311f4c --- /dev/null +++ b/src/data/make_dataset.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals, absolute_import + +import logging +import os + +import click +import numpy as np + +from src.models.train_model import CHARS +from src.settings import TRAINING_SIZE, DIGITS, REVERSE, MAXLEN, CTABLE + +logger = logging.getLogger(__name__) + + +def _generate_data(): + questions = [] + expected = [] + seen = set() + print('Generating data...') + while len(questions) < TRAINING_SIZE: + def func(): + return int(''.join(np.random.choice(list('0123456789')) + for _ in range(np.random.randint(1, DIGITS + 1)))) + + a, b = func(), func() + # Skip any addition questions we've already seen + # Also skip any such that x+Y == Y+x (hence the sorting). + key = tuple(sorted((a, b))) + if key in seen: + continue + seen.add(key) + # Pad the data with spaces such that it is always MAXLEN. + q = '{}+{}'.format(a, b) + query = q + ' ' * (MAXLEN - len(q)) + ans = str(a + b) + # Answers can be of maximum size DIGITS + 1. + ans += ' ' * (DIGITS + 1 - len(ans)) + if REVERSE: + # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the + # space used for padding.) 
+ query = query[::-1] + questions.append(query) + expected.append(ans) + print('Total addition questions:', len(questions)) + + return questions, expected + + +def _vectorization(questions, expected): + print('Vectorization...') + + x = np.zeros((len(questions), MAXLEN, len(CHARS)), dtype=np.bool) + y = np.zeros((len(questions), DIGITS + 1, len(CHARS)), dtype=np.bool) + for i, sentence in enumerate(questions): + x[i] = CTABLE.encode(sentence, MAXLEN) + for i, sentence in enumerate(expected): + y[i] = CTABLE.encode(sentence, DIGITS + 1) + + # Shuffle (x, y) in unison as the later parts of x will almost all be larger + # digits. + indices = np.arange(len(y)) + np.random.shuffle(indices) + x = x[indices] + y = y[indices] + + # Explicitly set apart 10% for validation data that we never train over. + split_at = len(x) - len(x) // 10 + (x_train, x_val) = x[:split_at], x[split_at:] + (y_train, y_val) = y[:split_at], y[split_at:] + + print('Training Data:') + print(x_train.shape) + print(y_train.shape) + + print('Validation Data:') + print(x_val.shape) + print(y_val.shape) + + return x_train, y_train, x_val, y_val + + +@click.command() +@click.argument('output_path', type=click.Path()) +def main(output_path): + """ Runs data processing scripts to save data in ../processed. 
+ """ + logger.info('making final data set') + + questions, expected = _generate_data() + + x_train, y_train, x_val, y_val = _vectorization(questions, expected) + + np.save(os.path.join(output_path, 'x_train.npy'), x_train) + np.save(os.path.join(output_path, 'y_train.npy'), y_train) + + np.save(os.path.join(output_path, 'x_val.npy'), x_val) + np.save(os.path.join(output_path, 'y_val.npy'), y_val) + + +if __name__ == '__main__': + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) + + main() diff --git a/src/encoding_utils.py b/src/encoding_utils.py new file mode 100644 index 0000000..2a9b237 --- /dev/null +++ b/src/encoding_utils.py @@ -0,0 +1,21 @@ +from src.settings import CTABLE, MAXLEN + + +def encode_query(input_string): + """ + Encode a query addition string + :param input_string: [str], input query string, i.e. '123+456' + :return: [str], encoded query ready to be used in model.predict(...) + """ + output = CTABLE.encode(input_string[::-1], MAXLEN) + + return output.reshape((1, output.shape[0], output.shape[1])) + + +def decode_prediction(input_array): + """ + Decode model prediction + :param input_array: [numpy.array], input numpy array + :return: [str], decoded array to string + """ + return CTABLE.decode(input_array[0]) diff --git a/src/models/.gitkeep b/src/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/train_model.py b/src/models/train_model.py new file mode 100644 index 0000000..28bf85e --- /dev/null +++ b/src/models/train_model.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function, absolute_import, unicode_literals + +import logging +import os + +import click +from keras.models import Sequential +from keras import layers +import numpy as np + +from src.settings import HIDDEN_SIZE, MAXLEN, RNN, DIGITS, LAYERS, 
BATCH_SIZE, REVERSE, CHARS, \ + CTABLE + +logger = logging.getLogger(__name__) + + +class Colors: + ok = '\033[92m' + fail = '\033[91m' + close = '\033[0m' + + +def _build_model(): + print('Build model...') + model = Sequential() + # "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE. + # Note: In a situation where your input sequences have a variable length, + # use input_shape=(None, num_feature). + model.add(RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(CHARS)))) + # As the decoder RNN's input, repeatedly provide with the last hidden state of + # RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum + # length of output, e.g., when DIGITS=3, max output is 999+999=1998. + model.add(layers.RepeatVector(DIGITS + 1)) + # The decoder RNN could be multiple layers stacked or a single layer. + for _ in range(LAYERS): + # By setting return_sequences to True, return not only the last output but + # all the outputs so far in the form of (num_samples, timesteps, + # output_dim). This is necessary as TimeDistributed in the below expects + # the first dimension to be the timesteps. + model.add(RNN(HIDDEN_SIZE, return_sequences=True)) + + # Apply a dense layer to the every temporal slice of an input. For each of step + # of the output sequence, decide which character should be chosen. + model.add(layers.TimeDistributed(layers.Dense(len(CHARS)))) + model.add(layers.Activation('softmax')) + model.compile(loss='categorical_crossentropy', + optimizer='adam', + metrics=['accuracy']) + model.summary() + + return model + + +def _train_model(model, x_train, y_train, x_val, y_val): + # Train the model each generation and show predictions against the validation + # dataset. 
+ for iteration in range(1, 15): + print() + print('-' * 50) + print('Iteration', iteration) + model.fit(x_train, y_train, + batch_size=BATCH_SIZE, + epochs=1, + validation_data=(x_val, y_val)) + # Select 10 samples from the validation set at random so we can visualize + # errors. + for i in range(10): + ind = np.random.randint(0, len(x_val)) + rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])] + preds = model.predict_classes(rowx, verbose=0) + query = CTABLE.decode(rowx[0]) + correct = CTABLE.decode(rowy[0]) + guess = CTABLE.decode(preds[0], calc_argmax=False) + print('Q', query[::-1] if REVERSE else query, end=' ') + print('T', correct, end=' ') + if correct == guess: + print(Colors.ok + '☑' + Colors.close, end=' ') + else: + print(Colors.fail + '☒' + Colors.close, end=' ') + print(guess) + + return model + + +def train(input_path, output_path): + x_train = np.load(os.path.join(input_path, 'x_train.npy')) + y_train = np.load(os.path.join(input_path, 'y_train.npy')) + x_val = np.load(os.path.join(input_path, 'x_val.npy')) + y_val = np.load(os.path.join(input_path, 'y_val.npy')) + + model = _build_model() + + trained_model = _train_model( + model=model, + x_train=x_train, + y_train=y_train, + x_val=x_val, + y_val=y_val + ) + + trained_model.save(os.path.join(output_path, 'model.h5')) + + +@click.command() +@click.argument('input_path', type=click.Path(exists=True)) +@click.argument('output_path', type=click.Path()) +def main(input_path, output_path): + train(input_path, output_path) + + +if __name__ == '__main__': + log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + logging.basicConfig(level=logging.INFO, format=log_fmt) + + main() diff --git a/src/settings.py b/src/settings.py new file mode 100644 index 0000000..be7a99f --- /dev/null +++ b/src/settings.py @@ -0,0 +1,21 @@ +# Parameters for the model and dataset. 
+from keras import layers + +from src.character_encoder import CharacterTable + +TRAINING_SIZE = 50000 +DIGITS = 3 +REVERSE = True + +# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of +# int is DIGITS. +MAXLEN = DIGITS + 1 + DIGITS + +RNN = layers.LSTM +HIDDEN_SIZE = 128 +BATCH_SIZE = 128 +LAYERS = 1 + +# All the numbers, plus sign and space for padding. +CHARS = '0123456789+ ' +CTABLE = CharacterTable(CHARS) diff --git a/src/visualization/.gitkeep b/src/visualization/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py new file mode 100644 index 0000000..e69de29 diff --git a/test_environment.py b/test_environment.py new file mode 100644 index 0000000..30361a1 --- /dev/null +++ b/test_environment.py @@ -0,0 +1,25 @@ +import sys + +REQUIRED_PYTHON = "python3" + + +def main(): + system_major = sys.version_info.major + if REQUIRED_PYTHON == "python2": + required_major = 2 + elif REQUIRED_PYTHON == "python3": + required_major = 3 + else: + raise ValueError("Unrecognized python interpreter: {}".format( + REQUIRED_PYTHON)) + + if system_major != required_major: + raise TypeError( + "This project requires Python {}. Found: Python {}".format( + required_major, sys.version)) + else: + print(">>> Development environment passes all tests!") + + +if __name__ == '__main__': + main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..c32fbd8 --- /dev/null +++ b/tox.ini @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 79 +max-complexity = 10