From be8f1bc238a15321fef5798e91e99e78a208ed47 Mon Sep 17 00:00:00 2001
From: pm3310
Date: Tue, 24 Apr 2018 19:31:41 +0100
Subject: [PATCH] Deep Learning addition commands
---
.flake8 | 2 +
.gitignore | 55 +++++-----------
Makefile | 114 ++++++++++++++++++++++++++++++++
README.md | 19 +++++-
models/.gitkeep | 0
notebooks/.gitkeep | 0
references/.gitkeep | 0
reports/.gitkeep | 0
reports/figures/.gitkeep | 0
requirements.txt | 51 +++++++++++++++
src/__init__.py | 0
src/character_encoder.py | 36 ++++++++++
src/data/.gitkeep | 0
src/data/__init__.py | 0
src/data/make_dataset.py | 105 +++++++++++++++++++++++++++++
src/encoding_utils.py | 21 ++++++
src/models/.gitkeep | 0
src/models/__init__.py | 0
src/models/train_model.py | 116 +++++++++++++++++++++++++++++++++
src/settings.py | 21 ++++++
src/visualization/.gitkeep | 0
src/visualization/visualize.py | 0
test_environment.py | 25 +++++++
tox.ini | 3 +
24 files changed, 530 insertions(+), 38 deletions(-)
create mode 100644 .flake8
create mode 100644 Makefile
create mode 100644 models/.gitkeep
create mode 100644 notebooks/.gitkeep
create mode 100644 references/.gitkeep
create mode 100644 reports/.gitkeep
create mode 100644 reports/figures/.gitkeep
create mode 100644 requirements.txt
create mode 100644 src/__init__.py
create mode 100644 src/character_encoder.py
create mode 100644 src/data/.gitkeep
create mode 100644 src/data/__init__.py
create mode 100644 src/data/make_dataset.py
create mode 100644 src/encoding_utils.py
create mode 100644 src/models/.gitkeep
create mode 100644 src/models/__init__.py
create mode 100644 src/models/train_model.py
create mode 100644 src/settings.py
create mode 100644 src/visualization/.gitkeep
create mode 100644 src/visualization/visualize.py
create mode 100644 test_environment.py
create mode 100644 tox.ini
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..bebbaeb
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length=100
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 7bbc71c..ad7ed72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
-*$py.class
# C extensions
*.so
@@ -20,7 +19,6 @@ lib64/
parts/
sdist/
var/
-wheels/
*.egg-info/
.installed.cfg
*.egg
@@ -43,8 +41,7 @@ htmlcov/
.cache
nosetests.xml
coverage.xml
-*.cover
-.hypothesis/
+*.cover
# Translations
*.mo
@@ -52,14 +49,6 @@ coverage.xml
# Django stuff:
*.log
-local_settings.py
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
# Sphinx documentation
docs/_build/
@@ -67,35 +56,27 @@ docs/_build/
# PyBuilder
target/
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
+# DotEnv configuration
+.env
-# SageMath parsed files
-*.sage.py
+# Database
+*.db
+*.rdb
-# dotenv
-.env
+# Pycharm
+.idea
-# virtualenv
-.venv
-venv/
-ENV/
+# VS Code
+.vscode/
-# Spyder project settings
-.spyderproject
-.spyproject
+# Spyder
+.spyproject/
-# Rope project settings
-.ropeproject
+# Jupyter NB Checkpoints
+.ipynb_checkpoints/
-# mkdocs documentation
-/site
+# Exclude data from source control by default
+/data/
-# mypy
-.mypy_cache/
+# Mac OS-specific storage files
+.DS_Store
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5e7fc02
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,114 @@
+.PHONY: clean data lint requirements train
+
+#################################################################################
+# GLOBALS #
+#################################################################################
+
+PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+PROFILE = default
+PROJECT_NAME = deep-learning-addition
+PYTHON_INTERPRETER = python3
+
+#################################################################################
+# COMMANDS #
+#################################################################################
+
+## Install Python Dependencies
+requirements: test_environment
+ pip install -r requirements.txt
+
+## Make Dataset
+data: requirements
+ PYTHONPATH='.' $(PYTHON_INTERPRETER) src/data/make_dataset.py ./data/processed/
+
+## Train Deep Learning Model
+train: requirements
+ PYTHONPATH='.' $(PYTHON_INTERPRETER) src/models/train_model.py ./data/processed/ ./models/
+
+## Delete all compiled Python files
+clean:
+ find . -type f -name "*.py[co]" -delete
+ find . -type d -name "__pycache__" -delete
+
+## Lint using flake8
+lint:
+ flake8 src
+
+## Set up python interpreter environment
+create_environment:
+ @pip install -q virtualenv virtualenvwrapper
+	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in your shell startup file:\n\
+ export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
+ @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
+ @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
+
+## Test python environment is setup correctly
+test_environment:
+ $(PYTHON_INTERPRETER) test_environment.py
+
+#################################################################################
+# PROJECT RULES #
+#################################################################################
+
+
+
+#################################################################################
+# Self Documenting Commands #
+#################################################################################
+
+.DEFAULT_GOAL := show-help
+
+# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
+# sed script explained:
+# /^##/:
+# * save line in hold space
+# * purge line
+# * Loop:
+# * append newline + line to hold space
+# * go to next line
+# * if line starts with doc comment, strip comment character off and loop
+# * remove target prerequisites
+# * append hold space (+ newline) to line
+# * replace newline plus comments by `---`
+# * print line
+# Separate expressions are necessary because labels cannot be delimited by
+# semicolon; see <http://stackoverflow.com/a/11799865/1593877>
+.PHONY: show-help
+show-help:
+ @echo "$$(tput bold)Available rules:$$(tput sgr0)"
+ @echo
+ @sed -n -e "/^## / { \
+ h; \
+ s/.*//; \
+ :doc" \
+ -e "H; \
+ n; \
+ s/^## //; \
+ t doc" \
+ -e "s/:.*//; \
+ G; \
+ s/\\n## /---/; \
+ s/\\n/ /g; \
+ p; \
+ }" ${MAKEFILE_LIST} \
+ | LC_ALL='C' sort --ignore-case \
+ | awk -F '---' \
+ -v ncol=$$(tput cols) \
+ -v indent=19 \
+ -v col_on="$$(tput setaf 6)" \
+ -v col_off="$$(tput sgr0)" \
+ '{ \
+ printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
+ n = split($$2, words, " "); \
+ line_length = ncol - indent; \
+ for (i = 1; i <= n; i++) { \
+ line_length -= length(words[i]) + 1; \
+ if (line_length <= 0) { \
+ line_length = ncol - indent - length(words[i]) - 1; \
+ printf "\n%*s ", -indent, " "; \
+ } \
+ printf "%s ", words[i]; \
+ } \
+ printf "\n"; \
+ }' \
+ | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
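
Note on the `show-help` machinery above: each `## comment` line directly above a target is what the sed script captures, so a rule such as

```
## Train Deep Learning Model
train: requirements
```

is rendered by `make show-help` as the target name `train` in cyan, followed by its wrapped description.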
diff --git a/README.md b/README.md
index 0f9e118..4009e28 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,19 @@
# deep-learning-addition
-An implementation of sequence to sequence learning for performing addition
+An LSTM model that learns to add integers; in effect, a simple Deep Learning calculator.
+
+## Development
+
+- Supports Python 3.6 and 2.7
+- For Python 2.7, set `REQUIRED_PYTHON` in `test_environment.py` and `PYTHON_INTERPRETER` in the `Makefile` to `python2`
+- [awscli](https://pypi.python.org/pypi/awscli) installed and configured
+
+## Commands
+```
+clean Delete all compiled Python files
+create_environment Set up python interpreter environment
+data Make Dataset
+lint Lint using flake8
+requirements Install Python Dependencies
+test_environment Test python environment is setup correctly
+train Train Deep Learning Model
+```
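
The targets chain their prerequisites (`data` and `train` both pull in `requirements`, which runs `test_environment` first), so a typical first run, assuming `awscli` and `virtualenvwrapper` are already configured, is:

```
make create_environment
make data    # builds ./data/processed/*.npy
make train   # trains the LSTM and saves ./models/model.h5
```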
diff --git a/models/.gitkeep b/models/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/references/.gitkeep b/references/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/reports/.gitkeep b/reports/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/reports/figures/.gitkeep b/reports/figures/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8d5574a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,51 @@
+absl-py==0.1.13
+alabaster==0.7.10
+astor==0.6.2
+awscli==1.15.4
+Babel==2.5.3
+bleach==1.5.0
+botocore==1.10.4
+certifi==2018.4.16
+chardet==3.0.4
+click==6.7
+colorama==0.3.7
+coverage==4.5.1
+docutils==0.14
+flake8==3.5.0
+gast==0.2.0
+grpcio==1.11.0
+h5py==2.7.1
+html5lib==0.9999999
+idna==2.6
+imagesize==1.0.0
+Jinja2==2.10
+jmespath==0.9.3
+Keras==2.1.5
+Markdown==2.6.11
+MarkupSafe==1.0
+mccabe==0.6.1
+numpy==1.14.2
+packaging==17.1
+protobuf==3.5.2.post1
+pyasn1==0.4.2
+pycodestyle==2.3.1
+pyflakes==1.6.0
+Pygments==2.2.0
+pyparsing==2.2.0
+python-dateutil==2.6.1
+python-dotenv==0.8.2
+pytz==2018.4
+PyYAML==3.12
+requests==2.18.4
+rsa==3.4.2
+s3transfer==0.1.13
+scipy==1.0.1
+six==1.11.0
+snowballstemmer==1.2.1
+Sphinx==1.7.2
+sphinxcontrib-websupport==1.0.1
+tensorboard==1.7.0
+tensorflow==1.7.0
+termcolor==1.1.0
+urllib3==1.22
+Werkzeug==0.14.1
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/character_encoder.py b/src/character_encoder.py
new file mode 100644
index 0000000..afac276
--- /dev/null
+++ b/src/character_encoder.py
@@ -0,0 +1,36 @@
+import numpy as np
+
+
+class CharacterTable(object):
+ """Given a set of characters:
+ + Encode them to a one hot integer representation
+ + Decode the one hot integer representation to their character output
+ + Decode a vector of probabilities to their character output
+ """
+
+ def __init__(self, chars):
+ """Initialize character table.
+
+ # Arguments
+ chars: Characters that can appear in the input.
+ """
+ self.chars = sorted(set(chars))
+ self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
+ self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
+
+ def encode(self, input_string, num_rows):
+ """One hot encode given string input_string.
+
+ # Arguments
+            num_rows: Number of rows in the returned one-hot encoding. This is
+                used to keep the number of rows the same for every sample.
+ """
+ x = np.zeros((num_rows, len(self.chars)))
+ for i, c in enumerate(input_string):
+ x[i, self.char_indices[c]] = 1
+ return x
+
+ def decode(self, x, calc_argmax=True):
+ if calc_argmax:
+ x = x.argmax(axis=-1)
+        return ''.join(self.indices_char[i] for i in x)
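
A minimal round-trip sketch for `CharacterTable` (illustrative; the character set matches `CHARS` in `src/settings.py` below):

```
from src.character_encoder import CharacterTable

ctable = CharacterTable('0123456789+ ')  # 12 distinct characters, sorted internally
one_hot = ctable.encode('12+34', 7)      # shape (7, 12); rows 5-6 are left all-zero
print(repr(ctable.decode(one_hot)))      # '12+34  ' -- all-zero rows argmax to ' '
```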
diff --git a/src/data/.gitkeep b/src/data/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/src/data/__init__.py b/src/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
new file mode 100644
index 0000000..a311f4c
--- /dev/null
+++ b/src/data/make_dataset.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals, absolute_import
+
+import logging
+import os
+
+import click
+import numpy as np
+
+from src.models.train_model import CHARS
+from src.settings import TRAINING_SIZE, DIGITS, REVERSE, MAXLEN, CTABLE
+
+logger = logging.getLogger(__name__)
+
+
+def _generate_data():
+ questions = []
+ expected = []
+ seen = set()
+ print('Generating data...')
+ while len(questions) < TRAINING_SIZE:
+ def func():
+ return int(''.join(np.random.choice(list('0123456789'))
+ for _ in range(np.random.randint(1, DIGITS + 1))))
+
+ a, b = func(), func()
+        # Skip any addition questions we've already seen.
+        # Also skip commutative duplicates, i.e. a+b vs b+a (hence the sorting).
+ key = tuple(sorted((a, b)))
+ if key in seen:
+ continue
+ seen.add(key)
+ # Pad the data with spaces such that it is always MAXLEN.
+ q = '{}+{}'.format(a, b)
+ query = q + ' ' * (MAXLEN - len(q))
+ ans = str(a + b)
+ # Answers can be of maximum size DIGITS + 1.
+ ans += ' ' * (DIGITS + 1 - len(ans))
+ if REVERSE:
+ # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the
+ # space used for padding.)
+ query = query[::-1]
+ questions.append(query)
+ expected.append(ans)
+ print('Total addition questions:', len(questions))
+
+ return questions, expected
+
+
+def _vectorization(questions, expected):
+ print('Vectorization...')
+
+ x = np.zeros((len(questions), MAXLEN, len(CHARS)), dtype=np.bool)
+ y = np.zeros((len(questions), DIGITS + 1, len(CHARS)), dtype=np.bool)
+ for i, sentence in enumerate(questions):
+ x[i] = CTABLE.encode(sentence, MAXLEN)
+ for i, sentence in enumerate(expected):
+ y[i] = CTABLE.encode(sentence, DIGITS + 1)
+
+ # Shuffle (x, y) in unison as the later parts of x will almost all be larger
+ # digits.
+ indices = np.arange(len(y))
+ np.random.shuffle(indices)
+ x = x[indices]
+ y = y[indices]
+
+ # Explicitly set apart 10% for validation data that we never train over.
+ split_at = len(x) - len(x) // 10
+ (x_train, x_val) = x[:split_at], x[split_at:]
+ (y_train, y_val) = y[:split_at], y[split_at:]
+
+ print('Training Data:')
+ print(x_train.shape)
+ print(y_train.shape)
+
+ print('Validation Data:')
+ print(x_val.shape)
+ print(y_val.shape)
+
+ return x_train, y_train, x_val, y_val
+
+
+@click.command()
+@click.argument('output_path', type=click.Path())
+def main(output_path):
+    """ Runs data generation and saves the train/validation arrays under OUTPUT_PATH.
+ """
+ logger.info('making final data set')
+
+ questions, expected = _generate_data()
+
+ x_train, y_train, x_val, y_val = _vectorization(questions, expected)
+
+ np.save(os.path.join(output_path, 'x_train.npy'), x_train)
+ np.save(os.path.join(output_path, 'y_train.npy'), y_train)
+
+ np.save(os.path.join(output_path, 'x_val.npy'), x_val)
+ np.save(os.path.join(output_path, 'y_val.npy'), y_val)
+
+
+if __name__ == '__main__':
+ log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ logging.basicConfig(level=logging.INFO, format=log_fmt)
+
+ main()
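
A worked instance of the padding and `REVERSE` trick used in `_generate_data` above, with the default `DIGITS = 3` and `MAXLEN = 7` (a standalone sketch):

```
a, b = 12, 345
q = '{}+{}'.format(a, b)        # '12+345'
query = q + ' ' * (7 - len(q))  # '12+345 ' -- padded to MAXLEN
query = query[::-1]             # ' 543+21' -- reversed encoder input
ans = str(a + b)                # '357'
ans += ' ' * (4 - len(ans))     # '357 ' -- padded to DIGITS + 1
```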
diff --git a/src/encoding_utils.py b/src/encoding_utils.py
new file mode 100644
index 0000000..2a9b237
--- /dev/null
+++ b/src/encoding_utils.py
@@ -0,0 +1,21 @@
+from src.settings import CTABLE, MAXLEN
+
+
+def encode_query(input_string):
+ """
+ Encode a query addition string
+    :param input_string: [str], input query string, e.g. '123+456'
+ :return: [str], encoded query ready to be used in model.predict(...)
+ """
+ output = CTABLE.encode(input_string[::-1], MAXLEN)
+
+ return output.reshape((1, output.shape[0], output.shape[1]))
+
+
+def decode_prediction(input_array):
+ """
+ Decode model prediction
+ :param input_array: [numpy.array], input numpy array
+    :return: [str], prediction decoded to a string
+ """
+ return CTABLE.decode(input_array[0])
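
Putting the two helpers together, a hypothetical inference snippet (assumes `make train` has already saved `models/model.h5`):

```
from keras.models import load_model

from src.encoding_utils import encode_query, decode_prediction

model = load_model('models/model.h5')
prediction = model.predict(encode_query('123+456'))  # shape (1, DIGITS + 1, len(CHARS))
print(decode_prediction(prediction).strip())         # expected: '579'
```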
diff --git a/src/models/.gitkeep b/src/models/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/models/train_model.py b/src/models/train_model.py
new file mode 100644
index 0000000..28bf85e
--- /dev/null
+++ b/src/models/train_model.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+from __future__ import print_function, absolute_import, unicode_literals
+
+import logging
+import os
+
+import click
+from keras.models import Sequential
+from keras import layers
+import numpy as np
+
+from src.settings import HIDDEN_SIZE, MAXLEN, RNN, DIGITS, LAYERS, BATCH_SIZE, REVERSE, CHARS, \
+ CTABLE
+
+logger = logging.getLogger(__name__)
+
+
+class Colors:
+ ok = '\033[92m'
+ fail = '\033[91m'
+ close = '\033[0m'
+
+
+def _build_model():
+ print('Build model...')
+ model = Sequential()
+ # "Encode" the input sequence using an RNN, producing an output of HIDDEN_SIZE.
+ # Note: In a situation where your input sequences have a variable length,
+ # use input_shape=(None, num_feature).
+ model.add(RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(CHARS))))
+    # As the decoder RNN's input, repeatedly provide the encoder's last hidden
+    # state for each time step. Repeat 'DIGITS + 1' times as that's the maximum
+ # length of output, e.g., when DIGITS=3, max output is 999+999=1998.
+ model.add(layers.RepeatVector(DIGITS + 1))
+ # The decoder RNN could be multiple layers stacked or a single layer.
+ for _ in range(LAYERS):
+ # By setting return_sequences to True, return not only the last output but
+ # all the outputs so far in the form of (num_samples, timesteps,
+        # output_dim). This is necessary as TimeDistributed below expects
+ # the first dimension to be the timesteps.
+ model.add(RNN(HIDDEN_SIZE, return_sequences=True))
+
+    # Apply a dense layer to every temporal slice of the input. For each step
+ # of the output sequence, decide which character should be chosen.
+ model.add(layers.TimeDistributed(layers.Dense(len(CHARS))))
+ model.add(layers.Activation('softmax'))
+ model.compile(loss='categorical_crossentropy',
+ optimizer='adam',
+ metrics=['accuracy'])
+ model.summary()
+
+ return model
+
+
+def _train_model(model, x_train, y_train, x_val, y_val):
+ # Train the model each generation and show predictions against the validation
+ # dataset.
+ for iteration in range(1, 15):
+ print()
+ print('-' * 50)
+ print('Iteration', iteration)
+ model.fit(x_train, y_train,
+ batch_size=BATCH_SIZE,
+ epochs=1,
+ validation_data=(x_val, y_val))
+ # Select 10 samples from the validation set at random so we can visualize
+ # errors.
+ for i in range(10):
+ ind = np.random.randint(0, len(x_val))
+ rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
+ preds = model.predict_classes(rowx, verbose=0)
+ query = CTABLE.decode(rowx[0])
+ correct = CTABLE.decode(rowy[0])
+ guess = CTABLE.decode(preds[0], calc_argmax=False)
+ print('Q', query[::-1] if REVERSE else query, end=' ')
+ print('T', correct, end=' ')
+ if correct == guess:
+ print(Colors.ok + '☑' + Colors.close, end=' ')
+ else:
+ print(Colors.fail + '☒' + Colors.close, end=' ')
+ print(guess)
+
+ return model
+
+
+def train(input_path, output_path):
+ x_train = np.load(os.path.join(input_path, 'x_train.npy'))
+ y_train = np.load(os.path.join(input_path, 'y_train.npy'))
+ x_val = np.load(os.path.join(input_path, 'x_val.npy'))
+ y_val = np.load(os.path.join(input_path, 'y_val.npy'))
+
+ model = _build_model()
+
+ trained_model = _train_model(
+ model=model,
+ x_train=x_train,
+ y_train=y_train,
+ x_val=x_val,
+ y_val=y_val
+ )
+
+ trained_model.save(os.path.join(output_path, 'model.h5'))
+
+
+@click.command()
+@click.argument('input_path', type=click.Path(exists=True))
+@click.argument('output_path', type=click.Path())
+def main(input_path, output_path):
+ train(input_path, output_path)
+
+
+if __name__ == '__main__':
+ log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ logging.basicConfig(level=logging.INFO, format=log_fmt)
+
+ main()
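
For orientation, the shapes flowing through `_build_model()` with the defaults from `src/settings.py` (`MAXLEN = 7`, `HIDDEN_SIZE = 128`, `DIGITS = 3`, `len(CHARS) = 12`), plus a quick check (a sketch under those assumptions):

```
# input:                (batch, 7, 12)   one-hot query, e.g. ' 543+21'
# LSTM encoder:         (batch, 128)     final hidden state summarizes the query
# RepeatVector(4):      (batch, 4, 128)  one copy per output character
# LSTM decoder:         (batch, 4, 128)  return_sequences=True keeps every step
# TimeDistributed Dense + softmax: (batch, 4, 12) per-step character probabilities

from src.models.train_model import _build_model

model = _build_model()
print(model.output_shape)  # (None, 4, 12)
```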
diff --git a/src/settings.py b/src/settings.py
new file mode 100644
index 0000000..be7a99f
--- /dev/null
+++ b/src/settings.py
@@ -0,0 +1,21 @@
+# Parameters for the model and dataset.
+from keras import layers
+
+from src.character_encoder import CharacterTable
+
+TRAINING_SIZE = 50000
+DIGITS = 3
+REVERSE = True
+
+# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
+# int is DIGITS.
+MAXLEN = DIGITS + 1 + DIGITS
+
+RNN = layers.LSTM
+HIDDEN_SIZE = 128
+BATCH_SIZE = 128
+LAYERS = 1
+
+# All the numbers, plus sign and space for padding.
+CHARS = '0123456789+ '
+CTABLE = CharacterTable(CHARS)
diff --git a/src/visualization/.gitkeep b/src/visualization/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/src/visualization/visualize.py b/src/visualization/visualize.py
new file mode 100644
index 0000000..e69de29
diff --git a/test_environment.py b/test_environment.py
new file mode 100644
index 0000000..30361a1
--- /dev/null
+++ b/test_environment.py
@@ -0,0 +1,25 @@
+import sys
+
+REQUIRED_PYTHON = "python3"
+
+
+def main():
+ system_major = sys.version_info.major
+ if REQUIRED_PYTHON == "python2":
+ required_major = 2
+ elif REQUIRED_PYTHON == "python3":
+ required_major = 3
+ else:
+ raise ValueError("Unrecognized python interpreter: {}".format(
+ REQUIRED_PYTHON))
+
+ if system_major != required_major:
+ raise TypeError(
+ "This project requires Python {}. Found: Python {}".format(
+ required_major, sys.version))
+ else:
+ print(">>> Development environment passes all tests!")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..c32fbd8
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 79
+max-complexity = 10