Date: Tue, 24 Apr 2018 19:31:41 +0100
Subject: [PATCH] Deep Learning addition commands
+# Every command-style target is declared phony so that a file or directory of
+# the same name (for example ./data) can never make the target look up to date.
+.PHONY: clean create_environment data lint requirements test_environment train
+# Absolute directory that contains this Makefile (resolved once, at parse time).
+PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+PROFILE = default
+PROJECT_NAME = deep-learning-addition
+# Interpreter used by the recipes below. `?=` keeps any value that is already
+# set elsewhere in the file, in the environment or on the command line.
+PYTHON_INTERPRETER ?= python3
+# `requirements` runs its `test_environment` prerequisite first and then
+# installs the packages listed in requirements.txt with pip.
+# The `##` line below must stay directly above the rule header: it is the text
+# shown by the self-documenting help target.
+## Install Python Dependencies
+requirements: test_environment
+	pip install -r requirements.txt
+# `data` depends on `requirements`. The recipe sets PYTHONPATH to the current
+# directory for the invoked interpreter and passes ./data/processed/ as the
+# last argument.
+# NOTE(review): the interpreter is pointed at the directory src/data/ rather
+# than at a script file -- confirm the intended entry point.
+## Make Dataset
+data: requirements
+	PYTHONPATH='.' $(PYTHON_INTERPRETER) src/data/ ./data/processed/
+# `train` depends on `requirements`. It runs the training script with
+# ./data/processed/ as the input directory and ./models/ as the output
+# directory; PYTHONPATH='.' lets the script import the `src` package from the
+# project root. The script file is named explicitly because the interpreter
+# cannot execute the bare src/models/ directory.
+## Train Deep Learning Model
+train: requirements
+	PYTHONPATH='.' $(PYTHON_INTERPRETER) src/models/train_model.py ./data/processed/ ./models/
+## Delete all compiled Python files
+ find . -type f -name "*.py[co]" -delete
+ find . -type d -name "__pycache__" -delete
+## Lint using flake8
+ flake8 src
+## Set up python interpreter environment
+ @pip install -q virtualenv virtualenvwrapper
+ @echo ">>> Installing virtualenvwrapper if not already intalled.\nMake sure the following lines are in shell startup file\n\
+ export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/\n"
+ @bash -c "source `which`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
+ @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
+## Test python environment is setup correctly
+# Self Documenting Commands #
+.DEFAULT_GOAL := show-help
+# Inspired by
+# sed script explained:
+# /^##/:
+# * save line in hold space
+# * purge line
+# * Loop:
+# * append newline + line to hold space
+# * go to next line
+# * if line starts with doc comment, strip comment character off and loop
+# * remove target prerequisites
+# * append hold space (+ newline) to line
+# * replace newline plus comments by `---`
+# * print line
+# Separate expressions are necessary because labels cannot be delimited by
+# semicolon; see
+.PHONY: show-help
+ @echo "$$(tput bold)Available rules:$$(tput sgr0)"
+ @echo
+ @sed -n -e "/^## / { \
+ h; \
+ s/.*//; \
+ :doc" \
+ -e "H; \
+ n; \
+ s/^## //; \
+ t doc" \
+ -e "s/:.*//; \
+ G; \
+ s/\\n## /---/; \
+ s/\\n/ /g; \
+ p; \
+ | LC_ALL='C' sort --ignore-case \
+ | awk -F '---' \
+ -v ncol=$$(tput cols) \
+ -v indent=19 \
+ -v col_on="$$(tput setaf 6)" \
+ -v col_off="$$(tput sgr0)" \
+ '{ \
+ printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
+ n = split($$2, words, " "); \
+ line_length = ncol - indent; \
+ for (i = 1; i <= n; i++) { \
+ line_length -= length(words[i]) + 1; \
+ if (line_length <= 0) { \
+ line_length = ncol - indent - length(words[i]) - 1; \
+ printf "\n%*s ", -indent, " "; \
+ } \
+ printf "%s ", words[i]; \
+ } \
+ printf "\n"; \
+ }' \
+ | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
# deep-learning-addition
-An implementation of sequence to sequence learning for performing addition
+An LSTM model that learns to perform addition of integers. It's a simple Deep Learning calculator.
+## Development
+- Python 3.6 and 2.7
+- For Python 2.7, change the value of `REQUIRED_PYTHON` (in the environment check script) and of `PYTHON_INTERPRETER` (in the `Makefile`) to `python2`
+- awscli installed and configured
+## Commands
+clean Delete all compiled Python files
+create_environment Set up python interpreter environment
+data Make Dataset
+lint Lint using flake8
+requirements Install Python Dependencies
+test_environment Test python environment is setup correctly
+train Train Deep Learning Model
+import numpy as np
+class CharacterTable(object):
+    """Two-way mapping between a fixed alphabet and one hot rows.
+
+    Supports three operations:
+    + Encoding a string into a one hot matrix
+    + Decoding a one hot matrix back into a string
+    + Decoding a matrix of per-character probabilities into a string
+    """
+
+    def __init__(self, chars):
+        """Build the character <-> index lookup tables.
+
+        # Arguments
+            chars: Characters that can appear in the input. Duplicates are
+                dropped and the remaining characters are sorted.
+        """
+        self.chars = sorted(set(chars))
+        self.char_indices = {}
+        self.indices_char = {}
+        for index, char in enumerate(self.chars):
+            self.char_indices[char] = index
+            self.indices_char[index] = char
+
+    def encode(self, input_string, num_rows):
+        """Return the one hot encoding of input_string.
+
+        # Arguments
+            input_string: String to encode; one row is filled per character.
+            num_rows: Number of rows in the returned one hot encoding. This is
+                used to keep the # of rows for each data the same. Rows past
+                the end of input_string stay all zero.
+        """
+        one_hot = np.zeros((num_rows, len(self.chars)))
+        for row, char in enumerate(input_string):
+            one_hot[row, self.char_indices[char]] = 1
+        return one_hot
+
+    def decode(self, x, calc_argmax=True):
+        """Map rows of x back to characters and join them into a string.
+
+        # Arguments
+            x: One hot / probability matrix, or a sequence of character
+                indices when calc_argmax is False.
+            calc_argmax: When True, take the argmax over the last axis of x
+                to obtain the character indices first.
+        """
+        indices = x.argmax(axis=-1) if calc_argmax else x
+        return ''.join(self.indices_char[index] for index in indices)
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals, absolute_import
+import logging
+import os
+import click
+import numpy as np
+from src.models.train_model import CHARS
+logger = logging.getLogger(__name__)
+def _generate_data():
+    """Generate unique, padded addition questions and their answers.
+
+    Random operand pairs are drawn until TRAINING_SIZE distinct unordered
+    pairs have been collected. Each question 'a+b' is padded with spaces up to
+    MAXLEN and then reversed; each answer is padded with spaces up to
+    DIGITS + 1 characters.
+
+    :return: [tuple], (questions, expected) as two parallel lists of strings
+    """
+    def random_operand():
+        # Draw the number of digits first, then one digit character per
+        # position, and read the result back as an integer.
+        num_digits = np.random.randint(1, DIGITS + 1)
+        return int(''.join(np.random.choice(list('0123456789'))
+                           for _ in range(num_digits)))
+
+    questions, expected = [], []
+    seen = set()
+    print('Generating data...')
+    while len(questions) < TRAINING_SIZE:
+        a = random_operand()
+        b = random_operand()
+        # Sorting makes a+b and b+a share one key, so only the first of the
+        # two orderings (and no exact repeat) is kept.
+        key = tuple(sorted((a, b)))
+        if key not in seen:
+            seen.add(key)
+            question = '{}+{}'.format(a, b)
+            # Pad to MAXLEN, then reverse: '12+345 ' becomes ' 543+21', i.e.
+            # the padding ends up in front.
+            padded_question = question + ' ' * (MAXLEN - len(question))
+            answer = str(a + b)
+            # Answers can be of maximum size DIGITS + 1.
+            answer += ' ' * (DIGITS + 1 - len(answer))
+            questions.append(padded_question[::-1])
+            expected.append(answer)
+    print('Total addition questions:', len(questions))
+    return questions, expected
+def _vectorization(questions, expected):
+    """One hot encode questions and answers, shuffle them and split them.
+
+    :param questions: [list], query strings, each encoded into MAXLEN rows
+    :param expected: [list], answer strings, each encoded into DIGITS + 1 rows
+    :return: [tuple], (x_train, y_train, x_val, y_val) boolean arrays; the
+        last tenth of the shuffled samples forms the validation split
+    """
+    print('Vectorization...')
+    # The builtin `bool` replaces the `np.bool` alias, which NumPy deprecated
+    # in 1.20 and removed in 1.24; the resulting dtype is the same.
+    x = np.zeros((len(questions), MAXLEN, len(CHARS)), dtype=bool)
+    y = np.zeros((len(questions), DIGITS + 1, len(CHARS)), dtype=bool)
+    for i, sentence in enumerate(questions):
+        x[i] = CTABLE.encode(sentence, MAXLEN)
+    for i, sentence in enumerate(expected):
+        y[i] = CTABLE.encode(sentence, DIGITS + 1)
+
+    # Shuffle (x, y) in unison as the later parts of x will almost all be
+    # larger digits.
+    indices = np.arange(len(y))
+    np.random.shuffle(indices)
+    x = x[indices]
+    y = y[indices]
+
+    # Explicitly set apart 10% for validation data that we never train over.
+    split_at = len(x) - len(x) // 10
+    (x_train, x_val) = x[:split_at], x[split_at:]
+    (y_train, y_val) = y[:split_at], y[split_at:]
+
+    print('Training Data:')
+    print(x_train.shape)
+    print(y_train.shape)
+
+    print('Validation Data:')
+    print(x_val.shape)
+    print(y_val.shape)
+
+    return x_train, y_train, x_val, y_val
+# `main` is invoked without arguments from the `__main__` guard, so it has to
+# be a click command that reads `output_path` from the command line.
+@click.command()
+@click.argument('output_path', type=click.Path())
+def main(output_path):
+    """ Runs data processing scripts to save data in ../processed.
+    """
+    logger.info('making final data set')
+    questions, expected = _generate_data()
+    x_train, y_train, x_val, y_val = _vectorization(questions, expected)
+    # One .npy file per array, written into output_path.
+    np.save(os.path.join(output_path, 'x_train.npy'), x_train)
+    np.save(os.path.join(output_path, 'y_train.npy'), y_train)
+    np.save(os.path.join(output_path, 'x_val.npy'), x_val)
+    np.save(os.path.join(output_path, 'y_val.npy'), y_val)
+if __name__ == '__main__':
+ log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ logging.basicConfig(level=logging.INFO, format=log_fmt)
+ main()
+from src.settings import CTABLE, MAXLEN
+def encode_query(input_string):
+    """
+    Encode a query addition string
+    :param input_string: [str], input query string, i.e. '123+456'
+    :return: [numpy.array], one hot encoded query with a leading axis of
+        size 1, ready to be used in model.predict(...)
+    """
+    # Pad with spaces up to MAXLEN *before* reversing, exactly as the training
+    # data is built, so that the padding is encoded as the space character
+    # and ends up in front of the reversed query.
+    padded = input_string + ' ' * (MAXLEN - len(input_string))
+    output = CTABLE.encode(padded[::-1], MAXLEN)
+    return output.reshape((1, output.shape[0], output.shape[1]))
+def decode_prediction(input_array):
+    """
+    Turn a model prediction back into text
+    :param input_array: [numpy.array], model output; only its first entry
+        (index 0) is decoded
+    :return: [str], decoded string
+    """
+    first_prediction = input_array[0]
+    return CTABLE.decode(first_prediction)
+# -*- coding: utf-8 -*-
+from __future__ import print_function, absolute_import, unicode_literals
+import logging
+import os
+import click
+from keras.models import Sequential
+from keras import layers
+import numpy as np
+logger = logging.getLogger(__name__)
+class Colors:
+    """ANSI escape sequences used to colour terminal output."""
+    ok = '\033[92m'  # bright green foreground
+    fail = '\033[91m'  # bright red foreground
+    close = '\033[0m'  # reset all attributes
+def _build_model():
+    """Assemble, compile and summarise the sequence to sequence model.
+
+    The layer stack is: one encoder RNN, a RepeatVector, LAYERS decoder RNN
+    layers and a time distributed dense layer followed by a softmax. RNN,
+    HIDDEN_SIZE, MAXLEN, CHARS, DIGITS and LAYERS are defined outside this
+    function.
+
+    :return: the compiled keras Sequential model
+    """
+    print('Build model...')
+    model = Sequential()
+    # "Encode" the input sequence using an RNN, producing an output of
+    # HIDDEN_SIZE.
+    # Note: In a situation where your input sequences have a variable length,
+    # use input_shape=(None, num_feature).
+    model.add(RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(CHARS))))
+    # As the decoder RNN's input, repeatedly provide with the last hidden state
+    # of RNN for each time step. Repeat 'DIGITS + 1' times as that's the
+    # maximum length of output, e.g., when DIGITS=3, max output is
+    # 999+999=1998.
+    model.add(layers.RepeatVector(DIGITS + 1))
+    # The decoder RNN could be multiple layers stacked or a single layer.
+    for _ in range(LAYERS):
+        # By setting return_sequences to True, return not only the last output
+        # but all the outputs so far in the form of (num_samples, timesteps,
+        # output_dim). This is necessary as TimeDistributed in the below
+        # expects the first dimension to be the timesteps.
+        model.add(RNN(HIDDEN_SIZE, return_sequences=True))
+
+    # Apply a dense layer to the every temporal slice of an input. For each of
+    # step of the output sequence, decide which character should be chosen.
+    model.add(layers.TimeDistributed(layers.Dense(len(CHARS))))
+    model.add(layers.Activation('softmax'))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam',
+                  metrics=['accuracy'])
+    model.summary()
+    return model
+def _train_model(model, x_train, y_train, x_val, y_val):
+    """Fit the model one epoch at a time and print sample predictions.
+
+    :param model: compiled keras model to train
+    :param x_train: [numpy.array], encoded training questions
+    :param y_train: [numpy.array], encoded training answers
+    :param x_val: [numpy.array], encoded validation questions
+    :param y_val: [numpy.array], encoded validation answers
+    :return: the same model object after training
+    """
+    # Train the model each generation and show predictions against the
+    # validation dataset.
+    for iteration in range(1, 15):
+        print()
+        print('-' * 50)
+        print('Iteration', iteration)
+        model.fit(x_train, y_train,
+                  batch_size=BATCH_SIZE,
+                  epochs=1,
+                  validation_data=(x_val, y_val))
+        # Select 10 samples from the validation set at random so we can
+        # visualize errors.
+        for _ in range(10):
+            ind = np.random.randint(0, len(x_val))
+            rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
+            # argmax over the class axis yields the predicted character index
+            # per output position; this is what `predict_classes` returned,
+            # but it also works on Keras versions that no longer ship it.
+            preds = np.argmax(model.predict(rowx, verbose=0), axis=-1)
+            query = CTABLE.decode(rowx[0])
+            correct = CTABLE.decode(rowy[0])
+            guess = CTABLE.decode(preds[0], calc_argmax=False)
+            print('Q', query[::-1] if REVERSE else query, end=' ')
+            print('T', correct, end=' ')
+            if correct == guess:
+                print(Colors.ok + '☑' + Colors.close, end=' ')
+            else:
+                print(Colors.fail + '☒' + Colors.close, end=' ')
+            print(guess)
+    return model
+def train(input_path, output_path):
+    """Load the prepared arrays, train a fresh model and save it.
+
+    :param input_path: [str], directory containing x_train.npy, y_train.npy,
+        x_val.npy and y_val.npy
+    :param output_path: [str], directory in which model.h5 is written
+    """
+    x_train = np.load(os.path.join(input_path, 'x_train.npy'))
+    y_train = np.load(os.path.join(input_path, 'y_train.npy'))
+    x_val = np.load(os.path.join(input_path, 'x_val.npy'))
+    y_val = np.load(os.path.join(input_path, 'y_val.npy'))
+
+    model = _build_model()
+    trained_model = _train_model(
+        model=model,
+        x_train=x_train,
+        y_train=y_train,
+        x_val=x_val,
+        y_val=y_val
+    )
+    trained_model.save(os.path.join(output_path, 'model.h5'))
+# `main` is invoked without arguments from the `__main__` guard, so it has to
+# be a click command that reads both paths from the command line. `input_path`
+# must already exist; `output_path` is not checked here.
+@click.command()
+@click.argument('input_path', type=click.Path(exists=True))
+@click.argument('output_path', type=click.Path())
+def main(input_path, output_path):
+    train(input_path, output_path)
+if __name__ == '__main__':
+ log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ logging.basicConfig(level=logging.INFO, format=log_fmt)
+ main()
+# Parameters for the model and dataset.
+from keras import layers
+from src.character_encoder import CharacterTable
+# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
+# int is DIGITS.
+RNN = layers.LSTM
+# All the numbers, plus sign and space for padding.
+CHARS = '0123456789+ '
+CTABLE = CharacterTable(CHARS)
+import sys
+REQUIRED_PYTHON = "python3"
+def main():
+    """Check that the running interpreter matches REQUIRED_PYTHON.
+
+    Prints a confirmation message when the major versions agree.
+
+    :raises ValueError: if REQUIRED_PYTHON is neither "python2" nor "python3"
+    :raises TypeError: if the running major version differs from the
+        required one
+    """
+    system_major = sys.version_info.major
+    if REQUIRED_PYTHON == "python2":
+        required_major = 2
+    elif REQUIRED_PYTHON == "python3":
+        required_major = 3
+    else:
+        raise ValueError("Unrecognized python interpreter: {}".format(
+            REQUIRED_PYTHON))
+
+    if system_major != required_major:
+        raise TypeError(
+            "This project requires Python {}. Found: Python {}".format(
+                required_major, sys.version))
+    else:
+        print(">>> Development environment passes all tests!")
+if __name__ == '__main__':
+ main()
+max-line-length = 79
+max-complexity = 10