Commit

simsicon committed May 11, 2017
1 parent 7574b74 commit f6b93b7
Showing 6 changed files with 482 additions and 2 deletions.
19 changes: 17 additions & 2 deletions ingredients.py
@@ -351,11 +351,26 @@ def plot(self):
        pylab.savefig('recipes2vec.png')

class Ingredients2Recipes:

    # Special vocabulary symbols and their fixed ids (PAD/GO/EOS/UNK).
    _PAD = b"_PAD"
    _GO = b"_GO"
    _EOS = b"_EOS"
    _UNK = b"_UNK"
    _START_VOCAB = [_PAD, _GO, _EOS, _UNK]

    PAD_ID = 0
    GO_ID = 1
    EOS_ID = 2
    UNK_ID = 3

    def __init__(self):
        self.build_graph()

    def build_graph(self):
        pass

    def train(self):
        pass


def main(_):
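Aside (not part of the diff itself): a minimal sketch of how the PAD_ID / GO_ID / EOS_ID / UNK_ID constants defined in Ingredients2Recipes are conventionally used to turn variable-length token id lists into fixed-length encoder/decoder sequences for a seq2seq model. The helper name pad_pair, the pad_length, and the example token ids are assumptions for illustration only.

# Illustration only: bracket a target sequence with GO/EOS and pad both
# sequences to a fixed length, mirroring the ids defined above.
PAD_ID, GO_ID, EOS_ID, UNK_ID = 0, 1, 2, 3

def pad_pair(encoder_ids, decoder_ids, pad_length=10):
    # encoder input: reversed and padded (a common seq2seq convention)
    enc = list(reversed(encoder_ids)) + [PAD_ID] * (pad_length - len(encoder_ids))
    # decoder input: GO + tokens + EOS, then padding
    dec = [GO_ID] + decoder_ids + [EOS_ID]
    dec += [PAD_ID] * (pad_length - len(dec))
    return enc, dec

# hypothetical ingredient ids -> recipe-name token ids
print(pad_pair([7, 42, 5], [13, 8]))
# -> ([5, 42, 7, 0, 0, 0, 0, 0, 0, 0], [1, 13, 8, 2, 0, 0, 0, 0, 0, 0])
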
104 changes: 104 additions & 0 deletions ingredients2recipes.py
@@ -0,0 +1,104 @@
import json
import numpy as np
import collections
import tensorflow as tf

from tensorflow.models.rnn.translate import seq2seq_model

import pdb

tf.app.flags.DEFINE_integer("ingredients_vocab_size", 1000, "Ingredients vocabulary size.")
tf.app.flags.DEFINE_integer("recipes_vocab_size", 1000, "Recipes vocabulary size.")
tf.app.flags.DEFINE_string("checkpoints_dir", "checkpoints/ingredients2recipes/", "Checkpoints dir")
tf.app.flags.DEFINE_boolean("decode", False, "Set to True for interactive decoding.")

FLAGS = tf.app.flags.FLAGS

class Parser():
    def __init__(self, file_path):
        self.file_path = file_path
        self.ingredient_size = 1000
        self.extract()

    def extract(self):
        with open(self.file_path, "r") as f:
            lines = f.readlines()
            self.raw_data = [json.loads(line) for line in lines]

        self.all_ingredients = []
        self.all_recipes = []
        for recipe in self.raw_data:
            self.all_ingredients.extend([i[0] for i in recipe["ingredients"]])
            self.all_recipes.append(recipe["name"])  # one entry per recipe name

        self.recipes_size = len(self.all_recipes)

        self.ingredients_counter = collections.Counter(self.all_ingredients)

        ingredients_count = [['UNK', -1]]
        ingredients_count.extend(self.ingredients_counter.most_common(self.ingredient_size - 1))

        # Map each kept ingredient to an integer id; id 0 is reserved for UNK.
        self.ingredient_dict = dict()
        for _ingredient, _ in ingredients_count:
            self.ingredient_dict[_ingredient] = len(self.ingredient_dict)

        self.reversed_dictionary = dict(zip(self.ingredient_dict.values(), self.ingredient_dict.keys()))

    def generate_batch(self, batch_size=64):
        recipes = np.random.choice(self.raw_data, batch_size, replace=False)
        input_data = []
        output_data = []
        for recipe in recipes:
            output_data.append(recipe["name"])
            _group = []
            for ingredient, _ in recipe["ingredients"]:
                if ingredient in self.ingredient_dict:
                    _group.append(self.ingredient_dict[ingredient])
                else:
                    _group.append(0)  # unknown ingredient -> UNK id
            input_data.append(_group)
        return input_data, output_data


class Engine():
    def __init__(self):
        self.parser = Parser("data/sitemap.json")
        self.batch_size = 64
        self.size = 256
        self.num_layers = 3
        self.num_encoder_symbols = 1000
        self.num_decoder_symbols = 1000
        self.embedding_size = 200

    def build_model(self):
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None], name="encoder")
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None], name="decoder")
        self.target_weights = tf.placeholder(tf.float32, shape=[None], name="weight")

        single_cell = tf.nn.rnn_cell.GRUCell(self.size)
        cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * self.num_layers)

        return tf.nn.seq2seq.embedding_rnn_seq2seq(self.encoder_inputs,
                                                   self.decoder_inputs,
                                                   cell,
                                                   self.num_encoder_symbols,
                                                   self.num_decoder_symbols,
                                                   self.embedding_size)

    def train(self):
        with tf.Session() as sess:
            outputs, states = self.build_model()




def main(_):
    engine = Engine()

    if FLAGS.decode:
        engine.decode()
    else:
        engine.train()

if __name__ == "__main__":
    tf.app.run()
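
For context, a minimal usage sketch of the Parser class above. It assumes data/sitemap.json exists in the one-JSON-object-per-line format the code expects; the batch size and the printed fields are illustrative only.

# Illustration only: inspect one training batch produced by Parser.
parser = Parser("data/sitemap.json")
inputs, outputs = parser.generate_batch(batch_size=4)

for ingredient_ids, recipe_name in zip(inputs, outputs):
    # ingredient_ids is a list of vocabulary ids (0 = UNK);
    # recipe_name is the raw recipe title used as the decoder target.
    print(ingredient_ids, "->", recipe_name)
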
104 changes: 104 additions & 0 deletions lstm_text_generation.py
@@ -0,0 +1,104 @@
'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

path = "/data/ptb/train.txt"
text = open(path).read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# train the model, output generated text after each iteration
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, nb_epoch=1)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
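
To make the role of the diversity (temperature) value passed to sample() concrete, a small standalone sketch with a made-up four-character probability vector: dividing the log-probabilities by the temperature and renormalising sharpens the distribution when the temperature is below 1 and flattens it when it is above 1.

import numpy as np

# Illustration only: how temperature reshapes a toy next-char distribution
# before sampling, using the same transform as sample() above.
preds = np.array([0.6, 0.25, 0.1, 0.05])

for temperature in [0.2, 0.5, 1.0, 1.2]:
    logits = np.log(preds) / temperature
    scaled = np.exp(logits) / np.sum(np.exp(logits))
    print(temperature, np.round(scaled, 3))
# At 0.2 nearly all mass sits on the most likely character;
# at 1.2 the distribution is slightly flatter than the original.
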
27 changes: 27 additions & 0 deletions prepare.py
@@ -0,0 +1,27 @@
import json
import codecs

file_path = "data/sitemap.json"

with open(file_path, "r") as f:
    lines = f.readlines()
    raw_data = [json.loads(line) for line in lines]

ingredients_file_path = "data/ingredients"
recipes_file_path = "data/recipes"

ingredients_file = codecs.open(ingredients_file_path, "w", "utf-8-sig")
recipes_file = codecs.open(recipes_file_path, "w", "utf-8-sig")

for recipe in raw_data:
    recipe_name = recipe["name"] + "\n"
    ingredient_names = " ".join([i[0] for i in recipe["ingredients"]]) + "\n"

    ingredients_file.write(ingredient_names)
    recipes_file.write(recipe_name)

ingredients_file.flush()
ingredients_file.close()

recipes_file.flush()
recipes_file.close()
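
For reference, a hypothetical example of the per-line JSON structure that both prepare.py and the Parser class assume. Only a "name" string and an "ingredients" list whose items start with the ingredient name are implied by the code; the field values and the quantity element here are invented for illustration.

# Illustration only: one line of data/sitemap.json as the scripts expect it,
# plus the corresponding lines written to data/recipes and data/ingredients.
import json

line = json.dumps({
    "name": "tomato egg stir fry",                    # hypothetical recipe title
    "ingredients": [["tomato", "2"], ["egg", "3"]],   # [ingredient name, quantity]
})

recipe = json.loads(line)
print(recipe["name"])                                 # -> data/recipes line
print(" ".join(i[0] for i in recipe["ingredients"]))  # -> data/ingredients line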