Skip to content

Commit

Permalink
Add recipes2vec
Browse files Browse the repository at this point in the history
  • Loading branch information
simsicon committed Nov 22, 2016
1 parent 71af0d2 commit 7574b74
Showing 1 changed file with 177 additions and 15 deletions.
192 changes: 177 additions & 15 deletions ingredients.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,12 @@ class Parser:
def __init__(self, batch_size, vocabulary_size):
    # Path to the newline-delimited JSON training data (one recipe per
    # line), taken from the --train_data flag.
    self.filepath = FLAGS.train_data
    # Batch cursors: `cursor` walks the ingredient pair list,
    # `recipe_cursor` walks the (recipe, ingredient) pair list.
    self.cursor = 0
    self.recipe_cursor = 0
    self.batch_size = batch_size
    self.vocabulary_size = vocabulary_size
    # Eagerly load the data file and build both pair lists.
    self.extract()

def extract(self):
with open(self.filepath, "r") as f:
lines = f.readlines()
self.raw_data = [json.loads(line) for line in lines]


def extract_ingredient_pairs(self):
self.ingredient_pairs = []
_all_ingredients = []

Expand Down Expand Up @@ -63,10 +60,43 @@ def extract(self):

self.ingredient_pairs.append( (_ctx_index, _trgt_index) )

self.pairs_length = len(self.ingredient_pairs)
self.ingredient_pairs_length = len(self.ingredient_pairs)

def extract_recipe_pairs(self):
_all_recipe_names = [recipe['name'] for recipe in self.raw_data]
self.recipes_counter = collections.Counter(_all_recipe_names)
self.recipe_dictionary = dict()

for recipe_name, _ in self.recipes_counter.most_common():
self.recipe_dictionary[recipe_name] = len(self.recipe_dictionary)

self.reversed_recipe_dictionary = dict(zip(self.recipe_dictionary.values(), self.recipe_dictionary.keys()))
self.recipe_ingredient_pairs = list()

for recipe in self.raw_data:
_ingredients = recipe['ingredients']
_recipe_name = recipe['name']
_recipe_idx = self.recipe_dictionary[_recipe_name]
for _ingredient, _ in _ingredients:
if _ingredient in self.dictionary:
_ingredient_idx = self.dictionary[_ingredient]
else:
_ingredient_idx = 0

self.recipe_ingredient_pairs.append( (_recipe_idx, _ingredient_idx) )

self.recipe_ingredient_pairs_length = len(self.recipe_ingredient_pairs)

def extract(self):
    """Parse the newline-delimited JSON training file into self.raw_data,
    then derive both training-pair lists from it."""
    with open(self.filepath, "r") as source:
        self.raw_data = [json.loads(record) for record in source]

    self.extract_ingredient_pairs()
    self.extract_recipe_pairs()

def generate_ingredients_batch(self):
next_cursor = (self.cursor + self.batch_size) % self.pairs_length
next_cursor = (self.cursor + self.batch_size) % self.ingredient_pairs_length
if next_cursor < self.cursor:
batch_pairs = self.ingredient_pairs[self.cursor:] + self.ingredient_pairs[:next_cursor]
else:
Expand All @@ -82,14 +112,28 @@ def generate_ingredients_batch(self):
return batch_data, batch_labels

def generate_recipes_batch(self):
pass
next_recipe_cursor = (self.recipe_cursor + self.batch_size) % self.recipe_ingredient_pairs_length

if next_recipe_cursor < self.recipe_cursor:
batch_pairs = self.recipe_ingredient_pairs[self.recipe_cursor:] + self.recipe_ingredient_pairs[:next_recipe_cursor]
else:
batch_pairs = self.recipe_ingredient_pairs[self.recipe_cursor:next_recipe_cursor]

batch_data = [pair[0] for pair in batch_pairs]
batch_labels = [pair[1] for pair in batch_pairs]
try:
batch_labels = np.reshape(batch_labels, (self.batch_size, 1))
except:
pdb.set_trace()
self.recipe_cursor = next_recipe_cursor
return batch_data, batch_labels

class Ingredient2Vec:
def __init__(self):
    # Fixed hyperparameters for the ingredient embedding model.
    self.batch_size = 128
    self.vocabulary_size = 1000
    self.emb_dim = 256
    self.num_steps = 2000000
    # Parser loads the training data and serves batches during training.
    self.parser = Parser(self.batch_size, self.vocabulary_size)
    self.build_graph()

Expand Down Expand Up @@ -124,9 +168,9 @@ def build_graph(self):
self.normalized_embeddings = embeddings / norm

def restore(self):
ckpt_dir = "checkpoints/"
ckpt_filename = "bigrams2vec.ckpt"
ckpt_dir = "checkpoints/ingredients/"
ckpt_filename = "ingredients2vec.ckpt"

with tf.Session(graph=self.graph) as sess:
init_op = tf.initialize_all_variables()
sess.run(init_op)
Expand All @@ -138,8 +182,8 @@ def restore(self):
self.final_embeddings = self.normalized_embeddings.eval()

def train(self):
ckpt_dir = "checkpoints/"
ckpt_filename = "bigrams2vec.ckpt"
ckpt_dir = "checkpoints/ingredients/"
ckpt_filename = "ingredients2vec.ckpt"
with tf.Session(graph=self.graph) as sess:
init_op = tf.initialize_all_variables()
sess.run(init_op)
Expand Down Expand Up @@ -196,9 +240,123 @@ def plot(self):
pylab.savefig('ingredients2vec.png')

class Recipe2Vec:
    """Learns dense recipe embeddings via sampled-softmax training on the
    (recipe_index, ingredient_index) pairs built by
    Parser.extract_recipe_pairs.

    Structurally mirrors Ingredient2Vec: build_graph defines the TF
    graph, train/restore run or reload it, plot projects the learned
    embeddings with t-SNE.
    """

    def __init__(self, parser):
        self.batch_size = 128
        # NOTE(review): this is the number of (recipe, ingredient) PAIRS,
        # not the number of distinct recipes (len(parser.recipe_dictionary));
        # embedding rows beyond the distinct-recipe count are never looked
        # up -- confirm whether this oversizing is intentional.
        self.vocabulary_size = parser.recipe_ingredient_pairs_length
        self.emb_dim = 256
        self.num_steps = 2000000
        # Shared parser instance; it already holds the extracted pair lists.
        self.parser = parser
        self.build_graph()

    def build_graph(self):
        """Define the sampled-softmax embedding graph for recipes."""
        self.graph = tf.Graph()
        batch_size, voc_size, emb_dim = self.batch_size, self.vocabulary_size, self.emb_dim
        # Negative classes sampled per batch by sampled_softmax_loss.
        num_sampled = 128

        with self.graph.as_default():
            with tf.variable_scope("recipes"):
                # Inputs are recipe indices; labels are the ingredient
                # index each recipe should predict.
                self.train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
                self.train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

                with tf.variable_scope("embeddings"):
                    embeddings = tf.get_variable("table", [voc_size, emb_dim],
                        initializer=tf.random_uniform_initializer(-1.0, 1.0))

                with tf.variable_scope("softmax"):
                    _initializer = tf.truncated_normal_initializer(stddev=1.0/np.sqrt(emb_dim))
                    weights = tf.get_variable("weights", [voc_size, emb_dim], initializer=_initializer)
                    # NOTE(review): passing a shape to tf.zeros_initializer
                    # matches the pre-1.0 TF API; on TF 1.x+ this signature
                    # changed -- verify against the pinned TF version.
                    biases = tf.get_variable("biases", initializer=tf.zeros_initializer(voc_size))

                emb = tf.nn.embedding_lookup(embeddings, self.train_dataset)

                self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights, biases, emb, self.train_labels, num_sampled, voc_size))
                self.global_step = tf.Variable(0, trainable=False)
                # Start at lr=10.0, decaying 5% every 10k steps (Adagrad
                # further scales per-parameter updates down over time).
                self.learning_rate = tf.train.exponential_decay(10.0, self.global_step, 10000, 0.95, staircase=True)
                self.optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

                # L2-normalized embeddings for similarity use after training.
                norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
                self.normalized_embeddings = embeddings / norm

    def restore(self):
        """Rebuild variables, load the latest recipes checkpoint (if any),
        and materialize self.final_embeddings."""
        ckpt_dir = "checkpoints/recipes/"
        ckpt_filename = "recipes2vec.ckpt"

        with tf.Session(graph=self.graph) as sess:
            init_op = tf.initialize_all_variables()
            sess.run(init_op)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            saver = tf.train.Saver()
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            self.final_embeddings = self.normalized_embeddings.eval()

    def train(self):
        """Run (or resume) training, checkpointing every 100k steps, and
        store the final normalized embeddings on self.final_embeddings."""
        ckpt_dir = "checkpoints/recipes/"
        ckpt_filename = "recipes2vec.ckpt"
        with tf.Session(graph=self.graph) as sess:
            init_op = tf.initialize_all_variables()
            sess.run(init_op)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            saver = tf.train.Saver()

            def train_loop(start_at=0):
                # Inner loop so a restored run can resume from its saved
                # global step instead of step 0.
                average_loss = 0
                for step in range(start_at, self.num_steps, 1):
                    batch_data, batch_labels = self.parser.generate_recipes_batch()
                    feed_dict = {self.train_dataset : batch_data, self.train_labels : batch_labels}
                    _, _l, _lr = sess.run([self.optimizer, self.loss, self.learning_rate], feed_dict=feed_dict)
                    average_loss += _l

                    # Report a 1000-step moving average of the loss.
                    if step % 1000 == 0:
                        if step > 0:
                            average_loss = average_loss / 1000
                        print('Average loss at step %d: %f with learning rate %f' %
                            (step, average_loss, _lr))
                        average_loss = 0

                    # Periodic checkpoint (skipped at step 0).
                    if step % 100000 == 0:
                        if step > 0:
                            save_path = saver.save(sess, ckpt_dir + ckpt_filename,
                                global_step=self.global_step)
                            print "Recipes2Vec Model saved in file: %s" % save_path

            if ckpt and ckpt.model_checkpoint_path:
                # Resume from the checkpointed global step when one exists.
                saver.restore(sess, ckpt.model_checkpoint_path)
                print "Recipes2Vec Model load from file: %s" % ckpt.model_checkpoint_path
                current_step = self.global_step.eval()
                if current_step < self.num_steps:
                    train_loop(start_at=current_step)
            else:
                train_loop()

            save_path = saver.save(sess, ckpt_dir + ckpt_filename)
            print "Recipes2Vec Model saved in file: %s" % save_path

            self.final_embeddings = self.normalized_embeddings.eval()

    def plot(self):
        """Project the first embeddings with t-SNE and save a labelled
        scatter plot to recipes2vec.png."""
        # CJK-capable font so non-Latin recipe-name labels render.
        fm = font_manager.FontProperties(fname='/usr/share/fonts/truetype/wqy/wqy-microhei.ttc')

        num_points = 500
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=10000)
        # NOTE(review): starts at index 1, mirroring the ingredient plot;
        # recipe index 0 (the most common recipe) is never plotted --
        # confirm this is intended.
        two_d_embeddings = tsne.fit_transform(self.final_embeddings[1:num_points, :])
        labels = [self.parser.reversed_recipe_dictionary[i] for i in range(1, num_points)]
        pylab.figure(figsize=(15,15))
        for i, label in enumerate(labels):
            x, y = two_d_embeddings[i, :]
            pylab.scatter(x, y)
            pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', fontproperties=fm)
        pylab.savefig('recipes2vec.png')

class Ingredients2Recipes:
    """Placeholder for a future ingredients-to-recipes model; not yet
    implemented."""
    def __init__(self):
        pass





def main(_):
if not FLAGS.train_data:
Expand All @@ -208,6 +366,10 @@ def main(_):
i2v = Ingredient2Vec()
i2v.train()
i2v.plot()

r2v = Recipe2Vec(i2v.parser)
r2v.train()
r2v.plot()

if __name__ == "__main__":
tf.app.run()

0 comments on commit 7574b74

Please sign in to comment.