Skip to content

Commit

Permalink
Add recipes2vec
Browse files Browse the repository at this point in the history
  • Loading branch information
simsicon committed Nov 22, 2016
1 parent 71af0d2 commit 7574b74
Showing 1 changed file with 177 additions and 15 deletions.
192 changes: 177 additions & 15 deletions ingredients.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,12 @@ class Parser:
def __init__(self, batch_size, vocabulary_size):
    # Path to the newline-delimited JSON training data (one recipe per
    # line), taken from the --train_data flag.
    self.filepath = FLAGS.train_data
    # Batch cursors: `cursor` walks the ingredient pair list,
    # `recipe_cursor` walks the (recipe, ingredient) pair list.
    self.cursor = 0
    self.recipe_cursor = 0
    self.batch_size = batch_size
    self.vocabulary_size = vocabulary_size
    # Eagerly load the data file and build both pair lists.
    self.extract()

def extract(self):
with open(self.filepath, "r") as f:
lines = f.readlines()
self.raw_data = [json.loads(line) for line in lines]


def extract_ingredient_pairs(self):
self.ingredient_pairs = []
_all_ingredients = []

Expand Down Expand Up @@ -63,10 +60,43 @@ def extract(self):

self.ingredient_pairs.append( (_ctx_index, _trgt_index) )

self.pairs_length = len(self.ingredient_pairs)
self.ingredient_pairs_length = len(self.ingredient_pairs)

def extract_recipe_pairs(self):
_all_recipe_names = [recipe['name'] for recipe in self.raw_data]
self.recipes_counter = collections.Counter(_all_recipe_names)
self.recipe_dictionary = dict()

for recipe_name, _ in self.recipes_counter.most_common():
self.recipe_dictionary[recipe_name] = len(self.recipe_dictionary)

self.reversed_recipe_dictionary = dict(zip(self.recipe_dictionary.values(), self.recipe_dictionary.keys()))
self.recipe_ingredient_pairs = list()

for recipe in self.raw_data:
_ingredients = recipe['ingredients']
_recipe_name = recipe['name']
_recipe_idx = self.recipe_dictionary[_recipe_name]
for _ingredient, _ in _ingredients:
if _ingredient in self.dictionary:
_ingredient_idx = self.dictionary[_ingredient]
else:
_ingredient_idx = 0

self.recipe_ingredient_pairs.append( (_recipe_idx, _ingredient_idx) )

self.recipe_ingredient_pairs_length = len(self.recipe_ingredient_pairs)

def extract(self):
    """Parse the newline-delimited JSON training file into self.raw_data,
    then derive both training-pair lists from it."""
    with open(self.filepath, "r") as source:
        self.raw_data = [json.loads(record) for record in source]

    self.extract_ingredient_pairs()
    self.extract_recipe_pairs()

def generate_ingredients_batch(self):
next_cursor = (self.cursor + self.batch_size) % self.pairs_length
next_cursor = (self.cursor + self.batch_size) % self.ingredient_pairs_length
if next_cursor < self.cursor:
batch_pairs = self.ingredient_pairs[self.cursor:] + self.ingredient_pairs[:next_cursor]
else:
Expand All @@ -82,14 +112,28 @@ def generate_ingredients_batch(self):
return batch_data, batch_labels

def generate_recipes_batch(self):
pass
next_recipe_cursor = (self.recipe_cursor + self.batch_size) % self.recipe_ingredient_pairs_length

if next_recipe_cursor < self.recipe_cursor:
batch_pairs = self.recipe_ingredient_pairs[self.recipe_cursor:] + self.recipe_ingredient_pairs[:next_recipe_cursor]
else:
batch_pairs = self.recipe_ingredient_pairs[self.recipe_cursor:next_recipe_cursor]

batch_data = [pair[0] for pair in batch_pairs]
batch_labels = [pair[1] for pair in batch_pairs]
try:
batch_labels = np.reshape(batch_labels, (self.batch_size, 1))
except:
pdb.set_trace()
self.recipe_cursor = next_recipe_cursor
return batch_data, batch_labels

class Ingredient2Vec:
def __init__(self):
    # Fixed hyperparameters for the ingredient embedding model.
    self.batch_size = 128
    self.vocabulary_size = 1000
    self.emb_dim = 256
    self.num_steps = 2000000
    # Parser loads the training data and serves batches during training.
    self.parser = Parser(self.batch_size, self.vocabulary_size)
    self.build_graph()

Expand Down Expand Up @@ -124,9 +168,9 @@ def build_graph(self):
self.normalized_embeddings = embeddings / norm

def restore(self):
ckpt_dir = "checkpoints/"
ckpt_filename = "bigrams2vec.ckpt"
ckpt_dir = "checkpoints/ingredients/"
ckpt_filename = "ingredients2vec.ckpt"

with tf.Session(graph=self.graph) as sess:
init_op = tf.initialize_all_variables()
sess.run(init_op)
Expand All @@ -138,8 +182,8 @@ def restore(self):
self.final_embeddings = self.normalized_embeddings.eval()

def train(self):
ckpt_dir = "checkpoints/"
ckpt_filename = "bigrams2vec.ckpt"
ckpt_dir = "checkpoints/ingredients/"
ckpt_filename = "ingredients2vec.ckpt"
with tf.Session(graph=self.graph) as sess:
init_op = tf.initialize_all_variables()
sess.run(init_op)
Expand Down Expand Up @@ -196,9 +240,123 @@ def plot(self):
pylab.savefig('ingredients2vec.png')

class Recipe2Vec:
    """Learns dense recipe embeddings via sampled-softmax training on the
    (recipe_index, ingredient_index) pairs built by
    Parser.extract_recipe_pairs.

    Structurally mirrors Ingredient2Vec: build_graph defines the TF
    graph, train/restore run or reload it, plot projects the learned
    embeddings with t-SNE.
    """

    def __init__(self, parser):
        self.batch_size = 128
        # NOTE(review): this is the number of (recipe, ingredient) PAIRS,
        # not the number of distinct recipes (len(parser.recipe_dictionary));
        # embedding rows beyond the distinct-recipe count are never looked
        # up -- confirm whether this oversizing is intentional.
        self.vocabulary_size = parser.recipe_ingredient_pairs_length
        self.emb_dim = 256
        self.num_steps = 2000000
        # Shared parser instance; it already holds the extracted pair lists.
        self.parser = parser
        self.build_graph()

    def build_graph(self):
        """Define the sampled-softmax embedding graph for recipes."""
        self.graph = tf.Graph()
        batch_size, voc_size, emb_dim = self.batch_size, self.vocabulary_size, self.emb_dim
        # Negative classes sampled per batch by sampled_softmax_loss.
        num_sampled = 128

        with self.graph.as_default():
            with tf.variable_scope("recipes"):
                # Inputs are recipe indices; labels are the ingredient
                # index each recipe should predict.
                self.train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
                self.train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

                with tf.variable_scope("embeddings"):
                    embeddings = tf.get_variable("table", [voc_size, emb_dim],
                        initializer=tf.random_uniform_initializer(-1.0, 1.0))

                with tf.variable_scope("softmax"):
                    _initializer = tf.truncated_normal_initializer(stddev=1.0/np.sqrt(emb_dim))
                    weights = tf.get_variable("weights", [voc_size, emb_dim], initializer=_initializer)
                    # NOTE(review): passing a shape to tf.zeros_initializer
                    # matches the pre-1.0 TF API; on TF 1.x+ this signature
                    # changed -- verify against the pinned TF version.
                    biases = tf.get_variable("biases", initializer=tf.zeros_initializer(voc_size))

                emb = tf.nn.embedding_lookup(embeddings, self.train_dataset)

                self.loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
                    weights, biases, emb, self.train_labels, num_sampled, voc_size))
                self.global_step = tf.Variable(0, trainable=False)
                # Start at lr=10.0, decaying 5% every 10k steps (Adagrad
                # further scales per-parameter updates down over time).
                self.learning_rate = tf.train.exponential_decay(10.0, self.global_step, 10000, 0.95, staircase=True)
                self.optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

                # L2-normalized embeddings for similarity use after training.
                norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
                self.normalized_embeddings = embeddings / norm

    def restore(self):
        """Rebuild variables, load the latest recipes checkpoint (if any),
        and materialize self.final_embeddings."""
        ckpt_dir = "checkpoints/recipes/"
        ckpt_filename = "recipes2vec.ckpt"

        with tf.Session(graph=self.graph) as sess:
            init_op = tf.initialize_all_variables()
            sess.run(init_op)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            saver = tf.train.Saver()
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            self.final_embeddings = self.normalized_embeddings.eval()

    def train(self):
        """Run (or resume) training, checkpointing every 100k steps, and
        store the final normalized embeddings on self.final_embeddings."""
        ckpt_dir = "checkpoints/recipes/"
        ckpt_filename = "recipes2vec.ckpt"
        with tf.Session(graph=self.graph) as sess:
            init_op = tf.initialize_all_variables()
            sess.run(init_op)
            ckpt = tf.train.get_checkpoint_state(ckpt_dir)
            saver = tf.train.Saver()

            def train_loop(start_at=0):
                # Inner loop so a restored run can resume from its saved
                # global step instead of step 0.
                average_loss = 0
                for step in range(start_at, self.num_steps, 1):
                    batch_data, batch_labels = self.parser.generate_recipes_batch()
                    feed_dict = {self.train_dataset : batch_data, self.train_labels : batch_labels}
                    _, _l, _lr = sess.run([self.optimizer, self.loss, self.learning_rate], feed_dict=feed_dict)
                    average_loss += _l

                    # Report a 1000-step moving average of the loss.
                    if step % 1000 == 0:
                        if step > 0:
                            average_loss = average_loss / 1000
                        print('Average loss at step %d: %f with learning rate %f' %
                            (step, average_loss, _lr))
                        average_loss = 0

                    # Periodic checkpoint (skipped at step 0).
                    if step % 100000 == 0:
                        if step > 0:
                            save_path = saver.save(sess, ckpt_dir + ckpt_filename,
                                global_step=self.global_step)
                            print "Recipes2Vec Model saved in file: %s" % save_path

            if ckpt and ckpt.model_checkpoint_path:
                # Resume from the checkpointed global step when one exists.
                saver.restore(sess, ckpt.model_checkpoint_path)
                print "Recipes2Vec Model load from file: %s" % ckpt.model_checkpoint_path
                current_step = self.global_step.eval()
                if current_step < self.num_steps:
                    train_loop(start_at=current_step)
            else:
                train_loop()

            save_path = saver.save(sess, ckpt_dir + ckpt_filename)
            print "Recipes2Vec Model saved in file: %s" % save_path

            self.final_embeddings = self.normalized_embeddings.eval()

    def plot(self):
        """Project the first embeddings with t-SNE and save a labelled
        scatter plot to recipes2vec.png."""
        # CJK-capable font so non-Latin recipe-name labels render.
        fm = font_manager.FontProperties(fname='/usr/share/fonts/truetype/wqy/wqy-microhei.ttc')

        num_points = 500
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=10000)
        # NOTE(review): starts at index 1, mirroring the ingredient plot;
        # recipe index 0 (the most common recipe) is never plotted --
        # confirm this is intended.
        two_d_embeddings = tsne.fit_transform(self.final_embeddings[1:num_points, :])
        labels = [self.parser.reversed_recipe_dictionary[i] for i in range(1, num_points)]
        pylab.figure(figsize=(15,15))
        for i, label in enumerate(labels):
            x, y = two_d_embeddings[i, :]
            pylab.scatter(x, y)
            pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', fontproperties=fm)
        pylab.savefig('recipes2vec.png')

class Ingredients2Recipes:
    """Placeholder for a future ingredients-to-recipes model; not yet
    implemented."""
    def __init__(self):
        pass





def main(_):
if not FLAGS.train_data:
Expand All @@ -208,6 +366,10 @@ def main(_):
i2v = Ingredient2Vec()
i2v.train()
i2v.plot()

r2v = Recipe2Vec(i2v.parser)
r2v.train()
r2v.plot()

if __name__ == "__main__":
tf.app.run()

0 comments on commit 7574b74

Please sign in to comment.