Commit

simsicon committed May 11, 2017
1 parent 7574b74 commit f6b93b7
Showing 6 changed files with 482 additions and 2 deletions.
19 changes: 17 additions & 2 deletions ingredients.py
@@ -351,11 +351,26 @@ def plot(self):
        pylab.savefig('recipes2vec.png')

class Ingredients2Recipes:

    # Special vocabulary symbols and their fixed ids (PAD/GO/EOS/UNK).
    _PAD = b"_PAD"
    _GO = b"_GO"
    _EOS = b"_EOS"
    _UNK = b"_UNK"
    _START_VOCAB = [_PAD, _GO, _EOS, _UNK]

    PAD_ID = 0
    GO_ID = 1
    EOS_ID = 2
    UNK_ID = 3

    def __init__(self):
        self.build_graph()

    def build_graph(self):
        pass

    def train(self):
        pass


def main(_):
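Aside (not part of the diff itself): a minimal sketch of how the PAD_ID / GO_ID / EOS_ID / UNK_ID constants defined in Ingredients2Recipes are conventionally used to turn variable-length token id lists into fixed-length encoder/decoder sequences for a seq2seq model. The helper name pad_pair, the pad_length, and the example token ids are assumptions for illustration only.

# Illustration only: bracket a target sequence with GO/EOS and pad both
# sequences to a fixed length, mirroring the ids defined above.
PAD_ID, GO_ID, EOS_ID, UNK_ID = 0, 1, 2, 3

def pad_pair(encoder_ids, decoder_ids, pad_length=10):
    # encoder input: reversed and padded (a common seq2seq convention)
    enc = list(reversed(encoder_ids)) + [PAD_ID] * (pad_length - len(encoder_ids))
    # decoder input: GO + tokens + EOS, then padding
    dec = [GO_ID] + decoder_ids + [EOS_ID]
    dec += [PAD_ID] * (pad_length - len(dec))
    return enc, dec

# hypothetical ingredient ids -> recipe-name token ids
print(pad_pair([7, 42, 5], [13, 8]))
# -> ([5, 42, 7, 0, 0, 0, 0, 0, 0, 0], [1, 13, 8, 2, 0, 0, 0, 0, 0, 0])
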
104 changes: 104 additions & 0 deletions ingredients2recipes.py
@@ -0,0 +1,104 @@
import json
import numpy as np
import collections
import tensorflow as tf

from tensorflow.models.rnn.translate import seq2seq_model

import pdb

tf.app.flags.DEFINE_integer("ingredients_vocab_size", 1000, "Ingredients vocabulary size.")
tf.app.flags.DEFINE_integer("recipes_vocab_size", 1000, "Recipes vocabulary size.")
tf.app.flags.DEFINE_string("checkpoints_dir", "checkpoints/ingredients2recipes/", "Checkpoints dir")
tf.app.flags.DEFINE_boolean("decode", False, "Set to True for interactive decoding.")

FLAGS = tf.app.flags.FLAGS

class Parser():
    def __init__(self, file_path):
        self.file_path = file_path
        self.ingredient_size = 1000
        self.extract()

    def extract(self):
        with open(self.file_path, "r") as f:
            lines = f.readlines()
            self.raw_data = [json.loads(line) for line in lines]

        self.all_ingredients = []
        self.all_recipes = []
        for recipe in self.raw_data:
            self.all_ingredients.extend([i[0] for i in recipe["ingredients"]])
            self.all_recipes.append(recipe["name"])  # one entry per recipe name

        self.recipes_size = len(self.all_recipes)

        self.ingredients_counter = collections.Counter(self.all_ingredients)

        ingredients_count = [['UNK', -1]]
        ingredients_count.extend(self.ingredients_counter.most_common(self.ingredient_size - 1))

        # Map each kept ingredient to an integer id; id 0 is reserved for UNK.
        self.ingredient_dict = dict()
        for _ingredient, _ in ingredients_count:
            self.ingredient_dict[_ingredient] = len(self.ingredient_dict)

        self.reversed_dictionary = dict(zip(self.ingredient_dict.values(), self.ingredient_dict.keys()))

    def generate_batch(self, batch_size=64):
        recipes = np.random.choice(self.raw_data, batch_size, replace=False)
        input_data = []
        output_data = []
        for recipe in recipes:
            output_data.append(recipe["name"])
            _group = []
            for ingredient, _ in recipe["ingredients"]:
                if ingredient in self.ingredient_dict:
                    _group.append(self.ingredient_dict[ingredient])
                else:
                    _group.append(0)  # unknown ingredient -> UNK id
            input_data.append(_group)
        return input_data, output_data


class Engine():
    def __init__(self):
        self.parser = Parser("data/sitemap.json")
        self.batch_size = 64
        self.size = 256
        self.num_layers = 3
        self.num_encoder_symbols = 1000
        self.num_decoder_symbols = 1000
        self.embedding_size = 200

    def build_model(self):
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None], name="encoder")
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None], name="decoder")
        self.target_weights = tf.placeholder(tf.float32, shape=[None], name="weight")

        single_cell = tf.nn.rnn_cell.GRUCell(self.size)
        cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * self.num_layers)

        return tf.nn.seq2seq.embedding_rnn_seq2seq(self.encoder_inputs,
                                                   self.decoder_inputs,
                                                   cell,
                                                   self.num_encoder_symbols,
                                                   self.num_decoder_symbols,
                                                   self.embedding_size)

    def train(self):
        with tf.Session() as sess:
            outputs, states = self.build_model()




def main(_):
    engine = Engine()

    if FLAGS.decode:
        engine.decode()
    else:
        engine.train()

if __name__ == "__main__":
    tf.app.run()
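
For context, a minimal usage sketch of the Parser class above. It assumes data/sitemap.json exists in the one-JSON-object-per-line format the code expects; the batch size and the printed fields are illustrative only.

# Illustration only: inspect one training batch produced by Parser.
parser = Parser("data/sitemap.json")
inputs, outputs = parser.generate_batch(batch_size=4)

for ingredient_ids, recipe_name in zip(inputs, outputs):
    # ingredient_ids is a list of vocabulary ids (0 = UNK);
    # recipe_name is the raw recipe title used as the decoder target.
    print(ingredient_ids, "->", recipe_name)
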
104 changes: 104 additions & 0 deletions lstm_text_generation.py
@@ -0,0 +1,104 @@
'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

path = "/data/ptb/train.txt"
text = open(path).read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# train the model, output generated text after each iteration
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, nb_epoch=1)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
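
To make the role of the diversity (temperature) value passed to sample() concrete, a small standalone sketch with a made-up four-character probability vector: dividing the log-probabilities by the temperature and renormalising sharpens the distribution when the temperature is below 1 and flattens it when it is above 1.

import numpy as np

# Illustration only: how temperature reshapes a toy next-char distribution
# before sampling, using the same transform as sample() above.
preds = np.array([0.6, 0.25, 0.1, 0.05])

for temperature in [0.2, 0.5, 1.0, 1.2]:
    logits = np.log(preds) / temperature
    scaled = np.exp(logits) / np.sum(np.exp(logits))
    print(temperature, np.round(scaled, 3))
# At 0.2 nearly all mass sits on the most likely character;
# at 1.2 the distribution is slightly flatter than the original.
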
27 changes: 27 additions & 0 deletions prepare.py
@@ -0,0 +1,27 @@
import json
import codecs

file_path = "data/sitemap.json"

with open(file_path, "r") as f:
    lines = f.readlines()
    raw_data = [json.loads(line) for line in lines]

ingredients_file_path = "data/ingredients"
recipes_file_path = "data/recipes"

ingredients_file = codecs.open(ingredients_file_path, "w", "utf-8-sig")
recipes_file = codecs.open(recipes_file_path, "w", "utf-8-sig")

for recipe in raw_data:
    recipe_name = recipe["name"] + "\n"
    ingredient_names = " ".join([i[0] for i in recipe["ingredients"]]) + "\n"

    ingredients_file.write(ingredient_names)
    recipes_file.write(recipe_name)

ingredients_file.flush()
ingredients_file.close()

recipes_file.flush()
recipes_file.close()
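
For reference, a hypothetical example of the per-line JSON structure that both prepare.py and the Parser class assume. Only a "name" string and an "ingredients" list whose items start with the ingredient name are implied by the code; the field values and the quantity element here are invented for illustration.

# Illustration only: one line of data/sitemap.json as the scripts expect it,
# plus the corresponding lines written to data/recipes and data/ingredients.
import json

line = json.dumps({
    "name": "tomato egg stir fry",                    # hypothetical recipe title
    "ingredients": [["tomato", "2"], ["egg", "3"]],   # [ingredient name, quantity]
})

recipe = json.loads(line)
print(recipe["name"])                                 # -> data/recipes line
print(" ".join(i[0] for i in recipe["ingredients"]))  # -> data/ingredients line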