Skip to content

Commit

Permalink
Implemented ingredient descriptors and preparation extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
Felipe-Caldeira committed Feb 28, 2021
1 parent 2618897 commit a78f98e
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 20 deletions.
7 changes: 7 additions & 0 deletions DecomposedText.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,10 @@ def getToken(self, str):
if token.text == str:
return token

def getTextFromNouns(self, str):
    """Return the first word among the noun texts that contains ``str``.

    Every entry of ``self.nouns`` is read through its ``.text`` attribute,
    split on whitespace, and the resulting words are scanned in order.

    Returns the first word for which ``str in word`` holds, or ``False``
    when no word matches (callers test the result for truthiness).
    """
    matches = (
        piece
        for chunk in self.nouns
        for piece in chunk.text.split()
        if str in piece
    )
    # next() with a default gives the same first-match-or-False contract
    # as the explicit nested loops.
    return next(matches, False)

51 changes: 48 additions & 3 deletions Ingredient.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def __init__(self, str):
self.doc = DecomposedText(self.str)
self.measurement = None
self.quantity = None
self.descriptor = None
self.descriptors = None
self.preparation = None

self.extractProperties(self.str)
Expand All @@ -19,22 +19,67 @@ def extractProperties(self, str):

# Extract measurement
self.measurement = self.doc.text[1] if self.doc.text[1] == self.doc.parent[0].text else None
self.measurement = self.doc.text[1] if self.doc.doc[1].ent_type_ == 'QUANTITY' else self.measurement
self.measurement = self.doc.text[1] if self.doc.doc[1].ent_type_ == 'QUANTITY' and self.doc.doc[1].pos_ == 'NOUN' else self.measurement
self.measurement = self.doc.text[1] if containsAnyOf(self.doc.text[1], measures) else self.measurement
self.measurement = nextToken(self.doc.getToken(")")).text if self.doc.doc[1].tag_ == '-LRB-' else self.measurement


root = self.doc.getRoot()

# Extract ingredient name
if root.pos_ != 'NOUN' or root.text == self.measurement:
for child in root.children:
if child.pos_ == 'NOUN' and child.text != self.measurement:
if child.pos_ == 'NOUN' and child.text != self.measurement and child.i > root.i:
root = child
break

if root.pos_ != 'NOUN' and self.measurement != None:
if nextToken(self.doc.getToken(self.measurement)).pos_ == 'NOUN':
root = nextToken(self.doc.getToken(self.measurement))
elif nextToken(self.doc.getToken(self.measurement)).head.pos_ == 'NOUN':
root = nextToken(self.doc.getToken(self.measurement)).head

self.name = precedingWords(root, restrictions=[self.measurement]) + root.text + proceedingWords(root)


# Extract descriptors
descriptors = []
for token in self.doc.doc:
if token.text == self.measurement: continue
if (tokenHasProperties(token, "ADJ", "JJ", "amod") and token.head.text in self.name) or\
(tokenHasProperties(token, "NOUN", "NN", "nmod") and token.head.text in self.name):
desc = token.text
if (tokenHasProperties(previousToken(token), "ADV", "RB", "advmod")):
desc = previousToken(token).text + " " + desc
if self.doc.getTextFromNouns(desc):
if self.doc.getTextFromNouns(desc) in descriptors: continue
desc = self.doc.getTextFromNouns(desc)
descriptors.append(desc)
if (tokenHasProperties(token, "DET", "DT", "det")) and \
nextToken(token).tag_ == "HYPH" and nextToken(nextToken(token)).pos_ == "NOUN":
descriptors.append(self.doc.getTextFromNouns(nextToken(nextToken(token)).text))

self.descriptors = ', '.join(descriptors)

# Extract preparation
preparations = []
for token in self.doc.doc:
if token.text == self.measurement:
continue

nameInChild = tokenHasProperties(token, child=self.name)
if (tokenHasProperties(token, "VERB", "VBN", "amod") and True) or\
(tokenHasProperties(token, "VERB", "VBD", "acl") and True) or\
(tokenHasProperties(token, "VERB", "VBD", "ROOT") and True) or\
(tokenHasProperties(token, "VERB", "VBN", "acl") and True):
prep = token.text
if (tokenHasProperties(previousToken(token), "ADV", "RB", "advmod")):
prep = previousToken(token).text + " " + prep
preparations.append(prep)

self.preparation = ', '.join(preparations)

# Try to fix

def __repr__(self):
return f"{self.name}"
3 changes: 3 additions & 0 deletions RecipeInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
class RecipeInfo():
def __init__(self, url):
self.rcp = GetRecipe(url)
self.name = self.rcp['name']

self.Ingredients = []
self.Steps = []
Expand Down Expand Up @@ -36,3 +37,5 @@ def extractMethods(self, step):
def extractTools(self, step):
pass

def __repr__(self):
return f"{self.name}"
14 changes: 3 additions & 11 deletions fetch_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,6 @@
# documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/


# These things are just for debugging/testing
from DecomposedText import DecomposedText
from Ingredient import Ingredient
test_ingredients = []
# End of debugging stuff


'''
Parser Must Recognize:
Ingredients
Expand Down Expand Up @@ -66,15 +59,14 @@ def GetRecipe(url):
recipe_ingredients = recipe_json[1]["recipeIngredient"]
# recipe_json is a list of two dictionaries
recipe_instructions = [dict['text'] for dict in recipe_json[1]["recipeInstructions"]]
recipe_name = recipe_json[1]['name']

recipe_info = dict()
# dictionary to store the ingredients and instructions
# dictionary to store the recipe name, ingredients and instructions
recipe_info["name"] = recipe_name
recipe_info["ingredients"] = recipe_ingredients
recipe_info["instructions"] = recipe_instructions

# This is for debugging/testing
for ing in recipe_ingredients:
test_ingredients.append(ing)



Expand Down
17 changes: 13 additions & 4 deletions helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def combineQuantity(txt):
return ' '.join(split_txt)


# Token helpers
def previousToken(token):
if token.i == 0:
return False
Expand All @@ -54,13 +55,13 @@ def nextToken(token):
else:
return token.doc[token.i + 1]

def precedingWords(token, pos='NOUN', restrictions=[]):
def precedingWords(token, pos=['NOUN', 'PROPN'], restrictions=[]):
words = ""

currToken = token
while True:
prevToken = previousToken(currToken)
if prevToken and prevToken.pos_ == 'NOUN' and prevToken.text not in restrictions:
if prevToken and prevToken.pos_ in pos and prevToken.text not in restrictions:
words = prevToken.text + " " + words
currToken = prevToken
else:
Expand All @@ -69,20 +70,28 @@ def precedingWords(token, pos='NOUN', restrictions=[]):
return words


def proceedingWords(token, pos=('NOUN', 'PROPN'), restrictions=()):
    """Collect the run of qualifying words that directly follows ``token``.

    Walks forward from ``token`` with ``nextToken`` and keeps going while the
    next token's ``pos_`` is in ``pos`` and its text is not in
    ``restrictions``. The walk stops at the first token that fails either
    check, or when ``nextToken`` yields a falsy value.

    Args:
        token: Token to start from (it is not included in the result).
        pos: Collection of part-of-speech labels that are allowed to extend
            the run.
        restrictions: Collection of token texts that terminate the run.

    Returns:
        The collected words in document order, each preceded by one space
        (e.g. ``" Cheddar cheese"``), so the result can be appended directly
        after the starting token's text. Empty string when nothing follows.
    """
    collected = []

    currToken = token
    while True:
        nxtToken = nextToken(currToken)
        if not (nxtToken and nxtToken.pos_ in pos and nxtToken.text not in restrictions):
            break
        # Append (not prepend): following words must stay in document order.
        collected.append(nxtToken.text)
        currToken = nxtToken

    # A leading separator per word keeps "root" + result correctly spaced.
    return "".join(" " + word for word in collected)

def tokenHasProperties(token, pos="", tag="", dep="", parent="", child=None):
    """Check whether ``token`` satisfies every property that was requested.

    Each criterion is only tested when a truthy value is passed for it, so
    calling with no criteria returns ``True`` for any real token.

    Args:
        token: Token to inspect. ``None`` or ``False`` (the value the
            neighbouring-token helpers return when there is no such token)
            never matches.
        pos: Required value of ``token.pos_``.
        tag: Required value of ``token.tag_``.
        dep: Required value of ``token.dep_``.
        parent: Required text of the token's syntactic head.
        child: Text that must equal the text of one of ``token.children``.

    Returns:
        ``True`` if all requested criteria hold, otherwise ``False``.
    """
    # Callers pass previousToken(...) straight in, which is False at index 0.
    if token is None or token is False:
        return False
    if pos and token.pos_ != pos:
        return False
    if tag and token.tag_ != tag:
        return False
    if dep and token.dep_ != dep:
        return False
    # The syntactic parent is exposed as ``head``; tokens have no ``parent``.
    if parent and token.head.text != parent:
        return False
    if child and child not in [kid.text for kid in token.children]:
        return False
    return True


def containsAnyOf(str, lst):
for aStr in lst:
Expand Down
10 changes: 8 additions & 2 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@
import sys

recipes = []
test_ingredients = []

def main():
    """Build a RecipeInfo for the first ten URLs and collect the results.

    Each RecipeInfo is appended to the module-level ``recipes`` list and its
    ``Ingredients`` are added to the module-level ``test_ingredients`` list.
    A single-line progress counter is rewritten on stdout after each URL.
    """
    batch = lst_of_urls[:10]
    for i, url in enumerate(batch):
        rcp = RecipeInfo(url)
        recipes.append(rcp)
        test_ingredients.extend(rcp.Ingredients)
        # Report against the slice actually processed, and only once the URL
        # has really been handled, so the counter can reach its total.
        print("Checked {}/{} URLs".format(i+1, len(batch)), end='\r')



if __name__ == "__main__":
# from Ingredient import Ingredient
# Ingredient('2.00 ounces shredded extra-sharp white Cheddar cheese')
main()
print('\nBREAKPOINT HERE')

0 comments on commit a78f98e

Please sign in to comment.