Implemented ingredient descriptors and preparation extraction

ayobamibolaji · Feb 28, 2021 · a78f98e
1 parent 2618897
commit a78f98e
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 20 deletions.
diff --git a/DecomposedText.py b/DecomposedText.py
@@ -43,3 +43,10 @@ def getToken(self, str):
             if token.text == str:
                 return token
 
+    def getTextFromNouns(self, str):
+        for noun in self.nouns:
+            for word in noun.text.split():
+                if str in word:
+                    return word
+        return False
+
diff --git a/Ingredient.py b/Ingredient.py
@@ -8,7 +8,7 @@ def __init__(self, str):
         self.doc = DecomposedText(self.str)
         self.measurement = None
         self.quantity = None
-        self.descriptor = None
+        self.descriptors = None
         self.preparation = None
 
         self.extractProperties(self.str)
@@ -19,22 +19,67 @@ def extractProperties(self, str):
 
         # Extract measurement
         self.measurement = self.doc.text[1] if self.doc.text[1] == self.doc.parent[0].text else None
-        self.measurement = self.doc.text[1] if self.doc.doc[1].ent_type_ == 'QUANTITY' else self.measurement
+        self.measurement = self.doc.text[1] if self.doc.doc[1].ent_type_ == 'QUANTITY' and self.doc.doc[1].pos_ == 'NOUN' else self.measurement
         self.measurement = self.doc.text[1] if containsAnyOf(self.doc.text[1], measures) else self.measurement
+        self.measurement = nextToken(self.doc.getToken(")")).text if self.doc.doc[1].tag_ == '-LRB-' else self.measurement
 
 
         root = self.doc.getRoot()
 
         # Extract ingredient name
         if root.pos_ != 'NOUN' or root.text == self.measurement:
             for child in root.children:
-                if child.pos_ == 'NOUN' and child.text != self.measurement:
+                if child.pos_ == 'NOUN' and child.text != self.measurement and child.i > root.i:
                     root = child
                     break
 
         if root.pos_ != 'NOUN' and self.measurement != None:
             if nextToken(self.doc.getToken(self.measurement)).pos_ == 'NOUN':
                 root = nextToken(self.doc.getToken(self.measurement))
+            elif nextToken(self.doc.getToken(self.measurement)).head.pos_ == 'NOUN':
+                root = nextToken(self.doc.getToken(self.measurement)).head
 
         self.name = precedingWords(root, restrictions=[self.measurement]) + root.text + proceedingWords(root)
 
+
+        # Extract descriptors
+        descriptors = []
+        for token in self.doc.doc:
+            if token.text == self.measurement: continue
+            if (tokenHasProperties(token, "ADJ", "JJ", "amod") and token.head.text in self.name) or\
+                    (tokenHasProperties(token, "NOUN", "NN", "nmod") and token.head.text in self.name):
+                desc = token.text
+                if (tokenHasProperties(previousToken(token), "ADV", "RB", "advmod")):
+                    desc = previousToken(token).text + " " + desc
+                if self.doc.getTextFromNouns(desc):
+                    if self.doc.getTextFromNouns(desc) in descriptors: continue
+                    desc = self.doc.getTextFromNouns(desc)
+                descriptors.append(desc)
+            if (tokenHasProperties(token, "DET", "DT", "det")) and \
+                nextToken(token).tag_ == "HYPH" and nextToken(nextToken(token)).pos_ == "NOUN":
+                descriptors.append(self.doc.getTextFromNouns(nextToken(nextToken(token)).text))
+
+        self.descriptors = ', '.join(descriptors)
+
+        # Extract preparation
+        preparations = []
+        for token in self.doc.doc:
+            if token.text == self.measurement:
+                continue
+
+            nameInChild = tokenHasProperties(token, child=self.name)
+            if (tokenHasProperties(token, "VERB", "VBN", "amod") and True) or\
+                    (tokenHasProperties(token, "VERB", "VBD", "acl") and True) or\
+                        (tokenHasProperties(token, "VERB", "VBD", "ROOT") and True) or\
+                            (tokenHasProperties(token, "VERB", "VBN", "acl") and True):
+                prep = token.text
+                if (tokenHasProperties(previousToken(token), "ADV", "RB", "advmod")):
+                    prep = previousToken(token).text + " " + prep
+                preparations.append(prep)
+
+        self.preparation = ', '.join(preparations)
+
+    # TODO: try to fix (the original note does not say what needs fixing)
+
+    def __repr__(self):
+        return f"{self.name}"
diff --git a/RecipeInfo.py b/RecipeInfo.py
@@ -5,6 +5,7 @@
 class RecipeInfo():
     def __init__(self, url):
         self.rcp = GetRecipe(url)
+        self.name = self.rcp['name']
 
         self.Ingredients = []
         self.Steps = []
@@ -36,3 +37,5 @@ def extractMethods(self, step):
     def extractTools(self, step):
         pass
 
+    def __repr__(self):
+        return f"{self.name}"
diff --git a/fetch_recipe.py b/fetch_recipe.py
@@ -8,13 +8,6 @@
 # documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 
 
-# These things are just for debugging/testing
-from DecomposedText import DecomposedText
-from Ingredient import Ingredient
-test_ingredients = []
-# End of debugging stuffs
-
-
 '''
 Parser Must Recognize:
     Ingredients
@@ -66,15 +59,14 @@ def GetRecipe(url):
     recipe_ingredients = recipe_json[1]["recipeIngredient"]
     # recipe_json is a list of two dictionaries
     recipe_instructions = [dict['text'] for dict in recipe_json[1]["recipeInstructions"]]
+    recipe_name = recipe_json[1]['name']
 
     recipe_info = dict()
-    # dictionary to store the ingredients and instructions
+    # dictionary to store the recipe name, ingredients and instructions
+    recipe_info["name"] = recipe_name
     recipe_info["ingredients"] = recipe_ingredients
     recipe_info["instructions"] = recipe_instructions
 
-    # This is for debugging/testing
-    for ing in recipe_ingredients:
-        test_ingredients.append(ing)
 
 
 

diff --git a/helpers.py b/helpers.py
@@ -42,6 +42,7 @@ def combineQuantity(txt):
     return ' '.join(split_txt)
 
 
+# Token helpers
 def previousToken(token):
     if token.i == 0:
         return False
@@ -54,13 +55,13 @@ def nextToken(token):
     else:
         return token.doc[token.i + 1]
 
-def precedingWords(token, pos='NOUN', restrictions=[]):
+def precedingWords(token, pos=['NOUN', 'PROPN'], restrictions=[]):
     words = ""
 
     currToken = token
     while True:
         prevToken = previousToken(currToken)
-        if prevToken and prevToken.pos_ == 'NOUN' and prevToken.text not in restrictions:
+        if prevToken and prevToken.pos_ in pos and prevToken.text not in restrictions:
             words = prevToken.text + " " + words
             currToken = prevToken
         else:
@@ -69,20 +70,28 @@ def precedingWords(token, pos='NOUN', restrictions=[]):
     return words
 
 
-def proceedingWords(token, pos='NOUN', restrictions=[]):
+def proceedingWords(token, pos=['NOUN', 'PROPN'], restrictions=[]):
     words = ""
 
     currToken = token
     while True:
         nxtToken = nextToken(currToken)
-        if nxtToken and nxtToken.pos_ == 'NOUN' and nxtToken.text not in restrictions:
+        if nxtToken and nxtToken.pos_ in pos and nxtToken.text not in restrictions:
             words = nxtToken.text + " " + words
             currToken = nxtToken
         else:
             break
 
     return words
 
+def tokenHasProperties(token, pos="", tag="", dep="", parent="", child=[]):
+    if pos and token.pos_ != pos: return False
+    if tag and token.tag_ != tag: return False
+    if dep and token.dep_ != dep: return False
+    if parent and token.parent.text != parent: return False
+    if child and child not in [child.text for child in token.children]: return False
+    return True
+
 
 def containsAnyOf(str, lst):
     for aStr in lst:

diff --git a/main.py b/main.py
@@ -3,14 +3,20 @@
 import sys
 
 recipes = []
+test_ingredients = []
 
 def main():
-    for i, url in enumerate(lst_of_urls[:2]):
+    for i, url in enumerate(lst_of_urls[:10]):
         print("Checked {}/{} URLs".format(i+1, len(lst_of_urls)), end='\r')
-        recipes.append(RecipeInfo(url))
+        rcp = RecipeInfo(url)
+        recipes.append(rcp)
+        for ing in rcp.Ingredients:
+            test_ingredients.append(ing)
 
 
 
 if __name__ == "__main__":
+    # from Ingredient import Ingredient
+    # Ingredient('2.00 ounces shredded extra-sharp white Cheddar cheese')
     main()
     print('\nBREAKPOINT HERE')