-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_recipe.py
77 lines (59 loc) · 3.04 KB
/
fetch_recipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import sys
import json
import re
import urllib.request # for fetching the html content of a url
# documentation: https://docs.python.org/3/howto/urllib2.html
from bs4 import BeautifulSoup # for parsing the html content
# documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
'''
Parser Must Recognize:
    Ingredients
        Ingredient name
        Quantity
        Measurement (cup, teaspoon, pinch, etc.)
        (optional) Descriptor (e.g. fresh, extra-virgin)
        (optional) Preparation (e.g. finely chopped)
    Tools – pans, graters, whisks, etc.
    Methods
        Primary cooking method (e.g. sauté, broil, boil, poach, etc.)
        (optional) Other cooking methods used (e.g. chop, grate, stir, shake, mince, crush, squeeze, etc.)
    Steps – parse the directions into a series of steps that each consist of ingredients, tools, methods, and time
'''
def GetRecipe(url):
    '''
    Fetch a recipe page and extract its name, ingredients and instructions.

    The HTML of the page is downloaded and parsed with BeautifulSoup. The
    text of the first <script> tag in the document is parsed as JSON, and
    the recipe data is read from the second element (index 1) of the parsed
    JSON array.

    NOTE(review): this assumes the page embeds the recipe as a JSON array
    in its very first <script> tag, with the recipe entry at index 1 --
    confirm against the sites this is used with.

    :param url: (string) url of recipe to fetch
    :return: (dict) a dictionary with the keys "name", "ingredients" (the
             "recipeIngredient" value of the recipe entry) and
             "instructions" (a list holding the "text" value of every
             "recipeInstructions" entry)
    :raises IndexError: if the page has no <script> tag, the first one is
                        empty, or the parsed JSON array has fewer than two
                        elements
    :raises KeyError: if the recipe entry lacks one of the expected keys
    :raises ValueError: if the script content is not valid JSON
                        (json.JSONDecodeError is a ValueError subclass)
    '''
    # The with-block guarantees the HTTP response is closed as soon as the
    # body has been read, even if reading or decoding fails.
    with urllib.request.urlopen(url) as fetch_response:
        # .read() returns bytes, so decode them into text
        html_doc = fetch_response.read().decode("utf-8")

    # BeautifulSoup object that exposes find_all(tag name), get_text(), etc.
    # documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigating-the-tree
    html_doc_parser = BeautifulSoup(html_doc, 'html.parser')

    # the first <script> tag of the document is taken to hold the recipe JSON
    recipe_json_as_tag = html_doc_parser.find_all('script')[0]

    # contents[0] is the first child of the tag (its text); newlines are
    # stripped before the string is handed to the JSON parser
    recipe_json_as_str = recipe_json_as_tag.contents[0].replace("\n", "")

    # drop a leading "<?script"-style fragment if one is present at the very
    # start of the string (the pattern requires one character between "<"
    # and "script")
    recipe_json_array_as_str = re.sub(r"^<.script", '', recipe_json_as_str)

    # parse the JSON text into Python objects (a list is expected here)
    recipe_json = json.loads(recipe_json_array_as_str)

    # uncomment this line to visually see the parsed json
    # from: https://www.journaldev.com/33302/python-pretty-print-json
    # print(json.dumps(recipe_json, indent=2))

    # the recipe itself is read from the second element of the parsed list
    recipe = recipe_json[1]
    recipe_ingredients = recipe["recipeIngredient"]
    # "step" instead of "dict" so the builtin is not shadowed
    recipe_instructions = [step['text'] for step in recipe["recipeInstructions"]]
    recipe_name = recipe['name']

    # dictionary to store the recipe name, ingredients and instructions
    return {
        "name": recipe_name,
        "ingredients": recipe_ingredients,
        "instructions": recipe_instructions,
    }