-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathParser.py
68 lines (59 loc) · 1.91 KB
/
Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import urllib.request
from html.parser import HTMLParser
from htmldom import htmldom
import re
class Parser:
    """Scrape a recipe page into a plain dictionary.

    The page is fetched with ``urllib`` and queried with ``htmldom``; the
    ingredient names are guessed from the text of each list item.
    """

    # Georgian vowel letters.  A token whose last character is one of these is
    # treated as the ingredient name of a rule.
    # NOTE(review): presumably this relies on the nominative form of a noun
    # ending in a vowel while units/quantities do not -- confirm with author.
    vowels = {
        "ა": True,
        "ე": True,
        "ი": True,
        "ო": True,
        "უ": True,
    }

    # Token separator: runs of whitespace, with the non-breaking space listed
    # explicitly.  Compiled once instead of on every ``get_data`` call.
    _whitespace = re.compile(r"[\xa0\s]+")

    @staticmethod
    def _extract_ingredient(rule):
        """Return the ingredient name guessed from one rule line, or ``None``.

        The rule is split on whitespace and scanned from the last token
        backwards; the first token found that ends in a vowel is the result.
        If that token is neither the first nor the last one and the token
        right before it ends in "ის", the two are joined with a space.

        :param rule: text of a single list item.
        :return: the ingredient string, or ``None`` when no token ends in a
            vowel (including an empty or whitespace-only rule).
        """
        tokens = Parser._whitespace.split(rule.strip())
        last = len(tokens) - 1

        for index in range(last, -1, -1):
            # ``[-1:]`` rather than ``[-1]`` so an empty token is skipped
            # instead of raising IndexError.
            if tokens[index][-1:] in Parser.vowels:
                break
        else:
            return None

        word = tokens[index]
        # ``0 < index`` guards against ``tokens[index - 1]`` wrapping around
        # to the last token when the match is the very first token.
        if 0 < index < last and tokens[index - 1][-2:] == "ის":
            return tokens[index - 1] + " " + word
        return word

    @staticmethod
    def get_data(link: str):
        """Download the page at *link* and extract the recipe fields.

        :param link: URL of the recipe page.
        :return: a dict with the keys ``image`` (``src`` of the first
            ``img.attachment-post-full``), ``title`` (text of ``h1#title``),
            ``rules`` (text of every ``li`` in the first ``div.entry ul``),
            ``recipe`` (concatenated HTML of the nodes selected after that
            list), ``ingredients`` (one guessed name per rule, rules without
            a match are skipped) and ``source`` (the *link* itself); ``None``
            if downloading or parsing fails for any reason.
        """
        try:
            # Close the HTTP response deterministically.
            with urllib.request.urlopen(link) as response:
                data = response.read().decode()

            dom = htmldom.HtmlDom().createDom(data)
            image = dom.find("img.attachment-post-full").first().attr("src")
            title = dom.find("h1#title").first().text()
            ul = dom.find("div.entry ul").first()

            # Nodes from the first <p> after the list up to the next <div>.
            recipe_paragraphs = ul.next("p").first().nextUntil("div")
            recipe = "".join(node.html() for node in recipe_paragraphs)

            rules = []
            ingredients = []
            for li in ul.find("li"):
                rule = li.text()
                rules.append(rule)
                ingredient = Parser._extract_ingredient(rule)
                if ingredient is not None:
                    ingredients.append(ingredient)

            return {
                "image": image,
                "title": title,
                "rules": rules,
                "recipe": recipe,
                "ingredients": ingredients,
                "source": link,
            }
        except Exception:
            # Best-effort scraper: any download/parse error yields None, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("Failed to request resource: ", link)
            return None