-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathshow-missing-favicons.py
89 lines (73 loc) · 3 KB
/
show-missing-favicons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#Many thanks to http://www.nomachetejuggling.com/2012/03/30/how-to-clean-up-your-chrome-bookmark-bar/
from HTMLParser import HTMLParser
import hashlib, os
from urllib2 import urlopen
#try:
# import ujson as json
#except ImportError:
# print >> sys.stderr, '[WARN] Unable to load ujson, loading slower default json instead'
# import json
#A trailing-whitespace bug in ujson means we'll have to skip this for now.
import json
def get_tlds():
    """Return the list of known top-level domains, downloading it if needed.

    The list is read from ``tlds-alpha-by-domain.txt`` in the current working
    directory. When that file does not exist it is first fetched from IANA
    and stored without its leading comment line. The parsed result is
    memoised on ``get_tlds.cache`` so the file is read at most once per
    process.

    Returns:
        A list with one stripped entry per non-blank, non-comment line of the
        file (upper-case TLD names as published by IANA).

    Raises:
        Whatever ``urlopen`` or ``open`` raise when the download or the file
        access fails.
    """
    if get_tlds.cache is not None:
        return get_tlds.cache

    path = 'tlds-alpha-by-domain.txt'
    if not os.path.exists(path):
        # Download *before* creating the file: a failed request must not
        # leave an empty file behind that later runs would treat as valid.
        response = urlopen("http://data.iana.org/TLD/tlds-alpha-by-domain.txt")
        try:
            payload = response.read()
        finally:
            response.close()
        # Skip the initial comment line; keep everything after it.
        body = payload.partition(b"\n")[2]
        with open(path, 'wb') as f:
            f.write(body)

    with open(path, 'rb') as f:
        stripped = (line.strip() for line in f)
        # Tolerate files that still contain the header or blank lines.
        get_tlds.cache = [
            entry for entry in stripped
            if entry and not entry.startswith(b"#")
        ]
    return get_tlds.cache


get_tlds.cache = None
class BookmarkHTMLParser(HTMLParser):
    """Print a JSON object body for every icon-less ``javascript:`` bookmark.

    The parser watches ``<a>`` elements. When one has no ``icon`` attribute
    and an ``href`` starting with ``javascript``, it prints an opening brace
    at the start tag and the entry's fields plus a closing ``},`` at the
    matching ``</a>``. The caller is responsible for printing the enclosing
    ``[`` / ``]``.

    Attributes:
        tlds: known top-level domains used to recognise host names.
        in_name: True while inside an ``<a>`` element that is being reported.
        hasher: hash constructor applied to the bookmark's ``href``.
        defer_attrs: attributes of the most recent ``<a>`` start tag, or None.
        data: text collected inside the current ``<a>`` element, or None.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # A frozenset keeps the per-candidate membership test O(1).
        self.tlds = frozenset(get_tlds())
        self.in_name = False
        self.hasher = hashlib.md5
        # Initialised here so handle_endtag never hits a missing attribute
        # (e.g. for an <a> element that contains no text at all).
        self.defer_attrs = None
        self.data = None

    def handle_starttag(self, tag, attrs):
        """Open a new entry when *tag* is an icon-less javascript bookmark."""
        if tag != "a":
            return
        self.defer_attrs = dict(attrs)
        if self._wants_entry(self.defer_attrs):
            self.in_name = True
            self.data = None
            print(" {")

    def handle_endtag(self, tag):
        """Emit the pending entry's fields when its ``</a>`` is reached."""
        # Only the anchor's own end tag closes the entry; other closing tags
        # nested inside the link text must not end it early.
        if tag != "a" or not self.in_name:
            return
        for line in self._entry_lines(self.defer_attrs, self.data):
            print(line)
        self.in_name = False
        self.defer_attrs = None
        self.data = None

    def handle_data(self, data):
        """Collect the bookmark's display name while inside a reported link."""
        if self.in_name:
            # The text of one element may be delivered in several chunks, so
            # accumulate instead of keeping only the last one.
            self.data = data if self.data is None else self.data + data

    def _wants_entry(self, attrs):
        """Return True for attrs lacking ``icon`` whose href is javascript."""
        # ``href`` may be absent or valueless (None) on arbitrary anchors.
        href = attrs.get('href') or ''
        return "icon" not in attrs and href.startswith('javascript')

    def _detect_domain(self, href):
        """Return the first ``//host`` in *href* ending in a known TLD, else None."""
        for fragment in href.split("//")[1:]:
            host = fragment.split("/")[0]
            if host.split(".")[-1].upper() in self.tlds:
                return host
        return None

    def _entry_lines(self, attrs, name):
        """Return the output lines for one entry, ending with the closing brace."""
        href = attrs['href']
        # Hash functions need bytes; encode text explicitly so non-ASCII
        # hrefs do not fail on an implicit ASCII conversion.
        raw_href = href if isinstance(href, bytes) else href.encode('utf-8')
        #For sanity's sake, keep this in sort order.
        return [
            ' "detected_domain": %s,' % json.dumps(self._detect_domain(href)),
            ' "hash": %s,' % json.dumps(self.hasher(raw_href).hexdigest()),
            ' "icon": %s,' % json.dumps(None),
            ' "name": %s,' % json.dumps(name),
            ' "raw_attrs": %s,' % json.dumps(attrs, sort_keys=True),
            ' "strip_name": %s' % json.dumps(True),
            " },",
        ]
### Script entry point: everything below only runs when executed directly.
DEFAULT_FILEPATH = "userdata/bookmarks.unprocessed.html"
if __name__ == "__main__":
    import sys
    import os
    import fileinput

    # Load (and, if necessary, download) the TLD list before any output.
    get_tlds()

    print('[')
    # Build the parser, then stream the input to it one line at a time.
    # fileinput reads the files named on the command line, or stdin if none.
    bookmark_parser = BookmarkHTMLParser()
    for html_line in fileinput.input():
        bookmark_parser.feed(html_line)
    # Every entry ends with "},", so close the array with an empty object
    # to avoid the trailing-comma problem.
    print('{}]')