-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfix_databaseconstraints.py
84 lines (68 loc) · 2.58 KB
/
fix_databaseconstraints.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pywikibot
import mwparserfromhell as mwp
import pyisbn
import json
# Open the English Wikipedia site and the data repository attached to it,
# then log in to each one only if there is no active session yet
# (repository first, Wikipedia second).
en_wikipedia = pywikibot.Site('en', 'wikipedia')
wikidata = en_wikipedia.data_repository()
for _site in (wikidata, en_wikipedia):
    if not _site.logged_in():
        _site.login()
def cleanisbn(isbn):
    """Return a tidied ISBN-10 candidate, or False when *isbn* is not one.

    Every character other than a digit, ``x``/``X`` or a hyphen is
    discarded (this also drops surrounding and embedded whitespace).  The
    hyphen-preserving result is returned only when it contains exactly ten
    digit/``X`` characters; any other count yields ``False``.  No checksum
    is verified here.
    """
    isbn = isbn.strip()
    # Build real strings with join(): on Python 3 a bare filter() is a lazy
    # iterator, which has no len() and cannot be returned as the ISBN text.
    cleanedisbn = ''.join(ch for ch in isbn if ch in '1234567890xX-')
    # Same characters without the hyphens; only these count towards length.
    numericisbn = cleanedisbn.replace('-', '')
    if len(numericisbn) == 10:
        return cleanedisbn
    return False
def boolvalidate(isbn):
    """Return the result of ``pyisbn.validate(isbn)``, or False if it raises.

    Any ordinary exception raised by the validator is treated as "not
    valid" so that callers can use the return value directly in a
    condition.
    """
    try:
        return pyisbn.validate(isbn)
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit still stop the script instead of being reported as an
        # invalid ISBN.
        return False
# Fetch the text of the P212 constraint-violation report from the data
# repository and parse it.  As before, ``rootpage`` ends up holding the
# page text and ``wikicode`` the parsed form of that text.
_report_page = pywikibot.Page(
    wikidata,
    'Wikidata:Database_reports/Constraint_violations/P212#Format')
rootpage = _report_page.get()
wikicode = mwp.parse(rootpage)
def savecases():
    """Write the module-level ``fixcases`` mapping to ``fixcases.JSON``.

    The file in the current working directory is overwritten with the
    JSON form of ``fixcases``, indented by four spaces.
    """
    # ``with`` guarantees the handle is closed even if json.dump() raises.
    with open('fixcases.JSON', 'w') as fixcasesJSON:
        json.dump(fixcases, fixcasesJSON, indent=4)
# Load the progress record.  fixcases['prevtouched'] is compared against the
# running line number below so that lines handled by an earlier run are
# skipped.  ``with`` closes the read handle before savecases() reopens the
# same file for writing.
with open('fixcases.JSON', 'r') as fixcasesJSON:
    fixcases = json.load(fixcasesJSON)

sections = wikicode.get_sections()
for section in sections:
    # Only sections whose text starts with the '== "Format' heading are used.
    if section[:10] == '== "Format':
        # No maxsplit: a cap would lump every line past it into one
        # pseudo-line.  Line numbers start at 1.
        for linenum, line in enumerate(section.split('\n'), 1):
            print(linenum)
            if fixcases['prevtouched'] > linenum - 1:
                continue

            # Pull the item id and the reported value out of the line.
            qid = ''
            isbn = ''
            for linebit in mwp.parse(line).filter():
                if isinstance(linebit, mwp.nodes.wikilink.Wikilink):
                    # Strip the surrounding "[[" and "]]".
                    qid = linebit[2:-2]
                if isinstance(linebit, mwp.nodes.text.Text) and linebit != '*':
                    # Drop the first character of the text; when a line has
                    # several Text nodes the last one wins.
                    isbn = linebit[1:]
            print('qid %s  isbn %s' % (qid, isbn))

            if qid.startswith('Q'):
                wditem = pywikibot.ItemPage(wikidata, qid)
                cleanedisbn = cleanisbn(isbn)
                if cleanedisbn and boolvalidate(cleanedisbn):
                    # The value is a valid 10-character ISBN: store it under
                    # P957 ...
                    isbn10claim = pywikibot.Claim(site=wikidata, pid='P957')
                    isbn10claim.setTarget(cleanedisbn)
                    wditem.addClaim(isbn10claim)
                    # ... and take the P212 claims off the item.
                    # NOTE(review): this removes EVERY P212 claim on the
                    # item, not only the one listed in the report; an item
                    # that also carries a well-formed P212 value would lose
                    # it.  Confirm this is intended.
                    page_parts = wditem.get()
                    claims = page_parts['claims']
                    # Iterate over a copy so that removing claims cannot
                    # disturb the iteration.
                    for claim in list(claims.get('P212', [])):
                        wditem.removeClaims(claim)

            # Checkpoint after each report line so a restart resumes here.
            # NOTE(review): the nesting level of this checkpoint was
            # reconstructed; confirm it should run once per line rather than
            # only after an item was edited.
            fixcases['prevtouched'] = linenum
            savecases()
print('done')