"""
This module is a python API to the toolserver database.
"""
import datetime, time
import MySQLdb

class UnFlagged:
    """Hold a page and its revision info."""

    def __init__(self):
        self.error = False

    def initialize_with_db(self, row):
        """Fill the object from a result row: (page id, page title, latest revision, stable revision)."""
        self.page_nr = row[0]
        self.page_title = row[1]
        self.latest = row[2]
        self.stable = row[3]

    def get_unflagged(self, c):
        """Find the oldest revision newer than the stable one and remember its timestamp."""
        all_revisions = get_all_revisions(c, self.page_nr)
        unflagged = [rev for rev in reversed(all_revisions) if rev[0] > self.stable]
        # Sometimes there is no unflagged revision at all; treat that as an error.
        if len(unflagged) == 0:
            self.first_time = datetime.datetime.now()
            self.error = True
            return
        self.first_unflagged = unflagged[-1]
        # rev_timestamp is the seventh column of the revision table.
        first_time = self.first_unflagged[6]
        self.first_time = datetime.datetime(*time.strptime(
            first_time, "%Y%m%d%H%M%S")[0:6])
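

# A minimal usage sketch for UnFlagged, kept as a comment so nothing runs on
# import. It assumes a MySQLdb cursor `cur` and a result row of the form
# (page id, page title, latest revision, stable revision); the query that
# produces such a row is not part of this module.
#
#   page = UnFlagged()
#   page.initialize_with_db(row)
#   page.get_unflagged(cur)
#   if not page.error:
#       print page.page_title, page.first_time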


def get_all_revisions(c, page_nr, wiki="dewiki_p"):
    """Return all rows of the revision table for the given page id."""
    # The database name cannot be bound as a query parameter, so it is
    # interpolated into the statement; the page id is passed as a parameter.
    c.execute("SELECT * FROM %s.revision WHERE rev_page = %%s" % wiki, (page_nr,))
    return c.fetchall()


def make_db_safe(title):
    """Normalize a page title for use in an SQL statement."""
    title = title.strip()
    title = title.replace(" ", "_")    # MediaWiki stores titles with underscores
    title = title.replace('"', '\\"')  # escape embedded double quotes
    return title


def db_get_articles_in_category(language, category, c, depth=0,
                                namespace=0, exclude=None, done_cats=None,
                                no_redirects=False, limit='', project='wikipedia',
                                only_redirects=False):
    """Return a list of all article titles in a category.

    Takes the language, the category name and a database cursor; reusing a
    cursor instead of creating one inside the function is roughly three times
    faster.
    """
    # Avoid mutable default arguments: a shared default list would keep its
    # contents between calls.
    if exclude is None: exclude = []
    if done_cats is None: done_cats = []
    category = make_db_safe(category)
    if category in done_cats: return []
    if category in exclude: return []
    db = language + 'wiki_p'
    limit_sql = ''
    if limit != '': limit_sql = " LIMIT " + limit
    ret = []
    subcats = []
    red = ''
    if no_redirects: red = ' AND page_is_redirect=0'
    if only_redirects: red = ' AND page_is_redirect=1'
    # The category name was normalized by make_db_safe above and is
    # interpolated directly into the statement.
    sql = """SELECT page_title,page_namespace FROM page,categorylinks
        WHERE page_id=cl_from AND cl_to="%s" """ % category
    sql += red + limit_sql
    c.execute("USE " + db)
    c.execute(sql)
    result = c.fetchall()
    for o in result:
        page_title = o[0]
        page_namespace = o[1]
        # Namespace 14 is the category namespace; a depth below -99 means
        # unlimited recursion.
        if page_namespace == 14 and (depth > 0 or depth < -99):
            subcats.append(o[0])
        if page_namespace != namespace: continue
        if page_title in exclude: continue
        ret.append(page_title)
    done_cats.append(category)
    for sc in subcats:
        ret2 = db_get_articles_in_category(language, sc, c, depth - 1,
                                           namespace, exclude, done_cats,
                                           no_redirects, limit, project,
                                           only_redirects)
        for r in ret2:
            if r not in ret: ret.append(r)
    return ret


class Page:
    """Simple container for a page's title, namespace and id."""
    def __init__(self):
        pass


def db_get_articles_in_category_object(language, category, c, depth=0,
                                       namespace=0, exclude=None, done_cats=None,
                                       no_redirects=False, limit='', project='wikipedia',
                                       only_redirects=False):
    """Like db_get_articles_in_category, but return Page objects, deduplicated by page id."""
    non_unique_result = _db_get_articles_in_category_object(language,
        category, c, depth, namespace, exclude, done_cats,
        no_redirects, limit, project, only_redirects)
    result = []
    rids = {}
    for r in non_unique_result:
        if r.id not in rids:
            result.append(r)
            rids[r.id] = ''
    return result


def _db_get_articles_in_category_object(language, category, c, depth=0,
                                        namespace=0, exclude=None, done_cats=None,
                                        no_redirects=False, limit='', project='wikipedia',
                                        only_redirects=False):
    """Return a list of Page objects for all articles in a category.

    Takes the language, the category name and a database cursor; reusing a
    cursor instead of creating one inside the function is roughly three times
    faster.
    """
    # Avoid mutable default arguments: a shared default list would keep its
    # contents between calls.
    if exclude is None: exclude = []
    if done_cats is None: done_cats = []
    category = make_db_safe(category)
    if category in done_cats: return []
    if category in exclude: return []
    db = language + 'wiki_p'
    limit_sql = ''
    if limit != '': limit_sql = " LIMIT " + limit
    ret = []
    subcats = []
    red = ''
    if no_redirects: red = ' AND page_is_redirect=0'
    if only_redirects: red = ' AND page_is_redirect=1'
    sql = """SELECT page_title,page_namespace,page_id
        FROM page,categorylinks
        WHERE page_id=cl_from AND cl_to="%s" """ % category
    sql += red + limit_sql
    c.execute("USE " + db)
    c.execute(sql)
    result = c.fetchall()
    for o in result:
        page = Page()
        page.title = o[0]
        page.namespace = o[1]
        page.id = o[2]
        # Namespace 14 is the category namespace; a depth below -99 means
        # unlimited recursion.
        if page.namespace == 14 and (depth > 0 or depth < -99):
            subcats.append(o[0])
        if page.namespace != namespace: continue
        if page.title in exclude: continue
        ret.append(page)
    done_cats.append(category)
    for sc in subcats:
        ret2 = db_get_articles_in_category_object(language, sc, c, depth - 1,
                                                  namespace, exclude, done_cats,
                                                  no_redirects, limit, project,
                                                  only_redirects)
        ret.extend(ret2)
    return ret
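

# A minimal usage sketch, not part of the original module. The host name,
# database name and the ~/.my.cnf credentials file are assumptions about a
# typical Toolserver setup; adjust them for your environment. The category
# "Physik" and the depth/limit values are example inputs only.
if __name__ == "__main__":
    conn = MySQLdb.connect(host="dewiki-p.db.toolserver.org",  # assumed host
                           db="dewiki_p",                      # assumed database
                           read_default_file="~/.my.cnf")      # assumed credentials file
    cur = conn.cursor()
    # Fetch up to ten article titles from a German Wikipedia category,
    # skipping redirects and descending one level of subcategories.
    titles = db_get_articles_in_category("de", "Physik", cur, depth=1,
                                         no_redirects=True, limit="10")
    for title in titles:
        print title
    cur.close()
    conn.close()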