Skip to content

Commit

Permalink
Updates version to 0.1.0. Tests API listener and state storage. Minim…
Browse files Browse the repository at this point in the history
…al documentation.
  • Loading branch information
halfak committed Aug 3, 2014
1 parent a2d5974 commit d85e6df
Show file tree
Hide file tree
Showing 11 changed files with 303 additions and 38 deletions.
39 changes: 38 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,40 @@
MediaWiki events
================
Wiki-tool builders & researchers rely on various sources of information about what has happened and is currently happening in Wikipedia. These data sources tend to be structured differently and contain incomplete or poorly structured information. Some data sources are queryable, but require extra complexity to "listen" to ongoing events, while others are intended only to be used to "listen" to current events. *MediaWiki events* is designed to minimize the frustration involved in processing MediaWiki's events.
Wiki-tool builders & researchers rely on various sources of information about what has happened and is currently happening in Wikipedia. These data sources tend to be structured differently and contain incomplete or poorly structured information. Some data sources are queryable, but require extra complexity to "listen" to ongoing events, while others are intended only to be used to "listen" to current events. *MediaWiki events* is designed to minimize the frustration involved in processing MediaWiki's events.


**Install with pip:** ``pip install mwevents``

**Note:** *Use of this library requires Python 3 or later.*

**Documentation:** *Coming soon!*

:Example:

.. code-block:: python

    from mwevents.sources import API
    from mwevents import RevisionSaved, PageCreated

    api_source = API.from_api_url("http://en.wikipedia.org/w/api.php")
    listener = api_source.listener(events={RevisionSaved, PageCreated})
    for event in listener:
        if isinstance(event, RevisionSaved):
            print(event.revision)
        else: # isinstance(event, PageCreated):
            print(event.page)

About the author
================
:name:
Aaron Halfaker
:email:
[email protected]
:website:
http://halfaker.info --
http://en.wikipedia.org/wiki/User:EpochFail

Contributors
============
None yet. See http://github.com/halfak/MediaWiki-events. Pull requests are encouraged.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.0.3
0.1.0
41 changes: 34 additions & 7 deletions examples/listen.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
"""
Listens to a wiki's recentchanges feed.
Listens to a wiki's recentchanges feed and prints standardized JSON events.
Usage:
listen <api_url>
listen <api_url> [--revisions-only] [--store-state=<path>]
Options:
<api_url> The url for the MediaWiki API to connect to.
<api_url> The url for the MediaWiki API to connect to.
--revisions-only Only print RevisionSaved events.
--store-state=<path> A file location to read and store read state to.
"""
import json
import os.path
import pprint
import sys

Expand All @@ -15,25 +19,48 @@
try:
sys.path.insert(0, ".")
from mwevents.sources import API
from mwevents import RevisionSaved, StateMarker
except:
raise

def main():
    """
    Command-line entry point.

    Parses the arguments described in this module's docstring with docopt and
    hands the API URL, the ``--revisions-only`` flag and the
    ``--store-state`` path to :func:`run`.
    """
    args = docopt(__doc__)

    run(args['<api_url>'], args['--revisions-only'], args['--store-state'])

def run(api_url, revisions_only, store_state):
    """
    Listens to a wiki's recent changes and pretty-prints each event as JSON.

    :Parameters:
        api_url : str
            URL of the MediaWiki API (api.php) to connect to.
        revisions_only : bool
            When true, only ``RevisionSaved`` events are requested; otherwise
            no event filter is applied.
        store_state : str
            Path of a file used to load the starting state marker and to
            store the listener's state marker after every printed event.
            ``None`` disables state storage.

    A ``KeyboardInterrupt`` ends the loop with a short message instead of a
    traceback.
    """
    api_source = API.from_api_url(api_url)

    # ``None`` means "do not filter by event type".
    events = {RevisionSaved} if revisions_only else None

    state_marker = load_state_marker(store_state)

    try:
        listener = api_source.listener(state_marker=state_marker,
                                       events=events)
        for event in listener:

            pprint.pprint(event.to_json())

            # Persist progress after each event so that a restart can resume
            # from the last event that was printed.
            if store_state is not None:
                with open(store_state, "w") as f:
                    json.dump(listener.state_marker.to_json(), f)

    except KeyboardInterrupt:
        print("Keyboard interrupt received. Shutting down.")


def load_state_marker(path):
    """
    Loads a previously stored state marker from a JSON file.

    :Parameters:
        path : str
            Location of the state file, or ``None`` when no state is stored.

    :Returns:
        A ``StateMarker`` built from the file's JSON content, or ``None`` when
        `path` is ``None``, the file does not exist, or reading it raises a
        ``ValueError`` (for example because the file does not contain valid
        JSON).
    """
    if path is None or not os.path.exists(path):
        return None

    try:
        # The context manager closes the file even when parsing fails.
        with open(path) as f:
            doc = json.load(f)
        return StateMarker(doc)
    except ValueError:
        # Best effort: an unreadable state file means "start without state".
        return None

if __name__ == "__main__": main()
3 changes: 3 additions & 0 deletions mwevents/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

from .types import *
from .types.events import *
137 changes: 110 additions & 27 deletions mwevents/sources/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,48 +2,131 @@

from mw import api

from ..types import Event
from ..types import StateMarker, Timestamp
from ..types.events import Event, Match


class API:
"""
Example:
.. code-block::python
"""
RC_PROPS = {'user', 'userid', 'comment', 'timestamp', 'title', 'ids',
'sizes', 'loginfo', 'sha1'}
class RCListener:
    """
    Iterable of events built from a wiki's recentchanges feed.

    Iterating repeatedly queries ``session.recent_changes`` and yields the
    events constructed from each returned change document, skipping documents
    that are not after the current state marker.  The state marker is updated
    as documents are processed and can be read from ``state_marker`` at any
    time.

    :Parameters:
        session
            API session providing ``recent_changes._query(...)``.
        state_marker
            Marker of the last change already seen.  Its ``last_event`` is
            used as the ``start`` of the query.
        events
            Set of event classes to yield, or ``None`` for all events.
        min_wait : float
            Minimum number of seconds between requests once a request
            returned fewer than `rcs_per_request` documents.
        rcs_per_request : int
            Number of change documents to ask for per request.
        stop : callable
            Called before every request; iteration ends when it returns a
            true value.
    """

    def __init__(self, session, *, state_marker, events,
                 min_wait, rcs_per_request, stop):
        self.session = session
        self.state_marker = state_marker
        self.events = events
        self.min_wait = min_wait
        self.rcs_per_request = rcs_per_request
        self.stop = stop
        self.kwargs = {
            'limit': rcs_per_request,
            'properties': API.RC_EVENT_PROPS,
            'direction': "newer",
            'start': self.state_marker.last_event
        }

        # Only request the recentchanges types that can produce one of the
        # requested events.
        if self.events is None:
            self.kwargs['type'] = set(m.rc_type
                                      for m in Event.MATCH_GROUPS.keys())
        else:
            self.kwargs['type'] = set(m.rc_type
                                      for e in self.events
                                      for m in e.MATCHES)

    def __iter__(self):
        while not self.stop():
            start = time.time()

            # The continuation value is fed back into the next request.
            rc_docs, self.kwargs['rccontinue'] = \
                self.session.recent_changes._query(**self.kwargs)

            for rc_doc in rc_docs:
                if self.state_marker.is_after(Timestamp(rc_doc['timestamp']),
                                              rc_doc.get('rcid'),
                                              rc_doc.get('revid'),
                                              rc_doc.get('logid')):

                    for event in Event.from_rc_doc(rc_doc):
                        if self.events is None or type(event) in self.events:
                            yield event

                    self.state_marker.update(Timestamp(rc_doc['timestamp']),
                                             rc_doc.get('rcid'),
                                             rc_doc.get('revid'),
                                             rc_doc.get('logid'))

            # A short batch means we have caught up: wait out the remainder
            # of ``min_wait``.  The delay is clamped at zero because
            # ``time.sleep`` rejects negative values when the request itself
            # took longer than ``min_wait``.
            if len(rc_docs) < self.rcs_per_request:
                elapsed = time.time() - start
                time.sleep(max(0.0, self.min_wait - elapsed))


class API:
    """
    Constructs a source of :class:`mwevents.Event` that connects to a MediaWiki
    API (api.php).
    """
    RC_EVENT_PROPS = {'user', 'userid', 'comment', 'timestamp', 'title', 'ids',
                      'sizes', 'loginfo', 'sha1'}

    def __init__(self, session):
        self.session = session

    def listener(self, state_marker=None, events=None, min_wait=5,
                 rcs_per_request=100, direction="newer",
                 properties=RC_EVENT_PROPS, stop=lambda: False):
        """
        Constructs a listener that iterates over events from this API.

        :Parameters:
            state_marker
                Where to resume from.  ``None`` starts with an empty
                ``StateMarker``; any other value is passed to ``StateMarker``.
            events
                Iterable of event classes to listen for, or ``None`` for all
                events.
            min_wait : float
                Minimum number of seconds between requests.
            rcs_per_request : int
                Number of recent changes to ask for per request.
            direction, properties
                Accepted for compatibility; they are currently not forwarded
                to the listener.
            stop : callable
                Called by the listener to decide when to stop.

        :Raises:
            TypeError
                If `stop` is not callable.

        :Example:
            .. code-block:: python

                import sys

                from mwevents.sources import API
                from mwevents import RevisionSaved, PageCreated

                API_URL = "http://en.wikipedia.org/w/api.php"

                try:
                    api_source = API.from_api_url(API_URL)
                    listener = api_source.listener(events={RevisionSaved,
                                                           PageCreated})
                    for event in listener:
                        if isinstance(event, RevisionSaved):
                            print("Revision {0} of {1} saved by {2}."\
                                  .format(event.revision.id,
                                          event.revision.page_id,
                                          event.user))
                        else: # isinstance(event, PageCreated):
                            print("Page {0}:{1} created by {2}."\
                                  .format(event.page.namespace,
                                          event.page.title,
                                          event.user))

                except KeyboardInterrupt:
                    sys.stderr.write("Keyboard Interrupt caught. " + \
                                     "Shutting down.\n")
                    sys.stderr.write(str(listener.state_marker.to_json()) + "\n")
        """
        state_marker = StateMarker(state_marker) \
                       if state_marker is not None else StateMarker()

        events = set(events) if events is not None else None

        min_wait = float(min_wait)
        rcs_per_request = int(rcs_per_request)

        if not callable(stop):
            raise TypeError("'stop' must be a callable function")

        return RCListener(self.session,
                          state_marker=state_marker,
                          events=events,
                          min_wait=min_wait,
                          rcs_per_request=rcs_per_request,
                          stop=stop)

    def query(self, *args, **kwargs):
        """Not implemented yet; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    @classmethod
    def from_api_url(cls, url):
        """Constructs an :class:`API` from the URL of a wiki's api.php."""
        return cls(api.Session(url))
9 changes: 9 additions & 0 deletions mwevents/sources/source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@


class Source:
    """
    Interface for event sources.

    Both methods are placeholders that raise ``NotImplementedError``;
    concrete sources are expected to override them.
    """

    def listen(self, *args, **kwargs):
        """Placeholder for listening to ongoing events."""
        raise NotImplementedError()

    def query(self, start, end, *args, types=None, **kwargs):
        """Placeholder for querying events between `start` and `end`."""
        raise NotImplementedError()
2 changes: 1 addition & 1 deletion mwevents/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
from .unavailable import Unavailable, UnavailableType
from .user import User
from .page import Page
from .events import Event
from .state_marker import StateMarker
1 change: 1 addition & 0 deletions mwevents/types/events/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .event import Event
from .match import Match
from .page_created import PageCreated
from .page_deleted import PageDeleted
from .page_moved import PageMoved
Expand Down
42 changes: 42 additions & 0 deletions mwevents/types/state_marker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

from jsonable import JSONable

from .timestamp import Timestamp


class StateMarker(JSONable):
    """
    Position of the most recently processed change.

    Holds the timestamp of the last event together with the last seen
    recentchanges, revision and log identifiers.  Any of the four fields may
    be ``None`` when nothing has been recorded for it yet.
    """
    __slots__ = ("last_event", "last_rev_id", "last_rc_id", "last_log_id")

    def initialize(self, last_event=None, last_rc_id=None,
                   last_rev_id=None, last_log_id=None):
        """
        Sets the four fields, converting `last_event` to a ``Timestamp`` and
        the identifiers to ``int``.  ``None`` values are stored unchanged.
        """
        self.last_event = \
            None if last_event is None else Timestamp(last_event)
        self.last_rc_id = \
            None if last_rc_id is None else int(last_rc_id)
        self.last_rev_id = \
            None if last_rev_id is None else int(last_rev_id)
        self.last_log_id = \
            None if last_log_id is None else int(last_log_id)

    def update(self, timestamp, rc_id, rev_id, log_id):
        """
        Records a newly processed change.  A falsy argument (e.g. ``None``)
        leaves the corresponding field at its current value.
        """
        self.last_event = timestamp if timestamp else self.last_event
        self.last_rc_id = rc_id if rc_id else self.last_rc_id
        self.last_rev_id = rev_id if rev_id else self.last_rev_id
        self.last_log_id = log_id if log_id else self.last_log_id

    def is_after(self, timestamp, rc_id, rev_id, log_id):
        """
        Decides whether a change lies after this marker.

        A change is after the marker when its timestamp is greater than
        ``last_event``, or when the timestamp equals ``last_event`` (or no
        ``last_event`` is recorded) and at least one of the given identifiers
        is greater than the corresponding stored identifier.  A missing
        stored identifier counts as 0.
        """
        timestamp = Timestamp(timestamp)

        # Strictly newer timestamp: no need to look at the identifiers.
        if self.last_event is not None and timestamp > self.last_event:
            return True

        # Older timestamp: the identifiers cannot make it newer.
        if not (self.last_event is None or timestamp == self.last_event):
            return False

        # Same timestamp (or nothing recorded yet): fall back to identifiers.
        newer_rc = rc_id is not None and rc_id > (self.last_rc_id or 0)
        if newer_rc:
            return True

        newer_rev = rev_id is not None and rev_id > (self.last_rev_id or 0)
        if newer_rev:
            return True

        return log_id is not None and log_id > (self.last_log_id or 0)
Loading

0 comments on commit d85e6df

Please sign in to comment.