
Commit 8c91643

Merge pull request CenterForOpenScience#369 from CenterForOpenScience/feature/doctests

Add support for doctests

erinspace committed Sep 17, 2015
2 parents: ad3ee08 + a962a13
Showing 18 changed files with 151 additions and 133 deletions.
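Note: the doctests added below can be exercised with the standard library runner. A minimal sketch, assuming it is run from the repository root (this invocation is illustrative and not part of the diff; the project's own test-runner wiring is in files not shown here):

    import doctest

    from scrapi.base import helpers

    # Collects every '>>>' example from the module's docstrings, runs each one,
    # and compares the actual output against the expected output shown.
    doctest.testmod(helpers, verbose=True)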
4 changes: 2 additions & 2 deletions scrapi/base/__init__.py
@@ -21,7 +21,7 @@
     build_properties,
     oai_get_records_and_token,
     compose,
-    date_formatter,
+    datetime_formatter,
     null_on_error,
     coerce_to_list
 )
@@ -152,7 +152,7 @@ def formatted_properties(self):
 
     def format_property(self, property):
         if property == 'date':
-            fn = compose(lambda x: map(null_on_error(date_formatter), x), coerce_to_list, self.resolve_property)
+            fn = compose(lambda x: map(null_on_error(datetime_formatter), x), coerce_to_list, self.resolve_property)
         else:
             fn = self.resolve_property
         return (property, (
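Note: null_on_error is imported and used above but defined outside the changed hunks. A plausible sketch of the decorator it names, assuming it simply converts exceptions to None (the real body in scrapi/base/helpers.py may differ):

    def null_on_error(func):
        # Wrap func so a failing call returns None instead of raising, which
        # lets map(null_on_error(datetime_formatter), x) skip unparseable dates.
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception:
                return None
        return wrapper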
117 changes: 93 additions & 24 deletions scrapi/base/helpers.py
@@ -18,9 +18,19 @@
 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
 
-''' Takes a value, returns a function that always returns that value
-Useful inside schemas for defining constants '''
-CONSTANT = lambda x: lambda *_, **__: x
+
+def CONSTANT(x):
+    ''' Takes a value, returns a function that always returns that value
+    Useful inside schemas for defining constants
+
+    >>> CONSTANT(7)('my', 'name', verb='is')
+    7
+    >>> CONSTANT([123, 456])()
+    [123, 456]
+    '''
+    def inner(*y, **z):
+        return x
+    return inner
 
 
 def build_properties(*args):
@@ -48,15 +58,37 @@ def build_property(name, expr, description=None, uri=None):
 
 
 def single_result(l, default=''):
-    return l[0] if l else default
+    ''' A function that will return the first element of a list if it exists
+
+    >>> print(single_result(['hello', None]))
+    hello
+    >>> print(single_result([], default='hello'))
+    hello
+    >>> print(single_result([]))
+    <BLANKLINE>
+    '''
+    return l[0] if l else default
 
 
 def compose(*functions):
-    '''
-    evaluates functions from right to left.
-    ex. compose(f, g)(*x, **y) = f(g(*x, **y))
-
-    credit to sloria
-    '''
+    ''' evaluates functions from right to left.
+
+    >>> add = lambda x, y: x + y
+    >>> add3 = lambda x: x + 3
+    >>> divide2 = lambda x: x/2
+    >>> subtract4 = lambda x: x - 4
+    >>> subtract1 = compose(add3, subtract4)
+    >>> subtract1(1)
+    0
+    >>> compose(subtract1, add3)(4)
+    6
+    >>> compose(int, add3, add3, divide2)(4)
+    8
+    >>> compose(int, divide2, add3, add3)(4)
+    5
+    >>> compose(int, divide2, compose(add3, add3), add)(7, 3)
+    8
+    '''
    def inner(func1, func2):
        return lambda *x, **y: func1(func2(*x, **y))
@@ -65,8 +97,18 @@ def inner(func1, func2):
 
 
 def updated_schema(old, new):
     ''' Creates a dictionary resulting from adding all keys/values of the second to the first
-    The second dictionary will overwrite the first.'''
+    The second dictionary will overwrite the first.
+
+    >>> old, new = {'name': 'ric', 'job': None}, {'name': 'Rick'}
+    >>> updated = updated_schema(old, new)
+    >>> len(updated.keys())
+    2
+    >>> print(updated['name'])
+    Rick
+    >>> updated['job'] is None
+    True
+    '''
     d = deepcopy(old)
     for key, value in new.items():
         if isinstance(value, dict) and old.get(key) and isinstance(old[key], dict):
@@ -77,18 +119,28 @@ def default_name_parser(names):
 
 
 def default_name_parser(names):
-    contributor_list = []
-    for person in names:
-        name = HumanName(person)
-        contributor = {
-            'name': person,
-            'givenName': name.first,
-            'additionalName': name.middle,
-            'familyName': name.last,
-        }
-        contributor_list.append(contributor)
-
-    return contributor_list
+    ''' Takes a list of names, and attempts to parse them
+    '''
+    return list(map(maybe_parse_name, names))
+
+
+def maybe_parse_name(name):
+    ''' Tries to parse a name. If the parsing fails, returns a dictionary
+    with just the unparsed name (as per the SHARE schema)
+    '''
+    return null_on_error(parse_name)(name) or {'name': name}
+
+
+def parse_name(name):
+    ''' Takes a human name, parses it into given/middle/last names
+    '''
+    person = HumanName(name)
+    return {
+        'name': name,
+        'givenName': person.first,
+        'additionalName': person.middle,
+        'familyName': person.last
+    }
 
 
 def format_tags(all_tags, sep=','):
@@ -278,17 +330,34 @@ def inner(*args, **kwargs):
 
 
 def coerce_to_list(thing):
-    ''' If a value is not already a list or tuple, puts that value in a length 1 list'''
+    ''' If a value is not already a list or tuple, puts that value in a length 1 list
+
+    >>> niceties = coerce_to_list('hello')
+    >>> len(niceties)
+    1
+    >>> print(niceties[0])
+    hello
+    >>> niceties2 = coerce_to_list(['hello'])
+    >>> niceties2 == niceties
+    True
+    >>> niceties3 = (coerce_to_list(('hello', 'goodbye')))
+    >>> len(niceties3)
+    2
+    >>> print(niceties3[0])
+    hello
+    >>> print(niceties3[1])
+    goodbye
+    '''
     if not (isinstance(thing, list) or isinstance(thing, tuple)):
         return [thing]
-    return thing
+    return list(thing)
 
 
-def date_formatter(date_string):
+def datetime_formatter(datetime_string):
     '''Takes an arbitrary date/time string and parses it, adds time
     zone information and returns a valid ISO-8601 datetime string
     '''
-    date_time = parser.parse(date_string)
+    date_time = parser.parse(datetime_string)
     if not date_time.tzinfo:
         date_time = date_time.replace(tzinfo=pytz.UTC)
     return date_time.isoformat()
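Note: per the code above, datetime_formatter normalizes naive timestamps to UTC and preserves an explicit offset; the values below are illustrative, not part of the diff:

    >>> datetime_formatter('September 17, 2015')
    '2015-09-17T00:00:00+00:00'
    >>> datetime_formatter('2015-09-17T10:30:00-04:00')
    '2015-09-17T10:30:00-04:00'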
6 changes: 3 additions & 3 deletions scrapi/base/schemas.py
@@ -5,7 +5,7 @@
     format_tags,
     single_result,
     language_codes,
-    date_formatter,
+    datetime_formatter,
     oai_process_uris,
     build_properties,
     default_name_parser,
@@ -17,7 +17,7 @@
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
     "contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
     "title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
-    "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(date_formatter, single_result)),
+    "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
     "uris": {
         "canonicalUri": ('//dcq:identifier-citation/node()', compose(lambda x: x.strip(), single_result)),
         "objectUris": [('//dc:doi/node()', compose(lambda x: 'http://dx.doi.org/' + x, single_result))]
@@ -53,7 +53,7 @@
 OAISCHEMA = {
     "contributors": ('//dc:creator/node()', '//dc:contributor/node()', oai_process_contributors),
     "uris": ('//dc:doi/node()', '//dc:identifier/node()', oai_process_uris),
-    'providerUpdatedDateTime': ('//ns0:header/ns0:datestamp/node()', compose(date_formatter, single_result)),
+    'providerUpdatedDateTime': ('//ns0:header/ns0:datestamp/node()', compose(datetime_formatter, single_result)),
     'title': ('//dc:title/node()', single_result),
     'description': ('//dc:description/node()', single_result),
     'subjects': ('//dc:subject/node()', format_tags),
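Note: each schema entry pairs an XPath expression with a transform, so compose(datetime_formatter, single_result) first collapses the XPath node list to a single string, then normalizes it. A worked example under the helpers above (values illustrative):

    >>> compose(datetime_formatter, single_result)(['2015-09-17'])
    '2015-09-17T00:00:00+00:00'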
4 changes: 2 additions & 2 deletions scrapi/harvesters/biomedcentral.py
@@ -18,7 +18,7 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import build_properties, date_formatter
+from scrapi.base.helpers import build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -65,7 +65,7 @@ def schema(self):
                 'providerUris': ('/articleFullUrl', '/abstractPath', lambda x, y: [x, y])
             },
             'title': ('/bibliographyTitle', '/blurbTitle', lambda x, y: x or y),
-            'providerUpdatedDateTime': ('/published Date', date_formatter),
+            'providerUpdatedDateTime': ('/published Date', datetime_formatter),
             'description': '/blurbText',
             'freeToRead': {
                 'startDate': ('/is_free', '/published Date', lambda x, y: y if x else None)
4 changes: 2 additions & 2 deletions scrapi/harvesters/clinicaltrials.py
@@ -21,7 +21,7 @@
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument
 from scrapi.base.schemas import default_name_parser
-from scrapi.base.helpers import compose, single_result, build_properties, date_formatter
+from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -49,7 +49,7 @@ class ClinicalTrialsHarvester(XMLHarvester):
         "uris": {
             "canonicalUri": ("//required_header/url/node()", single_result)
         },
-        "providerUpdatedDateTime": ("lastchanged_date/node()", compose(date_formatter, single_result)),
+        "providerUpdatedDateTime": ("lastchanged_date/node()", compose(datetime_formatter, single_result)),
         "title": ('//official_title/node()', '//brief_title/node()', lambda x, y: single_result(x) or single_result(y)),
         "description": ('//brief_summary/textblock/node()', '//brief_summary/textblock/node()', lambda x, y: single_result(x) or single_result(y)),
         "tags": ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
4 changes: 2 additions & 2 deletions scrapi/harvesters/crossref.py
@@ -18,7 +18,7 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import build_properties, compose, date_formatter
+from scrapi.base.helpers import build_properties, compose, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -72,7 +72,7 @@ def schema(self):
         return {
             'title': ('/title', lambda x: x[0] if x else ''),
             'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
-            'providerUpdatedDateTime': ('/issued/date-parts', compose(date_formatter, lambda x: ' '.join([str(part) for part in x[0]]))),
+            'providerUpdatedDateTime': ('/issued/date-parts', compose(datetime_formatter, lambda x: ' '.join([str(part) for part in x[0]]))),
             'uris': {
                 'canonicalUri': '/URL'
             },
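Note: Crossref serves issued dates as nested date-parts (e.g. [[2015, 9, 17]]), so the lambda above flattens the first entry into a space-separated string before datetime_formatter parses it. A worked example (values illustrative):

    >>> ' '.join([str(part) for part in [[2015, 9, 17]][0]])
    '2015 9 17'
    >>> datetime_formatter('2015 9 17')
    '2015-09-17T00:00:00+00:00'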
19 changes: 2 additions & 17 deletions scrapi/harvesters/dataone.py
@@ -5,8 +5,6 @@
 
 from __future__ import unicode_literals
 
-import re
-
 import logging
 from datetime import timedelta, date
 
@@ -23,27 +21,14 @@
 from scrapi.base import XMLHarvester
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import compose, single_result, build_properties, date_formatter
+from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
 DEFAULT_ENCODING = 'UTF-8'
 DATAONE_SOLR_ENDPOINT = 'https://cn.dataone.org/cn/v1/query/solr/'
 
 
-def process_doi(service_id, doc_doi):
-    doi_re = '10\\.\\d{4}/\\w*\\.\\w*(/\\w*)?'
-
-    doi_list = map(lambda x: x.replace('doi', ''), doc_doi) if isinstance(doc_doi, list) else [doc_doi.replace('doi', '')]
-
-    for item in [service_id] + doi_list:
-        try:
-            return re.search(doi_re, item).group(0)
-        except AttributeError:
-            continue
-    return ''
-
-
 def process_contributors(author, submitters, contributors,
                          investigators):
     if not author:
@@ -149,7 +134,7 @@ class DataOneHarvester(XMLHarvester):
         'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
         'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
-        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(date_formatter, single_result)),
+        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(datetime_formatter, single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
     }
14 changes: 0 additions & 14 deletions scrapi/harvesters/dryad.py
@@ -14,20 +14,6 @@
 logger = logging.getLogger(__name__)
 
 
-def format_dois_dryad(*args):
-    prefix = 'http://dx.doi.org/{}'
-    urls = []
-    for arg in args:
-        if isinstance(arg, list):
-            for url in arg:
-                if 'doi:' in url:
-                    urls.append(prefix.format(url.replace('doi:', '')))
-        elif arg:
-            if 'doi:' in arg:
-                urls.append(prefix.format(arg.replace('doi:', '')))
-    return urls
-
-
 class DryadHarvester(OAIHarvester):
     short_name = 'dryad'
     long_name = 'Dryad Data Repository'
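Note: for reference, the deleted helper rewrote doi: identifiers into resolvable URLs, behavior presumably now covered by the shared URI helpers used elsewhere in this diff. Reconstructed from the removed code (values illustrative):

    >>> format_dois_dryad('doi:10.5061/dryad.123', ['doi:10.5061/dryad.456'])
    ['http://dx.doi.org/10.5061/dryad.123', 'http://dx.doi.org/10.5061/dryad.456']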
4 changes: 2 additions & 2 deletions scrapi/harvesters/figshare.py
@@ -18,7 +18,7 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import default_name_parser, build_properties, date_formatter
+from scrapi.base.helpers import default_name_parser, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -34,7 +34,7 @@ class FigshareHarvester(JSONHarvester):
         'title': '/title',
         'description': '/description',
         'contributors': ('/authors', lambda x: default_name_parser([person['author_name'] for person in x])),
-        'providerUpdatedDateTime': ('/modified_date', date_formatter),
+        'providerUpdatedDateTime': ('/modified_date', datetime_formatter),
         'uris': {
             'canonicalUri': ('/DOI', lambda x: x[0] if isinstance(x, list) else x),
             'providerUris': [
6 changes: 3 additions & 3 deletions scrapi/harvesters/harvarddataverse.py
@@ -17,14 +17,14 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import default_name_parser, build_properties, date_formatter
+from scrapi.base.helpers import default_name_parser, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
 
 try:
     from scrapi.settings import HARVARD_DATAVERSE_API_KEY
-except ImportError:
+except ImportError:  # pragma: no cover
     HARVARD_DATAVERSE_API_KEY = None
     logger.error('No HARVARD_DATAVERSE_API_KEY found, Harvard Dataverse will always return []')
 
@@ -44,7 +44,7 @@ class HarvardDataverseHarvester(JSONHarvester):
         'title': '/name',
         'description': '/description',
         'contributors': ('/authors', default_name_parser),
-        'providerUpdatedDateTime': ('/published_at', date_formatter),
+        'providerUpdatedDateTime': ('/published_at', datetime_formatter),
         'uris': {
             'canonicalUri': '/url',
             'objectUris': [
[Diff truncated by the page viewer: 8 of the 18 changed files are not shown above.]
