Commit

Merge branch 'hotfix/0.9.6' into develop
fabianvf committed Sep 17, 2015
2 parents b02c3a0 + 75725c9, commit ad3ee08
Showing 10 changed files with 137 additions and 24 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -65,3 +65,5 @@ archive/*

# Emacs edited files:
*.*~

*.sublime*
11 changes: 11 additions & 0 deletions CHANGELOG
@@ -2,6 +2,17 @@
ChangeLog
*********

0.9.6 (2015-09-17)
==================
- Elasticsearch now retries requests after connection errors
- Calhoun harvester now ignores the provider's invalid SSL certificate
- OAI URL parser now terminates the regex capture after finding an invalid DOI
  character
- harvester invoke task now sets the default start date to settings.DAYS_BACK
  days before the end date
- scrapi.requests now exposes the requests.exceptions module
- Updated README.md with the new date argument information

0.9.5 (2015-09-14)
==================
- Clinical Trials harvester now dumps lxml elements to dictionaries in
12 changes: 10 additions & 2 deletions README.md
@@ -202,12 +202,20 @@ For local development, running the ```mit``` harvester is recommended.

Note: harvester-name is the same as the defined harvester "short name".

Invoke a harvester a certain number of days back with the ```--days``` argument. For example, to run a harvester 5 days in the past, run:
Invoke a harvester for a certain start date with the ```--start``` or ```-s``` argument. Invoke a harvester for a certain end date with the ```--end``` or ```-e``` argument.

For example, to run a harvester between the dates of March 14th and March 16th 2015, run:

```bash
$ invoke harvester harvester-name --days=5
$ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
```

Either --start or --end can also be used on its own. If neither argument is supplied, the harvest will start ```settings.DAYS_BACK``` days before the current date and end on the current date.

If --end is given with no --start, start will default to ```settings.DAYS_BACK``` days before the given end date.


### Automated OAI PMH Harvester Creation
Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.


64 changes: 50 additions & 14 deletions scrapi/base/helpers.py
@@ -15,7 +15,7 @@
from scrapi import requests


URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
DOI_REGEX = re.compile(r'(doi:10\.\S*)')

''' Takes a value, returns a function that always returns that value
@@ -105,7 +105,13 @@ def format_tags(all_tags, sep=','):
    return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))


def oai_process_uris(*args):
def format_doi_as_url(doi):
    if doi:
        plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
        return 'http://dx.doi.org/{}'.format(plain_doi)


def gather_identifiers(args):
    identifiers = []
    for arg in args:
        if isinstance(arg, list):
@@ -114,28 +120,58 @@ def oai_process_uris(*args):
        elif arg:
            identifiers.append(arg)

    return identifiers


def maybe_group(match):
    '''
    Evaluates a regular expression match object; returns the matched group or None.
    '''
    return match.group() if match else None


def gather_object_uris(identifiers):
    object_uris = []
    provider_uris = []
    for item in identifiers:
        if 'doi' in item.lower():
            doi = item.replace('doi:', '').replace('DOI:', '').strip()
            if 'http://dx.doi.org/' in doi:
                object_uris.append(doi)
            else:
                object_uris.append('http://dx.doi.org/{}'.format(doi))
            url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
            url_doi = maybe_group(url_doi)
            just_doi = maybe_group(just_doi)
            object_uris.append(url_doi or format_doi_as_url(just_doi))

    return object_uris


def seperate_provider_object_uris(identifiers):
    object_uris = gather_object_uris(identifiers)
    provider_uris = []
    for item in identifiers:

        found_url = maybe_group(URL_REGEX.search(item))

        try:
            found_url = URL_REGEX.search(item).group()
        except AttributeError:
            found_url = None
        if found_url:
            if 'viewcontent' in found_url:
                object_uris.append(found_url)
            else:
                provider_uris.append(found_url)
                if 'dx.doi.org' not in found_url:
                    provider_uris.append(found_url)

    return provider_uris, object_uris


def oai_process_uris(*args, **kwargs):
    use_doi = kwargs.get('use_doi', False)

    identifiers = gather_identifiers(args)
    provider_uris, object_uris = seperate_provider_object_uris(identifiers)

    try:
        canonical_uri = (provider_uris + object_uris)[0]
        if use_doi:
            for uri in object_uris:
                if 'dx.doi.org' in uri:
                    canonical_uri = uri
        else:
            canonical_uri = (provider_uris + object_uris)[0]
    except IndexError:
        raise ValueError('No Canonical URI was returned for this record.')

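Taken together, these helpers flatten a record's raw identifiers, pull out DOI-style object links, and split what remains into provider and object URIs. A minimal usage sketch, with made-up identifier strings modeled on the tests below (it assumes the helpers are importable from `scrapi.base.helpers`):

```python
from scrapi.base import helpers

# Raw identifiers as an OAI record might supply them (hypothetical values).
identifiers = [
    ['doi:10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
    'Vahedifard, F. et al. (2013) [http://dx.doi.org/10.1680/geot.11.P.130]',
    'http://GETTHETABLES.com',
]

# Flatten nested lists of identifiers into one list of strings.
flat = helpers.gather_identifiers(identifiers)

# Pull DOI-bearing entries out as normalized http://dx.doi.org/... links.
# The tightened URL_REGEX stops capturing at characters such as ']', so the
# bracketed DOI above comes out without the trailing bracket.
object_uris = helpers.gather_object_uris(flat)

# Split plain URLs into provider URIs vs. object URIs.
provider_uris, object_uris = helpers.seperate_provider_object_uris(flat)

# Or do everything in one call; use_doi=True prefers a dx.doi.org link
# as the canonical URI when one is present.
uri_dict = helpers.oai_process_uris(*identifiers, use_doi=True)
```
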
1 change: 1 addition & 0 deletions scrapi/harvesters/calhoun.py
@@ -14,6 +14,7 @@ class CalhounHarvester(OAIHarvester):
    short_name = 'calhoun'
    long_name = 'Calhoun: Institutional Archive of the Naval Postgraduate School'
    url = 'http://calhoun.nps.edu'
    verify = False

    base_url = 'http://calhoun.nps.edu/oai/request'
    property_list = [
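The new `verify` flag lets a single harvester opt out of SSL certificate verification (Calhoun's certificate is currently invalid). Below is a hypothetical sketch of the pattern, not the actual OAIHarvester internals, just to illustrate how a class-level flag like this is typically threaded through to the HTTP layer:

```python
import requests


class OAIHarvesterSketch(object):
    # Hypothetical stand-in for scrapi's OAIHarvester: verify defaults to True
    # so certificates are checked unless a harvester opts out.
    verify = True

    def fetch(self, url, **params):
        # The per-class flag is forwarded to requests, so only harvesters
        # that set verify = False skip certificate validation.
        return requests.get(url, params=params, verify=self.verify)


class CalhounSketch(OAIHarvesterSketch):
    verify = False  # Calhoun's SSL cert is invalid, so skip verification here
```
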
7 changes: 3 additions & 4 deletions scrapi/harvesters/dataone.py
@@ -11,13 +11,15 @@
from datetime import timedelta, date

from lxml import etree
from functools import partial
from dateutil.parser import parse
from xml.etree import ElementTree

from nameparser import HumanName

from scrapi import requests
from scrapi import settings
from scrapi.base import helpers
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
@@ -145,10 +147,7 @@ class DataOneHarvester(XMLHarvester):
            'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()", lambda x, y: parse(y[0]).date().isoformat() if x else None)
        },
        'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
        'uris': {
            'canonicalUri': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", lambda x, y: y[0] if 'http' in single_result(y) else x[0] if 'http' in single_result(x) else ''),
            'objectUri': ("arr[@name='resourceMap']/str/node()", compose(lambda x: x.replace('doi:', 'http://dx.doi.org/'), single_result))
        },
        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(date_formatter, single_result)),
        'title': ("str[@name='title']/node()", single_result),
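The replacement 'uris' entry leans on `functools.partial` to pre-bind `use_doi=True`, so the schema transformer can keep calling the function with only the XPath results as positional arguments. A small sketch of that pattern (the stand-in function and sample values are hypothetical):

```python
from functools import partial


def process_uris(*args, **kwargs):
    # Stand-in for helpers.oai_process_uris: one positional argument per
    # XPath expression, plus whatever keyword arguments were pre-bound.
    return {'use_doi': kwargs.get('use_doi', False), 'args': args}


# partial() fixes use_doi=True now; the schema machinery later supplies the
# XPath results without needing to know about the extra keyword.
transform = partial(process_uris, use_doi=True)

xpath_results = (['some-id'], ['http://example.com/data'], ['doi:10.1234/abcd'])  # hypothetical
print(transform(*xpath_results))
# {'use_doi': True, 'args': (['some-id'], ['http://example.com/data'], ['doi:10.1234/abcd'])}
```
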
2 changes: 1 addition & 1 deletion scrapi/processing/elasticsearch.py
@@ -22,7 +22,7 @@

try:
    # If we can't connect to Elasticsearch, don't define this class
    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT)
    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT, retry_on_timeout=True)

    # body = {
    #     'mappings': {
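`retry_on_timeout=True` tells the elasticsearch-py client to retry a request whose connection timed out instead of failing immediately. A minimal sketch, with placeholder values standing in for `settings.ELASTIC_URI` and `settings.ELASTIC_TIMEOUT`:

```python
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConnectionError

es = Elasticsearch(
    'http://localhost:9200',   # placeholder for settings.ELASTIC_URI
    request_timeout=30,        # placeholder for settings.ELASTIC_TIMEOUT
    retry_on_timeout=True,     # retry the request when a connection times out
)

try:
    es.info()
except ConnectionError:
    # scrapi skips defining the Elasticsearch processor when the cluster
    # is unreachable; a caller could fall back in a similar way here.
    pass
```
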
1 change: 1 addition & 0 deletions scrapi/requests.py
@@ -13,6 +13,7 @@
import six
import furl
import requests
from requests import exceptions # noqa
from cassandra.cqlengine import columns, models
from requests.structures import CaseInsensitiveDict

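Re-exporting `requests.exceptions` means callers that already use scrapi's request wrapper can catch HTTP errors without also importing the third-party `requests` package. A hedged sketch of the intended usage (the URL is a placeholder, and it assumes `scrapi.requests` exposes a requests-style `get()`):

```python
from scrapi import requests  # scrapi's wrapper module around requests

try:
    response = requests.get('http://example.com/oai/request')  # placeholder URL
except requests.exceptions.ConnectionError:
    # requests.exceptions is re-exported (see the import above), so no extra
    # `import requests` is needed just to name the exception classes.
    response = None
```
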
2 changes: 1 addition & 1 deletion tasks.py
@@ -158,8 +158,8 @@ def harvester(harvester_name, async=False, start=None, end=None):
    if not registry.get(harvester_name):
        raise ValueError('No such harvesters {}'.format(harvester_name))

    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
    end = parse(end).date() if end else date.today()
    start = parse(start).date() if start else end - timedelta(settings.DAYS_BACK)

    run_harvester.delay(harvester_name, start_date=start, end_date=end)

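With the reordered defaults, the end date is resolved first and the default start date now hangs off the chosen end date rather than off today. A small sketch of the resulting behaviour (the DAYS_BACK value here is illustrative; the real one comes from scrapi's settings):

```python
from datetime import date, timedelta
from dateutil.parser import parse

DAYS_BACK = 1  # illustrative; scrapi reads this from settings.DAYS_BACK


def resolve_dates(start=None, end=None):
    # End defaults to today; start defaults to DAYS_BACK days before *end*,
    # so passing only --end harvests the window leading up to that date.
    end = parse(end).date() if end else date.today()
    start = parse(start).date() if start else end - timedelta(DAYS_BACK)
    return start, end


print(resolve_dates(end='2015-03-16'))  # (datetime.date(2015, 3, 15), datetime.date(2015, 3, 16))
print(resolve_dates())                  # (today - DAYS_BACK days, today)
```
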
59 changes: 57 additions & 2 deletions tests/test_helpers.py
@@ -38,13 +38,18 @@ def oai_process_uris(self):

    def test_extract_uris(self):
        identifiers = ['doi:10.whateverwhatever', 'http://alloutofbubblegum.com',
                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com']
                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com',
                       'Vahedifard, F. et al. (2013). G??otechnique 63, No. 6, 451???462 [http://dx.doi.org/10.1680/geot.11.P.130] ',
                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi']

        uri_dict = helpers.oai_process_uris(identifiers)

        assert uri_dict == {
            'canonicalUri': 'http://alloutofbubblegum.com',
            'objectUris': ['http://dx.doi.org/10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
            'objectUris': ['http://dx.doi.org/10.whateverwhatever',
                           'http://dx.doi.org/10.1680/geot.11.P.130',
                           'http://dx.doi.org/10.10.thisisarealdoi',
                           'http://viewcontent.cgi/iamacoolpdf'],
            'providerUris': ['http://alloutofbubblegum.com', 'http://GETTHETABLES.com']
        }

@@ -78,3 +83,53 @@ def test_extract_doi_from_text(self):
        extracted_doi = helpers.extract_doi_from_text(text)

        assert extracted_doi == 'http://dx.doi.org/10.1021/woowoowoo'

    def test_gather_identifiers(self):
        identifiers = [['doi:10.whateverwhatever',
                        'http://viewcontent.cgi/iamacoolpdf'],
                       '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
                       ['http://bubbaray.com', 'http://devon.net']]

        gathered = helpers.gather_identifiers(identifiers)

        assert gathered == ['doi:10.whateverwhatever',
                            'http://viewcontent.cgi/iamacoolpdf',
                            '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
                            'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
                            'http://bubbaray.com',
                            'http://devon.net']

    def test_gather_object_uris(self):
        identifiers = ['doi:10.whateverwhatever',
                       'http://viewcontent.cgi/iamacoolpdf',
                       '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
                       'http://bubbaray.com',
                       'http://devon.net']
        object_uris = helpers.gather_object_uris(identifiers)

        assert object_uris == [
            'http://dx.doi.org/10.whateverwhatever',
            'http://dx.doi.org/10.1680/geot.11.P.130',
            'http://dx.doi.org/10.10.thisisarealdoi'
        ]

    def test_seperate_provider_object_uris(self):
        identifiers = [
            'http://dx.doi.org/10.whateverwhatever',
            'http://cgi.viewcontent.apdf.pdf',
            'http://get_the_tables.net'
        ]

        provider_uris, object_uris = helpers.seperate_provider_object_uris(identifiers)

        assert provider_uris == ['http://get_the_tables.net']
        assert object_uris == ['http://dx.doi.org/10.whateverwhatever', 'http://cgi.viewcontent.apdf.pdf']

    def test_format_doi_as_url(self):
        doi1 = ' doi:10.dudleyzrule '
        doi2 = 'DOI:10.getthetables '

        assert helpers.format_doi_as_url(doi1) == 'http://dx.doi.org/10.dudleyzrule'
        assert helpers.format_doi_as_url(doi2) == 'http://dx.doi.org/10.getthetables'
