diff --git a/.gitignore b/.gitignore
index e4109c0a..a92a9650 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,5 @@
 archive/*
 # Emacs edited files:
 *.*~
+
+*.sublime*
diff --git a/CHANGELOG b/CHANGELOG
index a7a90a7b..ad02c471 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,17 @@
 ChangeLog
 *********

+0.9.6 (2015-09-17)
+==================
+- Elasticsearch now retries requests after connection errors
+- Calhoun harvester now ignores that the SSL cert is invalid
+- OAI URL parser now terminates the regex capture after finding an invalid DOI
+  character
+- harvester invoke task now sets the default start date to settings.DAYS_BACK
+  days before the end date
+- scrapi.requests now exposes the requests.exceptions module
+- Update README.md with updated date information
+
 0.9.5 (2015-09-14)
 ==================
 - Clinical Trials harvester now dumps lxml elements to dicionaries in
diff --git a/README.md b/README.md
index 2597bdac..cc079d37 100644
--- a/README.md
+++ b/README.md
@@ -202,12 +202,20 @@
 For local development, running the ```mit``` harvester is recommended.

 Note: harvester-name is the same as the defined harvester "short name".

-Invoke a harvester a certain number of days back with the ```--days``` argument. For example, to run a harvester 5 days in the past, run:
+Invoke a harvester for a certain start date with the ```--start``` or ```-s``` argument. Invoke a harvester for a certain end date with the ```--end``` or ```-e``` argument.
+
+For example, to run a harvester between March 14th and March 16th, 2015, run:

 ```bash
-$ invoke harvester harvester-name --days=5
+$ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```

+Either --start or --end can also be used on its own. Supplying neither will default to a start date of ```settings.DAYS_BACK``` days before the current date and an end date of the current date.
+
+If --end is given with no --start, start will default to ```settings.DAYS_BACK``` days before the given end date.
+
+
+### Automated OAI-PMH Harvester Creation

 Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.
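The date-defaulting behavior described in the README text above is implemented by the tasks.py hunk near the end of this diff; a minimal sketch of that logic in isolation (not part of the patch; DAYS_BACK here is a made-up stand-in for settings.DAYS_BACK):

    from datetime import date, timedelta
    from dateutil.parser import parse

    DAYS_BACK = 1  # stand-in for settings.DAYS_BACK

    def resolve_dates(start=None, end=None):
        # End defaults to today; start defaults to DAYS_BACK days before the end date.
        end = parse(end).date() if end else date.today()
        start = parse(start).date() if start else end - timedelta(DAYS_BACK)
        return start, end

    # e.g. `invoke harvester mit --end 2015-03-16` resolves to
    # start=2015-03-15, end=2015-03-16 when DAYS_BACK is 1
    print(resolve_dates(end='2015-03-16'))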
diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 1aa18feb..cf663c24 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -15,7 +15,7 @@
 from scrapi import requests


-URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
+URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')

 ''' Takes a value, returns a function that always returns that value
@@ -105,7 +105,13 @@ def format_tags(all_tags, sep=','):
     return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))


-def oai_process_uris(*args):
+def format_doi_as_url(doi):
+    if doi:
+        plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
+        return 'http://dx.doi.org/{}'.format(plain_doi)
+
+
+def gather_identifiers(args):
     identifiers = []
     for arg in args:
         if isinstance(arg, list):
@@ -114,28 +120,58 @@
         elif arg:
             identifiers.append(arg)

+    return identifiers
+
+
+def maybe_group(match):
+    '''
+    evaluates an regular expression match object, returns the group or none
+    '''
+    return match.group() if match else None
+
+
+def gather_object_uris(identifiers):
     object_uris = []
-    provider_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
-            doi = item.replace('doi:', '').replace('DOI:', '').strip()
-            if 'http://dx.doi.org/' in doi:
-                object_uris.append(doi)
-            else:
-                object_uris.append('http://dx.doi.org/{}'.format(doi))
+            url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
+            url_doi = maybe_group(url_doi)
+            just_doi = maybe_group(just_doi)
+            object_uris.append(url_doi or format_doi_as_url(just_doi))
+
+    return object_uris
+
+
+def seperate_provider_object_uris(identifiers):
+    object_uris = gather_object_uris(identifiers)
+    provider_uris = []
+    for item in identifiers:
+
+        found_url = maybe_group(URL_REGEX.search(item))

-        try:
-            found_url = URL_REGEX.search(item).group()
-        except AttributeError:
-            found_url = None
         if found_url:
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)
             else:
-                provider_uris.append(found_url)
+                if 'dx.doi.org' not in found_url:
+                    provider_uris.append(found_url)
+
+    return provider_uris, object_uris
+
+
+def oai_process_uris(*args, **kwargs):
+    use_doi = kwargs.get('use_doi', False)
+
+    identifiers = gather_identifiers(args)
+    provider_uris, object_uris = seperate_provider_object_uris(identifiers)

     try:
-        canonical_uri = (provider_uris + object_uris)[0]
+        if use_doi:
+            for uri in object_uris:
+                if 'dx.doi.org' in uri:
+                    canonical_uri = uri
+        else:
+            canonical_uri = (provider_uris + object_uris)[0]
     except IndexError:
         raise ValueError('No Canonical URI was returned for this record.')
diff --git a/scrapi/harvesters/calhoun.py b/scrapi/harvesters/calhoun.py
index 575ec1b0..2ff1cc25 100644
--- a/scrapi/harvesters/calhoun.py
+++ b/scrapi/harvesters/calhoun.py
@@ -14,6 +14,7 @@ class CalhounHarvester(OAIHarvester):
     short_name = 'calhoun'
     long_name = 'Calhoun: Institutional Archive of the Naval Postgraduate School'
     url = 'http://calhoun.nps.edu'
+    verify = False

     base_url = 'http://calhoun.nps.edu/oai/request'
     property_list = [
diff --git a/scrapi/harvesters/dataone.py b/scrapi/harvesters/dataone.py
index bb756753..e3cb76c9 100644
--- a/scrapi/harvesters/dataone.py
+++ b/scrapi/harvesters/dataone.py
@@ -11,6 +11,7 @@
 from datetime import timedelta, date

 from lxml import etree
+from functools import partial
 from dateutil.parser import parse
 from xml.etree import ElementTree

@@ -18,6 +19,7 @@
 from scrapi import requests
 from scrapi import settings
+from scrapi.base import helpers
 from scrapi.base import XMLHarvester
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument
@@ -145,10 +147,7 @@ class DataOneHarvester(XMLHarvester):
             'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()", lambda x, y: parse(y[0]).date().isoformat() if x else None)
         },
         'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
-        'uris': {
-            'canonicalUri': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", lambda x, y: y[0] if 'http' in single_result(y) else x[0] if 'http' in single_result(x) else ''),
-            'objectUri': ("arr[@name='resourceMap']/str/node()", compose(lambda x: x.replace('doi:', 'http://dx.doi.org/'), single_result))
-        },
+        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
         'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
         'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(date_formatter, single_result)),
         'title': ("str[@name='title']/node()", single_result),
diff --git a/scrapi/processing/elasticsearch.py b/scrapi/processing/elasticsearch.py
index 85afe966..ad922464 100644
--- a/scrapi/processing/elasticsearch.py
+++ b/scrapi/processing/elasticsearch.py
@@ -22,7 +22,7 @@

 try:
     # If we cant connect to elastic search dont define this class
-    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT)
+    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT, retry_on_timeout=True)

     # body = {
     #     'mappings': {
diff --git a/scrapi/requests.py b/scrapi/requests.py
index 6797c954..930897e6 100644
--- a/scrapi/requests.py
+++ b/scrapi/requests.py
@@ -13,6 +13,7 @@

 import six
 import furl
 import requests
+from requests import exceptions  # noqa
 from cassandra.cqlengine import columns, models
 from requests.structures import CaseInsensitiveDict
diff --git a/tasks.py b/tasks.py
index e4adbc1a..33d87b17 100644
--- a/tasks.py
+++ b/tasks.py
@@ -158,8 +158,8 @@ def harvester(harvester_name, async=False, start=None, end=None):
     if not registry.get(harvester_name):
         raise ValueError('No such harvesters {}'.format(harvester_name))

-    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
     end = parse(end).date() if end else date.today()
+    start = parse(start).date() if start else end - timedelta(settings.DAYS_BACK)

     run_harvester.delay(harvester_name, start_date=start, end_date=end)
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 8037090e..eb1fbc9d 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -38,13 +38,18 @@ def oai_process_uris(self):

     def test_extract_uris(self):
         identifiers = ['doi:10.whateverwhatever', 'http://alloutofbubblegum.com',
-                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com']
+                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com',
+                       'Vahedifard, F. et al. (2013). G??otechnique 63, No. 6, 451???462 [http://dx.doi.org/10.1680/geot.11.P.130] ',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi']

         uri_dict = helpers.oai_process_uris(identifiers)

         assert uri_dict == {
             'canonicalUri': 'http://alloutofbubblegum.com',
-            'objectUris': ['http://dx.doi.org/10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
+            'objectUris': ['http://dx.doi.org/10.whateverwhatever',
+                           'http://dx.doi.org/10.1680/geot.11.P.130',
+                           'http://dx.doi.org/10.10.thisisarealdoi',
+                           'http://viewcontent.cgi/iamacoolpdf'],
             'providerUris': ['http://alloutofbubblegum.com', 'http://GETTHETABLES.com']
         }

@@ -78,3 +83,53 @@ def test_extract_doi_from_text(self):
         extracted_doi = helpers.extract_doi_from_text(text)

         assert extracted_doi == 'http://dx.doi.org/10.1021/woowoowoo'
+
+    def test_gather_identifiers(self):
+        identifiers = [['doi:10.whateverwhatever',
+                        'http://viewcontent.cgi/iamacoolpdf'],
+                       '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                       ['http://bubbaray.com', 'http://devon.net']]
+
+        gathered = helpers.gather_identifiers(identifiers)
+
+        assert gathered == ['doi:10.whateverwhatever',
+                            'http://viewcontent.cgi/iamacoolpdf',
+                            '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                            'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                            'http://bubbaray.com',
+                            'http://devon.net']
+
+    def test_gather_object_uris(self):
+        identifiers = ['doi:10.whateverwhatever',
+                       'http://viewcontent.cgi/iamacoolpdf',
+                       '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                       'http://bubbaray.com',
+                       'http://devon.net']
+        object_uris = helpers.gather_object_uris(identifiers)
+
+        assert object_uris == [
+            'http://dx.doi.org/10.whateverwhatever',
+            'http://dx.doi.org/10.1680/geot.11.P.130',
+            'http://dx.doi.org/10.10.thisisarealdoi'
+        ]
+
+    def test_seperate_provider_object_uris(self):
+        identifiers = [
+            'http://dx.doi.org/10.whateverwhatever',
+            'http://cgi.viewcontent.apdf.pdf',
+            'http://get_the_tables.net'
+        ]
+
+        provider_uris, object_uris = helpers.seperate_provider_object_uris(identifiers)
+
+        assert provider_uris == ['http://get_the_tables.net']
+        assert object_uris == ['http://dx.doi.org/10.whateverwhatever', 'http://cgi.viewcontent.apdf.pdf']
+
+    def test_format_doi_as_url(self):
+        doi1 = ' doi:10.dudleyzrule '
+        doi2 = 'DOI:10.getthetables '
+
+        assert helpers.format_doi_as_url(doi1) == 'http://dx.doi.org/10.dudleyzrule'
+        assert helpers.format_doi_as_url(doi2) == 'http://dx.doi.org/10.getthetables'
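As an illustration of how the refactored helpers compose (not part of the patch; the identifier strings below are made-up values echoing the test fixtures above, and the return dict assumes the unchanged tail of oai_process_uris that those tests exercise): oai_process_uris flattens its arguments with gather_identifiers, splits them with seperate_provider_object_uris, and then picks a canonical URI, preferring a dx.doi.org object URI when called with use_doi=True, which is how the DataOne harvester now invokes it via functools.partial.

    from scrapi.base import helpers

    # Hypothetical OAI identifier values, similar to the test fixtures above
    identifiers = ['doi:10.whateverwhatever', 'http://alloutofbubblegum.com',
                   'http://viewcontent.cgi/iamacoolpdf']

    uri_dict = helpers.oai_process_uris(identifiers)
    # {'canonicalUri': 'http://alloutofbubblegum.com',
    #  'objectUris': ['http://dx.doi.org/10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
    #  'providerUris': ['http://alloutofbubblegum.com']}

    # With use_doi=True, a dx.doi.org object URI is preferred as the canonical URI when one exists
    doi_dict = helpers.oai_process_uris(identifiers, use_doi=True)
    # doi_dict['canonicalUri'] == 'http://dx.doi.org/10.whateverwhatever'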