From 660078d2c8873288e99720d179e4d43b167fa713 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Fri, 11 Sep 2015 12:00:24 -0400
Subject: [PATCH 01/22] Make start and end date parsing more sane

closes [#SHARE-31]

---
 tasks.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tasks.py b/tasks.py
index e4adbc1a..0c610328 100644
--- a/tasks.py
+++ b/tasks.py
@@ -158,8 +158,23 @@ def harvester(harvester_name, async=False, start=None, end=None):
     if not registry.get(harvester_name):
         raise ValueError('No such harvesters {}'.format(harvester_name))

-    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
-    end = parse(end).date() if end else date.today()
+    if end:
+        if parse(end).date() > date.today():
+            logger.info('End date is in the future, defaulting to today!')
+            end = date.today()
+        else:
+            end = parse(end).date()
+    else:
+        end = date.today()
+
+    if start:
+        if parse(start).date() > end:
+            logger.info('Start date is after end date, defaulting to standard start')
+            start = end - timedelta(settings.DAYS_BACK)
+        else:
+            start = parse(start).date()
+    else:
+        start = end - timedelta(settings.DAYS_BACK)

     run_harvester.delay(harvester_name, start_date=start, end_date=end)

From f0bfcde0ce2e98320e59a9db6193b912b6626cdc Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 15:29:56 -0400
Subject: [PATCH 02/22] Make requests.exceptions. work as expected for scrapi.requests

[#SHARE-32]

---
 scrapi/requests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapi/requests.py b/scrapi/requests.py
index 6797c954..930897e6 100644
--- a/scrapi/requests.py
+++ b/scrapi/requests.py
@@ -13,6 +13,7 @@
 import six
 import furl
 import requests
+from requests import exceptions  # noqa
 from cassandra.cqlengine import columns, models
 from requests.structures import CaseInsensitiveDict

From 61f6f7dc9a35fa050631c6945b94f9c2a048fdfd Mon Sep 17 00:00:00 2001
From: erinspace
Date: Wed, 16 Sep 2015 15:47:50 -0400
Subject: [PATCH 03/22] Fix URL parsing when given a paragraph with a url

closes [#SHARE-54]

Before, a paragraph of text with a valid URL inside of it would return
with the last square bracket included, which was throwing errors in some
harvesters.
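As a quick illustration (an editor's sketch, not part of the patch: the two
patterns are copied from the diff below, and the sample string is invented):

```python
import re

# Pattern before this patch:
OLD_URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
# Pattern after this patch; closing brackets and similar characters
# are excluded from the tail of the match:
NEW_URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')

text = 'See [http://dx.doi.org/10.1680/geot.11.P.130] for details'

print(OLD_URL_REGEX.search(text).group())
# http://dx.doi.org/10.1680/geot.11.P.130]  <- stray trailing bracket
print(NEW_URL_REGEX.search(text).group())
# http://dx.doi.org/10.1680/geot.11.P.130
```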
This improves the URL regex and captures more DOIs

---
 scrapi/base/helpers.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 1aa18feb..bbbcf366 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -15,7 +15,7 @@
 from scrapi import requests

-URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
+URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')

 ''' Takes a value, returns a function that always returns that value
@@ -118,11 +118,15 @@ def oai_process_uris(*args):
     provider_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
-            doi = item.replace('doi:', '').replace('DOI:', '').strip()
-            if 'http://dx.doi.org/' in doi:
-                object_uris.append(doi)
-            else:
-                object_uris.append('http://dx.doi.org/{}'.format(doi))
+            try:
+                found_url_doi = URL_REGEX.search(item).group()
+                object_uris.append(found_url_doi)
+            except AttributeError:
+                try:
+                    just_doi = DOI_REGEX.search(item).group()
+                    object_uris.append('http://dx.doi.org/{}'.format(just_doi.replace('doi:', '').replace('DOI:', '').strip()))
+                except AttributeError:
+                    pass

         try:
             found_url = URL_REGEX.search(item).group()
@@ -132,7 +136,8 @@ def oai_process_uris(*args):
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)
             else:
-                provider_uris.append(found_url)
+                if 'dx.doi.org' not in found_url:
+                    provider_uris.append(found_url)

     try:
         canonical_uri = (provider_uris + object_uris)[0]

From da353711709ed0ca34ba4a9cc4dc82b7ff60ca2e Mon Sep 17 00:00:00 2001
From: erinspace
Date: Wed, 16 Sep 2015 15:50:25 -0400
Subject: [PATCH 04/22] Fix tests to reflect more accurate URL and DOI parsing in identifiers

---
 tests/test_helpers.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 8037090e..8366befd 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -38,13 +38,18 @@ def oai_process_uris(self):

     def test_extract_uris(self):
         identifiers = ['doi:10.whateverwhatever', 'http://alloutofbubblegum.com',
-                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com']
+                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com',
+                       'Vahedifard, F. et al. (2013). Géotechnique 63, No. 6, 451–462 [http://dx.doi.org/10.1680/geot.11.P.130] ',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi']

         uri_dict = helpers.oai_process_uris(identifiers)

         assert uri_dict == {
             'canonicalUri': 'http://alloutofbubblegum.com',
-            'objectUris': ['http://dx.doi.org/10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
+            'objectUris': ['http://dx.doi.org/10.whateverwhatever',
+                           'http://viewcontent.cgi/iamacoolpdf',
+                           'http://dx.doi.org/10.1680/geot.11.P.130',
+                           'http://dx.doi.org/10.10.thisisarealdoi'],
             'providerUris': ['http://alloutofbubblegum.com', 'http://GETTHETABLES.com']
         }

From f099fc93840519535aa85142c9dee8abb3972892 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 16:03:31 -0400
Subject: [PATCH 05/22] Turn off SSL cert verification for calhoun harvester

---
 scrapi/harvesters/calhoun.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapi/harvesters/calhoun.py b/scrapi/harvesters/calhoun.py
index 575ec1b0..2ff1cc25 100644
--- a/scrapi/harvesters/calhoun.py
+++ b/scrapi/harvesters/calhoun.py
@@ -14,6 +14,7 @@ class CalhounHarvester(OAIHarvester):
     short_name = 'calhoun'
     long_name = 'Calhoun: Institutional Archive of the Naval Postgraduate School'
     url = 'http://calhoun.nps.edu'
+    verify = False
     base_url = 'http://calhoun.nps.edu/oai/request'

     property_list = [

From 4f80fbcf08fbf6194caa68d52df2e9428a44a819 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Wed, 16 Sep 2015 16:09:17 -0400
Subject: [PATCH 06/22] Add correct harvester args to inv harvester README

closes [#SHARE-30]

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9b24c47a..aef699b6 100644
--- a/README.md
+++ b/README.md
@@ -202,12 +202,16 @@ For local development, running the ```mit``` harvester is recommended.

 Note: harvester-name is the same as the defined harvester "short name".

-Invoke a harvester a certain number of days back with the ```--days``` argument. For example, to run a harvester 5 days in the past, run:
+Invoke a harvester for a certain start date with the ```--start``` argument. Invole a harvester for a certain end date with the ```--end``` argument.
+
+For example, to run a harvester between the dates of March 14th and March 16th 2015, run:

 ```bash
-$ invoke harvester harvester-name --days=5
+$ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```

+Either --start or --end can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+
 Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.
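As a quick aside (an editor's sketch, not part of the series): PATCH 01 above,
together with its later simplification in PATCH 16 below, makes the invoke task
resolve dates roughly as follows. `DAYS_BACK` stands in for
`settings.DAYS_BACK` and is assumed to be an integer number of days.

```python
from datetime import date, timedelta
from dateutil.parser import parse

DAYS_BACK = 1  # stand-in for settings.DAYS_BACK (assumed int)

def resolve_dates(start=None, end=None):
    # --end defaults to today; --start defaults to DAYS_BACK days before end.
    end_date = parse(end).date() if end else date.today()
    start_date = parse(start).date() if start else end_date - timedelta(DAYS_BACK)
    return start_date, end_date

print(resolve_dates('2015-03-14', '2015-03-16'))
# (datetime.date(2015, 3, 14), datetime.date(2015, 3, 16))
```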
From c2d203cf2d7bef0c372d4f46344cbb7e1f1b19ae Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 16:35:20 -0400
Subject: [PATCH 07/22] Make Elasticsearch retry requests on connection failures

[#SHARE-58]

---
 scrapi/processing/elasticsearch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapi/processing/elasticsearch.py b/scrapi/processing/elasticsearch.py
index 85afe966..ad922464 100644
--- a/scrapi/processing/elasticsearch.py
+++ b/scrapi/processing/elasticsearch.py
@@ -22,7 +22,7 @@

 try:
     # If we cant connect to elastic search dont define this class
-    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT)
+    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT, retry_on_timeout=True)

     # body = {
     #     'mappings': {

From ae28cb9d6c744b6cd483f7cf8a1ffb06d860d040 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 10:38:42 -0400
Subject: [PATCH 08/22] Update order of returned URIs for tests

---
 tests/test_helpers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 8366befd..cfb9c22f 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -47,9 +47,9 @@ def test_extract_uris(self):
         assert uri_dict == {
             'canonicalUri': 'http://alloutofbubblegum.com',
             'objectUris': ['http://dx.doi.org/10.whateverwhatever',
-                           'http://viewcontent.cgi/iamacoolpdf',
                            'http://dx.doi.org/10.1680/geot.11.P.130',
-                           'http://dx.doi.org/10.10.thisisarealdoi'],
+                           'http://dx.doi.org/10.10.thisisarealdoi',
+                           'http://viewcontent.cgi/iamacoolpdf'],
             'providerUris': ['http://alloutofbubblegum.com', 'http://GETTHETABLES.com']
         }

From 4abccb4f0253a2d808d32cb8fb2b4d3d269b468f Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 10:39:03 -0400
Subject: [PATCH 09/22] Break process URIs down into smaller helper functions for better clarity

---
 scrapi/base/helpers.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index bbbcf366..bc7e370c 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -105,7 +105,7 @@ def format_tags(all_tags, sep=','):
     return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))


-def oai_process_uris(*args):
+def gather_identifiers(args):
     identifiers = []
     for arg in args:
         if isinstance(arg, list):
@@ -114,8 +114,11 @@
             elif arg:
                 identifiers.append(arg)

+    return identifiers
+
+
+def gather_object_uris(identifiers):
     object_uris = []
-    provider_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
             try:
@@ -127,7 +130,13 @@
                     object_uris.append('http://dx.doi.org/{}'.format(just_doi.replace('doi:', '').replace('DOI:', '').strip()))
                 except AttributeError:
                     pass
+    return object_uris
+
+
+def seperate_provider_object_uris(identifiers):
+    object_uris = gather_object_uris(identifiers)
+    provider_uris = []
+    for item in identifiers:
         try:
             found_url = URL_REGEX.search(item).group()
         except AttributeError:
             found_url = None
@@ -139,6 +148,14 @@
             if 'dx.doi.org' not in found_url:
                 provider_uris.append(found_url)

+    return provider_uris, object_uris
+
+
+def oai_process_uris(*args):
+    identifiers = gather_identifiers(args)
+
+    provider_uris, object_uris = seperate_provider_object_uris(identifiers)
+
     try:
         canonical_uri = (provider_uris + object_uris)[0]
     except IndexError:
         raise ValueError('No Canonical URI was returned for this record.')

From fd8e4b3069892768e76ae931080c95ee82c92593 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 11:06:03 -0400
Subject: [PATCH 10/22] Update helpers with use_doi kwarg

---
 scrapi/base/helpers.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index bc7e370c..1bed1d67 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -151,13 +151,19 @@ def seperate_provider_object_uris(identifiers):
     return provider_uris, object_uris


-def oai_process_uris(*args):
-    identifiers = gather_identifiers(args)
+def oai_process_uris(*args, **kwargs):
+    use_doi = kwargs.get('use_doi', False)
+    identifiers = gather_identifiers(args)

     provider_uris, object_uris = seperate_provider_object_uris(identifiers)

     try:
-        canonical_uri = (provider_uris + object_uris)[0]
+        if use_doi:
+            for uri in object_uris:
+                if 'dx.doi.org' in uri:
+                    canonical_uri = uri
+        else:
+            canonical_uri = (provider_uris + object_uris)[0]
     except IndexError:
         raise ValueError('No Canonical URI was returned for this record.')

From bb89f5822148f4a51d63c0046818ab3bea0f3364 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 11:06:22 -0400
Subject: [PATCH 11/22] Update dataone to use the new use_doi kwarg

---
 scrapi/harvesters/dataone.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scrapi/harvesters/dataone.py b/scrapi/harvesters/dataone.py
index bb756753..e3cb76c9 100644
--- a/scrapi/harvesters/dataone.py
+++ b/scrapi/harvesters/dataone.py
@@ -11,6 +11,7 @@
 from datetime import timedelta, date

 from lxml import etree
+from functools import partial
 from dateutil.parser import parse
 from xml.etree import ElementTree

@@ -18,6 +19,7 @@

 from scrapi import requests
 from scrapi import settings
+from scrapi.base import helpers
 from scrapi.base import XMLHarvester
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument

@@ -145,10 +147,7 @@ class DataOneHarvester(XMLHarvester):
             'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()", lambda x, y: parse(y[0]).date().isoformat() if x else None)
         },
         'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
-        'uris': {
-            'canonicalUri': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", lambda x, y: y[0] if 'http' in single_result(y) else x[0] if 'http' in single_result(x) else ''),
-            'objectUri': ("arr[@name='resourceMap']/str/node()", compose(lambda x: x.replace('doi:', 'http://dx.doi.org/'), single_result))
-        },
+        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
         'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
         'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(date_formatter, single_result)),
         'title': ("str[@name='title']/node()", single_result),

From e6daf5a5e9127dad247c1bd758dd128f1099e643 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 11:27:24 -0400
Subject: [PATCH 12/22] Add tests for new separated URIs

---
 tests/test_helpers.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index cfb9c22f..49a3e168 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -83,3 +83,46 @@ def test_extract_doi_from_text(self):
         extracted_doi = helpers.extract_doi_from_text(text)
         assert extracted_doi == 'http://dx.doi.org/10.1021/woowoowoo'
+
+    def test_gather_identifiers(self):
+        identifiers = [['doi:10.whateverwhatever',
+                        'http://viewcontent.cgi/iamacoolpdf'],
+                       '451–462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                       ['http://bubbaray.com', 'http://devon.net']]
+
+        gathered = helpers.gather_identifiers(identifiers)
+
+        assert gathered == ['doi:10.whateverwhatever',
+                            'http://viewcontent.cgi/iamacoolpdf',
+                            '451–462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                            'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                            'http://bubbaray.com',
+                            'http://devon.net']
+
+    def test_gather_object_uris(self):
+        identifiers = ['doi:10.whateverwhatever',
+                       'http://viewcontent.cgi/iamacoolpdf',
+                       '451–462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                       'http://bubbaray.com',
+                       'http://devon.net']
+
+        object_uris = helpers.gather_object_uris(identifiers)
+
+        assert object_uris == [
+            'http://dx.doi.org/10.whateverwhatever',
+            'http://dx.doi.org/10.1680/geot.11.P.130',
+            'http://dx.doi.org/10.10.thisisarealdoi'
+        ]
+
+    def test_seperate_provider_object_uris(self):
+        identifiers = [
+            'http://dx.doi.org/10.whateverwhatever',
+            'http://cgi.viewcontent.apdf.pdf',
+            'http://get_the_tables.net'
+        ]
+
+        provider_uris, object_uris = helpers.seperate_provider_object_uris(identifiers)
+
+        assert provider_uris == ['http://get_the_tables.net']
+        assert object_uris == ['http://dx.doi.org/10.whateverwhatever', 'http://cgi.viewcontent.apdf.pdf']

From a8847e44a01b8fd337c1d9840b7fb2571e7e1bf5 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 16:03:31 -0400
Subject: [PATCH 13/22] Turn off SSL cert verification for calhoun harvester

[#SHARE-55]

---
 scrapi/harvesters/calhoun.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapi/harvesters/calhoun.py b/scrapi/harvesters/calhoun.py
index 575ec1b0..2ff1cc25 100644
--- a/scrapi/harvesters/calhoun.py
+++ b/scrapi/harvesters/calhoun.py
@@ -14,6 +14,7 @@ class CalhounHarvester(OAIHarvester):
     short_name = 'calhoun'
     long_name = 'Calhoun: Institutional Archive of the Naval Postgraduate School'
     url = 'http://calhoun.nps.edu'
+    verify = False
     base_url = 'http://calhoun.nps.edu/oai/request'

     property_list = [

From 889ef2d8d04e8172461cc6e6122811a8551f9889 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 12:16:06 -0400
Subject: [PATCH 14/22] Add test for format doi as url

---
 tests/test_helpers.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 49a3e168..eb1fbc9d 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -126,3 +126,10 @@ def test_seperate_provider_object_uris(self):

         assert provider_uris == ['http://get_the_tables.net']
         assert object_uris == ['http://dx.doi.org/10.whateverwhatever', 'http://cgi.viewcontent.apdf.pdf']
+
+    def test_format_doi_as_url(self):
+        doi1 = ' doi:10.dudleyzrule '
+        doi2 = 'DOI:10.getthetables '
+
+        assert helpers.format_doi_as_url(doi1) == 'http://dx.doi.org/10.dudleyzrule'
+        assert helpers.format_doi_as_url(doi2) == 'http://dx.doi.org/10.getthetables'

From aad2433d682c520ba312541d7965769d20e7c272 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 12:21:24 -0400
Subject: [PATCH 15/22] Remove some exception-driven logic, thanks @fabianvf

---
 scrapi/base/helpers.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 1bed1d67..ee788717 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -105,6 +105,11 @@ def format_tags(all_tags, sep=','):
     return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))


+def format_doi_as_url(doi):
+    plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
+    return 'http://dx.doi.org/{}'.format(plain_doi)
+
+
 def gather_identifiers(args):
     identifiers = []
     for arg in args:
@@ -121,15 +126,11 @@ def gather_object_uris(identifiers):
     object_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
-            try:
-                found_url_doi = URL_REGEX.search(item).group()
-                object_uris.append(found_url_doi)
-            except AttributeError:
-                try:
-                    just_doi = DOI_REGEX.search(item).group()
-                    object_uris.append('http://dx.doi.org/{}'.format(just_doi.replace('doi:', '').replace('DOI:', '').strip()))
-                except AttributeError:
-                    pass
+            url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
+            url_doi = url_doi.group() if url_doi else None
+            just_doi = format_doi_as_url(just_doi.group()) if just_doi else None
+            object_uris.append(url_doi or just_doi)
+
     return object_uris


@@ -137,10 +138,10 @@ def seperate_provider_object_uris(identifiers):
     object_uris = gather_object_uris(identifiers)
     provider_uris = []
     for item in identifiers:
-        try:
-            found_url = URL_REGEX.search(item).group()
-        except AttributeError:
-            found_url = None
+
+        found_url = URL_REGEX.search(item)
+        found_url = found_url.group() if found_url else None
+
         if found_url:
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)

From 8fc24fba99f7ea91eb652fd5dd4c1d79df4c4bed Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:25:53 -0400
Subject: [PATCH 16/22] Scale back extent of logic for parsing start times

---
 tasks.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/tasks.py b/tasks.py
index 0c610328..33d87b17 100644
--- a/tasks.py
+++ b/tasks.py
@@ -158,23 +158,8 @@ def harvester(harvester_name, async=False, start=None, end=None):
     if not registry.get(harvester_name):
         raise ValueError('No such harvesters {}'.format(harvester_name))

-    if end:
-        if parse(end).date() > date.today():
-            logger.info('End date is in the future, defaulting to today!')
-            end = date.today()
-        else:
-            end = parse(end).date()
-    else:
-        end = date.today()
-
-    if start:
-        if parse(start).date() > end:
-            logger.info('Start date is after end date, defaulting to standard start')
-            start = end - timedelta(settings.DAYS_BACK)
-        else:
-            start = parse(start).date()
-    else:
-        start = end - timedelta(settings.DAYS_BACK)
+    end = parse(end).date() if end else date.today()
+    start = parse(start).date() if start else end - timedelta(settings.DAYS_BACK)

     run_harvester.delay(harvester_name, start_date=start, end_date=end)

From 80e778eaac95d4cccc998e477afa25e6da4bb06f Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:33:03 -0400
Subject: [PATCH 17/22] Update to specify what will happen with just end

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index aef699b6..b79c2bef 100644
--- a/README.md
+++ b/README.md
@@ -210,8 +210,12 @@ For example, to run a harvester between the dates of March 14th and March 16th 2
 $ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```

-Either --start or --end can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+Either --start (-s) or --end (-e) can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+If --end is given with no --start, start will default to the number of days specified in ```settings.DAYS_BACK``` before the given end date.
+
+
+### Automated OAI-PMH Harvester Creation

 Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.

From af54b885a406fca1dd9a84f3cc322ebe2ecaca27 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:37:23 -0400
Subject: [PATCH 18/22] Invole invoke

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b79c2bef..9122cf1f 100644
--- a/README.md
+++ b/README.md
@@ -202,7 +202,7 @@ For local development, running the ```mit``` harvester is recommended.

 Note: harvester-name is the same as the defined harvester "short name".

-Invoke a harvester for a certain start date with the ```--start``` argument. Invole a harvester for a certain end date with the ```--end``` argument.
+Invoke a harvester for a certain start date with the ```--start``` or ```-s``` argument. Invoke a harvester for a certain end date with the ```--end``` or ```-e``` argument.

 For example, to run a harvester between the dates of March 14th and March 16th 2015, run:

@@ -210,7 +210,7 @@ For example, to run a harvester between the dates of March 14th and March 16th 2
 $ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```

-Either --start (-s) or --end (-e) can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+Either --start or --end can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
 If --end is given with no --start, start will default to the number of days specified in ```settings.DAYS_BACK``` before the given end date.
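As a quick aside (an editor's sketch, not part of the series): the next patch
factors the repeated match-or-None checks into a `maybe_group` helper. Roughly,
it behaves like this, reusing the URL_REGEX from PATCH 03:

```python
import re

URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')

def maybe_group(match):
    # Return the matched text, or None when the regex found nothing.
    return match.group() if match else None

print(maybe_group(URL_REGEX.search('see [http://dx.doi.org/10.1680/geot.11.P.130]')))
# http://dx.doi.org/10.1680/geot.11.P.130
print(maybe_group(URL_REGEX.search('no url here')))
# None
```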
From a51b57394e65a890aa3f9c604b4b0e3f88788cac Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:46:03 -0400
Subject: [PATCH 19/22] Make match group or none a separate function

---
 scrapi/base/helpers.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index ee788717..60b2e6e6 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -122,14 +122,21 @@ def gather_identifiers(args):
     return identifiers


+def maybe_group(match):
+    '''
+    Evaluates a regular expression match object; returns the group or None
+    '''
+    return match.group() if match else None
+
+
 def gather_object_uris(identifiers):
     object_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
             url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
-            url_doi = url_doi.group() if url_doi else None
-            just_doi = format_doi_as_url(just_doi.group()) if just_doi else None
-            object_uris.append(url_doi or just_doi)
+            url_doi = maybe_group(url_doi)
+            just_doi = maybe_group(just_doi)
+            object_uris.append(url_doi or format_doi_as_url(just_doi))

     return object_uris

@@ -139,8 +146,7 @@ def seperate_provider_object_uris(identifiers):
     provider_uris = []
     for item in identifiers:

-        found_url = URL_REGEX.search(item)
-        found_url = found_url.group() if found_url else None
+        found_url = maybe_group(URL_REGEX.search(item))

         if found_url:
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)

From 30e4b6007b4ce6ae486b66b99e7c751e11685cf1 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 14:53:41 -0400
Subject: [PATCH 20/22] Add sublime files to gitignore

now also closes [#SHARE-53]

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index e4109c0a..a92a9650 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,5 @@ archive/*

 # Emacs edited files:
 *.*~
+
+*.sublime*

From 7d47e9c556c73ddfebf0ca8c48102750a7484520 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 15:35:35 -0400
Subject: [PATCH 21/22] Add check for doi or none

---
 scrapi/base/helpers.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 60b2e6e6..cf663c24 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -106,8 +106,9 @@ def format_tags(all_tags, sep=','):


 def format_doi_as_url(doi):
-    plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
-    return 'http://dx.doi.org/{}'.format(plain_doi)
+    if doi:
+        plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
+        return 'http://dx.doi.org/{}'.format(plain_doi)

From 75725c92b9a353be9a3aabb8b21adb027f5ce267 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Thu, 17 Sep 2015 15:56:48 -0400
Subject: [PATCH 22/22] update CHANGELOG

---
 CHANGELOG | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index a7a90a7b..ad02c471 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,17 @@
 ChangeLog
 *********

+0.9.6 (2015-09-17)
+==================
+- Elasticsearch now retries requests after connection errors
+- Calhoun harvester now ignores that the SSL cert is invalid
+- OAI url parser now terminates the regex capture after finding an invalid DOI
+character
+- harvester invoke task now puts the default start date as settings.DAYS_BACK
+days before the end date
+- scrapi.requests now exposes the requests.exceptions module
+- Update README.md to document the new date arguments
+
 0.9.5 (2015-09-14)
 ==================
 - Clinical Trials harvester now dumps lxml elements to dictionaries in