Commit

Merge branch 'hotfix/0.9.6' into develop
fabianvf committed Sep 17, 2015
2 parents b02c3a0 + 75725c9, commit ad3ee08
Showing 10 changed files with 137 additions and 24 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -65,3 +65,5 @@ archive/*

# Emacs edited files:
*.*~

*.sublime*
11 changes: 11 additions & 0 deletions CHANGELOG
@@ -2,6 +2,17 @@
ChangeLog
*********

0.9.6 (2015-09-17)
==================
- Elasticsearch now retries requests after connection errors
- Calhoun harvester now ignores the provider's invalid SSL certificate
- OAI URL parser now terminates the regex capture after finding an invalid DOI
  character
- harvester invoke task now sets the default start date to settings.DAYS_BACK
  days before the end date
- scrapi.requests now exposes the requests.exceptions module
- Updated README.md with the new date argument information

0.9.5 (2015-09-14)
==================
- Clinical Trials harvester now dumps lxml elements to dictionaries in
12 changes: 10 additions & 2 deletions README.md
@@ -202,12 +202,20 @@ For local development, running the ```mit``` harvester is recommended.

Note: harvester-name is the same as the defined harvester "short name".

Invoke a harvester a certain number of days back with the ```--days``` argument. For example, to run a harvester 5 days in the past, run:
Invoke a harvester for a certain start date with the ```--start``` or ```-s``` argument. Invoke a harvester for a certain end date with the ```--end``` or ```-e``` argument.

For example, to run a harvester between the dates of March 14th and March 16th 2015, run:

```bash
$ invoke harvester harvester-name --days=5
$ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
```

Either --start or --end can also be used on its own. If neither argument is supplied, the harvest will start ```settings.DAYS_BACK``` days before the current date and end on the current date.

If --end is given with no --start, start will default to ```settings.DAYS_BACK``` days before the given end date.


### Automated OAI PMH Harvester Creation
Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.


64 changes: 50 additions & 14 deletions scrapi/base/helpers.py
@@ -15,7 +15,7 @@
from scrapi import requests


URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
DOI_REGEX = re.compile(r'(doi:10\.\S*)')

''' Takes a value, returns a function that always returns that value
@@ -105,7 +105,13 @@ def format_tags(all_tags, sep=','):
    return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))


def oai_process_uris(*args):
def format_doi_as_url(doi):
    if doi:
        plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
        return 'http://dx.doi.org/{}'.format(plain_doi)


def gather_identifiers(args):
    identifiers = []
    for arg in args:
        if isinstance(arg, list):
@@ -114,28 +120,58 @@ def oai_process_uris(*args):
        elif arg:
            identifiers.append(arg)

    return identifiers


def maybe_group(match):
    '''
    Evaluates a regular expression match object; returns the matched group or None.
    '''
    return match.group() if match else None


def gather_object_uris(identifiers):
    object_uris = []
    provider_uris = []
    for item in identifiers:
        if 'doi' in item.lower():
            doi = item.replace('doi:', '').replace('DOI:', '').strip()
            if 'http://dx.doi.org/' in doi:
                object_uris.append(doi)
            else:
                object_uris.append('http://dx.doi.org/{}'.format(doi))
            url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
            url_doi = maybe_group(url_doi)
            just_doi = maybe_group(just_doi)
            object_uris.append(url_doi or format_doi_as_url(just_doi))

    return object_uris


def seperate_provider_object_uris(identifiers):
    object_uris = gather_object_uris(identifiers)
    provider_uris = []
    for item in identifiers:

        found_url = maybe_group(URL_REGEX.search(item))

        try:
            found_url = URL_REGEX.search(item).group()
        except AttributeError:
            found_url = None
        if found_url:
            if 'viewcontent' in found_url:
                object_uris.append(found_url)
            else:
                provider_uris.append(found_url)
                if 'dx.doi.org' not in found_url:
                    provider_uris.append(found_url)

    return provider_uris, object_uris


def oai_process_uris(*args, **kwargs):
    use_doi = kwargs.get('use_doi', False)

    identifiers = gather_identifiers(args)
    provider_uris, object_uris = seperate_provider_object_uris(identifiers)

    try:
        canonical_uri = (provider_uris + object_uris)[0]
        if use_doi:
            for uri in object_uris:
                if 'dx.doi.org' in uri:
                    canonical_uri = uri
        else:
            canonical_uri = (provider_uris + object_uris)[0]
    except IndexError:
        raise ValueError('No Canonical URI was returned for this record.')

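Taken together, these helpers flatten a record's raw identifiers, pull out DOI-style object links, and split what remains into provider and object URIs. A minimal usage sketch, with made-up identifier strings modeled on the tests below (it assumes the helpers are importable from `scrapi.base.helpers`):

```python
from scrapi.base import helpers

# Raw identifiers as an OAI record might supply them (hypothetical values).
identifiers = [
    ['doi:10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
    'Vahedifard, F. et al. (2013) [http://dx.doi.org/10.1680/geot.11.P.130]',
    'http://GETTHETABLES.com',
]

# Flatten nested lists of identifiers into one list of strings.
flat = helpers.gather_identifiers(identifiers)

# Pull DOI-bearing entries out as normalized http://dx.doi.org/... links.
# The tightened URL_REGEX stops capturing at characters such as ']', so the
# bracketed DOI above comes out without the trailing bracket.
object_uris = helpers.gather_object_uris(flat)

# Split plain URLs into provider URIs vs. object URIs.
provider_uris, object_uris = helpers.seperate_provider_object_uris(flat)

# Or do everything in one call; use_doi=True prefers a dx.doi.org link
# as the canonical URI when one is present.
uri_dict = helpers.oai_process_uris(*identifiers, use_doi=True)
```
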
1 change: 1 addition & 0 deletions scrapi/harvesters/calhoun.py
@@ -14,6 +14,7 @@ class CalhounHarvester(OAIHarvester):
    short_name = 'calhoun'
    long_name = 'Calhoun: Institutional Archive of the Naval Postgraduate School'
    url = 'http://calhoun.nps.edu'
    verify = False

    base_url = 'http://calhoun.nps.edu/oai/request'
    property_list = [
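The new `verify` flag lets a single harvester opt out of SSL certificate verification (Calhoun's certificate is currently invalid). Below is a hypothetical sketch of the pattern, not the actual OAIHarvester internals, just to illustrate how a class-level flag like this is typically threaded through to the HTTP layer:

```python
import requests


class OAIHarvesterSketch(object):
    # Hypothetical stand-in for scrapi's OAIHarvester: verify defaults to True
    # so certificates are checked unless a harvester opts out.
    verify = True

    def fetch(self, url, **params):
        # The per-class flag is forwarded to requests, so only harvesters
        # that set verify = False skip certificate validation.
        return requests.get(url, params=params, verify=self.verify)


class CalhounSketch(OAIHarvesterSketch):
    verify = False  # Calhoun's SSL cert is invalid, so skip verification here
```
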
7 changes: 3 additions & 4 deletions scrapi/harvesters/dataone.py
@@ -11,13 +11,15 @@
from datetime import timedelta, date

from lxml import etree
from functools import partial
from dateutil.parser import parse
from xml.etree import ElementTree

from nameparser import HumanName

from scrapi import requests
from scrapi import settings
from scrapi.base import helpers
from scrapi.base import XMLHarvester
from scrapi.util import copy_to_unicode
from scrapi.linter.document import RawDocument
@@ -145,10 +147,7 @@ class DataOneHarvester(XMLHarvester):
            'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()", lambda x, y: parse(y[0]).date().isoformat() if x else None)
        },
        'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
        'uris': {
            'canonicalUri': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", lambda x, y: y[0] if 'http' in single_result(y) else x[0] if 'http' in single_result(x) else ''),
            'objectUri': ("arr[@name='resourceMap']/str/node()", compose(lambda x: x.replace('doi:', 'http://dx.doi.org/'), single_result))
        },
        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(date_formatter, single_result)),
        'title': ("str[@name='title']/node()", single_result),
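The replacement 'uris' entry leans on `functools.partial` to pre-bind `use_doi=True`, so the schema transformer can keep calling the function with only the XPath results as positional arguments. A small sketch of that pattern (the stand-in function and sample values are hypothetical):

```python
from functools import partial


def process_uris(*args, **kwargs):
    # Stand-in for helpers.oai_process_uris: one positional argument per
    # XPath expression, plus whatever keyword arguments were pre-bound.
    return {'use_doi': kwargs.get('use_doi', False), 'args': args}


# partial() fixes use_doi=True now; the schema machinery later supplies the
# XPath results without needing to know about the extra keyword.
transform = partial(process_uris, use_doi=True)

xpath_results = (['some-id'], ['http://example.com/data'], ['doi:10.1234/abcd'])  # hypothetical
print(transform(*xpath_results))
# {'use_doi': True, 'args': (['some-id'], ['http://example.com/data'], ['doi:10.1234/abcd'])}
```
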
2 changes: 1 addition & 1 deletion scrapi/processing/elasticsearch.py
@@ -22,7 +22,7 @@

try:
    # If we can't connect to Elasticsearch, don't define this class
    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT)
    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT, retry_on_timeout=True)

    # body = {
    #     'mappings': {
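`retry_on_timeout=True` tells the elasticsearch-py client to retry a request whose connection timed out instead of failing immediately. A minimal sketch, with placeholder values standing in for `settings.ELASTIC_URI` and `settings.ELASTIC_TIMEOUT`:

```python
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConnectionError

es = Elasticsearch(
    'http://localhost:9200',   # placeholder for settings.ELASTIC_URI
    request_timeout=30,        # placeholder for settings.ELASTIC_TIMEOUT
    retry_on_timeout=True,     # retry the request when a connection times out
)

try:
    es.info()
except ConnectionError:
    # scrapi skips defining the Elasticsearch processor when the cluster
    # is unreachable; a caller could fall back in a similar way here.
    pass
```
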
1 change: 1 addition & 0 deletions scrapi/requests.py
@@ -13,6 +13,7 @@
import six
import furl
import requests
from requests import exceptions # noqa
from cassandra.cqlengine import columns, models
from requests.structures import CaseInsensitiveDict

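Re-exporting `requests.exceptions` means callers that already use scrapi's request wrapper can catch HTTP errors without also importing the third-party `requests` package. A hedged sketch of the intended usage (the URL is a placeholder, and it assumes `scrapi.requests` exposes a requests-style `get()`):

```python
from scrapi import requests  # scrapi's wrapper module around requests

try:
    response = requests.get('http://example.com/oai/request')  # placeholder URL
except requests.exceptions.ConnectionError:
    # requests.exceptions is re-exported (see the import above), so no extra
    # `import requests` is needed just to name the exception classes.
    response = None
```
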
2 changes: 1 addition & 1 deletion tasks.py
@@ -158,8 +158,8 @@ def harvester(harvester_name, async=False, start=None, end=None):
    if not registry.get(harvester_name):
        raise ValueError('No such harvesters {}'.format(harvester_name))

    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
    end = parse(end).date() if end else date.today()
    start = parse(start).date() if start else end - timedelta(settings.DAYS_BACK)

    run_harvester.delay(harvester_name, start_date=start, end_date=end)

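With the reordered defaults, the end date is resolved first and the default start date now hangs off the chosen end date rather than off today. A small sketch of the resulting behaviour (the DAYS_BACK value here is illustrative; the real one comes from scrapi's settings):

```python
from datetime import date, timedelta
from dateutil.parser import parse

DAYS_BACK = 1  # illustrative; scrapi reads this from settings.DAYS_BACK


def resolve_dates(start=None, end=None):
    # End defaults to today; start defaults to DAYS_BACK days before *end*,
    # so passing only --end harvests the window leading up to that date.
    end = parse(end).date() if end else date.today()
    start = parse(start).date() if start else end - timedelta(DAYS_BACK)
    return start, end


print(resolve_dates(end='2015-03-16'))  # (datetime.date(2015, 3, 15), datetime.date(2015, 3, 16))
print(resolve_dates())                  # (today - DAYS_BACK days, today)
```
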
59 changes: 57 additions & 2 deletions tests/test_helpers.py
@@ -38,13 +38,18 @@ def oai_process_uris(self):

    def test_extract_uris(self):
        identifiers = ['doi:10.whateverwhatever', 'http://alloutofbubblegum.com',
                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com']
                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com',
                       'Vahedifard, F. et al. (2013). G??otechnique 63, No. 6, 451???462 [http://dx.doi.org/10.1680/geot.11.P.130] ',
                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi']

        uri_dict = helpers.oai_process_uris(identifiers)

        assert uri_dict == {
            'canonicalUri': 'http://alloutofbubblegum.com',
            'objectUris': ['http://dx.doi.org/10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
            'objectUris': ['http://dx.doi.org/10.whateverwhatever',
                           'http://dx.doi.org/10.1680/geot.11.P.130',
                           'http://dx.doi.org/10.10.thisisarealdoi',
                           'http://viewcontent.cgi/iamacoolpdf'],
            'providerUris': ['http://alloutofbubblegum.com', 'http://GETTHETABLES.com']
        }

@@ -78,3 +83,53 @@ def test_extract_doi_from_text(self):
        extracted_doi = helpers.extract_doi_from_text(text)

        assert extracted_doi == 'http://dx.doi.org/10.1021/woowoowoo'

    def test_gather_identifiers(self):
        identifiers = [['doi:10.whateverwhatever',
                        'http://viewcontent.cgi/iamacoolpdf'],
                       '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
                       ['http://bubbaray.com', 'http://devon.net']]

        gathered = helpers.gather_identifiers(identifiers)

        assert gathered == ['doi:10.whateverwhatever',
                            'http://viewcontent.cgi/iamacoolpdf',
                            '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
                            'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
                            'http://bubbaray.com',
                            'http://devon.net']

    def test_gather_object_uris(self):
        identifiers = ['doi:10.whateverwhatever',
                       'http://viewcontent.cgi/iamacoolpdf',
                       '451???462 [http://dx.doi.org/10.1680/geot.11.P.130]',
                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
                       'http://bubbaray.com',
                       'http://devon.net']
        object_uris = helpers.gather_object_uris(identifiers)

        assert object_uris == [
            'http://dx.doi.org/10.whateverwhatever',
            'http://dx.doi.org/10.1680/geot.11.P.130',
            'http://dx.doi.org/10.10.thisisarealdoi'
        ]

    def test_seperate_provider_object_uris(self):
        identifiers = [
            'http://dx.doi.org/10.whateverwhatever',
            'http://cgi.viewcontent.apdf.pdf',
            'http://get_the_tables.net'
        ]

        provider_uris, object_uris = helpers.seperate_provider_object_uris(identifiers)

        assert provider_uris == ['http://get_the_tables.net']
        assert object_uris == ['http://dx.doi.org/10.whateverwhatever', 'http://cgi.viewcontent.apdf.pdf']

    def test_format_doi_as_url(self):
        doi1 = ' doi:10.dudleyzrule '
        doi2 = 'DOI:10.getthetables '

        assert helpers.format_doi_as_url(doi1) == 'http://dx.doi.org/10.dudleyzrule'
        assert helpers.format_doi_as_url(doi2) == 'http://dx.doi.org/10.getthetables'
