From 660078d2c8873288e99720d179e4d43b167fa713 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Fri, 11 Sep 2015 12:00:24 -0400
Subject: [PATCH 01/22] Make start and end date parsing more sane

closes [#SHARE-31]

---
 tasks.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tasks.py b/tasks.py
index e4adbc1a..0c610328 100644
--- a/tasks.py
+++ b/tasks.py
@@ -158,8 +158,23 @@ def harvester(harvester_name, async=False, start=None, end=None):
     if not registry.get(harvester_name):
         raise ValueError('No such harvesters {}'.format(harvester_name))

-    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
-    end = parse(end).date() if end else date.today()
+    if end:
+        if parse(end).date() > date.today():
+            logger.info('End date is in the future, defaulting to today!')
+            end = date.today()
+        else:
+            end = parse(end).date()
+    else:
+        end = date.today()
+
+    if start:
+        if parse(start).date() > end:
+            logger.info('Start date is after end date, defaulting to standard start')
+            start = end - timedelta(settings.DAYS_BACK)
+        else:
+            start = parse(start).date()
+    else:
+        start = end - timedelta(settings.DAYS_BACK)

     run_harvester.delay(harvester_name, start_date=start, end_date=end)

From f0bfcde0ce2e98320e59a9db6193b912b6626cdc Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 15:29:56 -0400
Subject: [PATCH 02/22] Make requests.exceptions. work as expected for scrapi.requests

[#SHARE-32]

---
 scrapi/requests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapi/requests.py b/scrapi/requests.py
index 6797c954..930897e6 100644
--- a/scrapi/requests.py
+++ b/scrapi/requests.py
@@ -13,6 +13,7 @@
 import six
 import furl
 import requests
+from requests import exceptions  # noqa
 from cassandra.cqlengine import columns, models
 from requests.structures import CaseInsensitiveDict

From 61f6f7dc9a35fa050631c6945b94f9c2a048fdfd Mon Sep 17 00:00:00 2001
From: erinspace
Date: Wed, 16 Sep 2015 15:47:50 -0400
Subject: [PATCH 03/22] Fix URL parsing when given a paragraph with a url

closes [#SHARE-54]

Before, a paragraph of text with a valid URL inside of it would return
with the last square bracket included, which was throwing errors in some
harvesters.
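As a quick illustration (an editor's sketch, not part of the patch: the two
patterns are copied from the diff below, and the sample string is invented):

```python
import re

# Pattern before this patch:
OLD_URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
# Pattern after this patch; closing brackets and similar characters
# are excluded from the tail of the match:
NEW_URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')

text = 'See [http://dx.doi.org/10.1680/geot.11.P.130] for details'

print(OLD_URL_REGEX.search(text).group())
# http://dx.doi.org/10.1680/geot.11.P.130]  <- stray trailing bracket
print(NEW_URL_REGEX.search(text).group())
# http://dx.doi.org/10.1680/geot.11.P.130
```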
This improves the URL regex and captures more DOIs

---
 scrapi/base/helpers.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 1aa18feb..bbbcf366 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -15,7 +15,7 @@
 from scrapi import requests

-URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
+URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')

 ''' Takes a value, returns a function that always returns that value
@@ -118,11 +118,15 @@ def oai_process_uris(*args):
     provider_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
-            doi = item.replace('doi:', '').replace('DOI:', '').strip()
-            if 'http://dx.doi.org/' in doi:
-                object_uris.append(doi)
-            else:
-                object_uris.append('http://dx.doi.org/{}'.format(doi))
+            try:
+                found_url_doi = URL_REGEX.search(item).group()
+                object_uris.append(found_url_doi)
+            except AttributeError:
+                try:
+                    just_doi = DOI_REGEX.search(item).group()
+                    object_uris.append('http://dx.doi.org/{}'.format(just_doi.replace('doi:', '').replace('DOI:', '').strip()))
+                except AttributeError:
+                    pass

         try:
             found_url = URL_REGEX.search(item).group()
@@ -132,7 +136,8 @@ def oai_process_uris(*args):
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)
             else:
-                provider_uris.append(found_url)
+                if 'dx.doi.org' not in found_url:
+                    provider_uris.append(found_url)

     try:
         canonical_uri = (provider_uris + object_uris)[0]

From da353711709ed0ca34ba4a9cc4dc82b7ff60ca2e Mon Sep 17 00:00:00 2001
From: erinspace
Date: Wed, 16 Sep 2015 15:50:25 -0400
Subject: [PATCH 04/22] Fix tests to reflect more accurate URL and DOI parsing in identifiers

---
 tests/test_helpers.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 8037090e..8366befd 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -38,13 +38,18 @@ def oai_process_uris(self):

     def test_extract_uris(self):
         identifiers = ['doi:10.whateverwhatever', 'http://alloutofbubblegum.com',
-                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com']
+                       'http://viewcontent.cgi/iamacoolpdf', 'http://GETTHETABLES.com',
+                       'Vahedifard, F. et al. (2013). Géotechnique 63, No. 6, 451–462 [http://dx.doi.org/10.1680/geot.11.P.130] ',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi']

         uri_dict = helpers.oai_process_uris(identifiers)

         assert uri_dict == {
             'canonicalUri': 'http://alloutofbubblegum.com',
-            'objectUris': ['http://dx.doi.org/10.whateverwhatever', 'http://viewcontent.cgi/iamacoolpdf'],
+            'objectUris': ['http://dx.doi.org/10.whateverwhatever',
+                           'http://viewcontent.cgi/iamacoolpdf',
+                           'http://dx.doi.org/10.1680/geot.11.P.130',
+                           'http://dx.doi.org/10.10.thisisarealdoi'],
             'providerUris': ['http://alloutofbubblegum.com', 'http://GETTHETABLES.com']
         }

From f099fc93840519535aa85142c9dee8abb3972892 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 16:03:31 -0400
Subject: [PATCH 05/22] Turn off SSL cert verification for calhoun harvester

---
 scrapi/harvesters/calhoun.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapi/harvesters/calhoun.py b/scrapi/harvesters/calhoun.py
index 575ec1b0..2ff1cc25 100644
--- a/scrapi/harvesters/calhoun.py
+++ b/scrapi/harvesters/calhoun.py
@@ -14,6 +14,7 @@ class CalhounHarvester(OAIHarvester):
     short_name = 'calhoun'
     long_name = 'Calhoun: Institutional Archive of the Naval Postgraduate School'
     url = 'http://calhoun.nps.edu'
+    verify = False
     base_url = 'http://calhoun.nps.edu/oai/request'

     property_list = [

From 4f80fbcf08fbf6194caa68d52df2e9428a44a819 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Wed, 16 Sep 2015 16:09:17 -0400
Subject: [PATCH 06/22] Add correct harvester args to inv harvester README

closes [#SHARE-30]

---
 README.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9b24c47a..aef699b6 100644
--- a/README.md
+++ b/README.md
@@ -202,12 +202,16 @@ For local development, running the ```mit``` harvester is recommended.

 Note: harvester-name is the same as the defined harvester "short name".

-Invoke a harvester a certain number of days back with the ```--days``` argument. For example, to run a harvester 5 days in the past, run:
+Invoke a harvester for a certain start date with the ```--start``` argument. Invole a harvester for a certain end date with the ```--end``` argument.
+
+For example, to run a harvester between the dates of March 14th and March 16th 2015, run:

 ```bash
-$ invoke harvester harvester-name --days=5
+$ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```

+Either --start or --end can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+
 Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.
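As a quick aside (an editor's sketch, not part of the series): PATCH 01 above,
together with its later simplification in PATCH 16 below, makes the invoke task
resolve dates roughly as follows. `DAYS_BACK` stands in for
`settings.DAYS_BACK` and is assumed to be an integer number of days.

```python
from datetime import date, timedelta
from dateutil.parser import parse

DAYS_BACK = 1  # stand-in for settings.DAYS_BACK (assumed int)

def resolve_dates(start=None, end=None):
    # --end defaults to today; --start defaults to DAYS_BACK days before end.
    end_date = parse(end).date() if end else date.today()
    start_date = parse(start).date() if start else end_date - timedelta(DAYS_BACK)
    return start_date, end_date

print(resolve_dates('2015-03-14', '2015-03-16'))
# (datetime.date(2015, 3, 14), datetime.date(2015, 3, 16))
```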
From c2d203cf2d7bef0c372d4f46344cbb7e1f1b19ae Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 16:35:20 -0400
Subject: [PATCH 07/22] Make Elasticsearch retry requests on connection failures

[#SHARE-58]

---
 scrapi/processing/elasticsearch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapi/processing/elasticsearch.py b/scrapi/processing/elasticsearch.py
index 85afe966..ad922464 100644
--- a/scrapi/processing/elasticsearch.py
+++ b/scrapi/processing/elasticsearch.py
@@ -22,7 +22,7 @@

 try:
     # If we cant connect to elastic search dont define this class
-    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT)
+    es = Elasticsearch(settings.ELASTIC_URI, request_timeout=settings.ELASTIC_TIMEOUT, retry_on_timeout=True)

     # body = {
     #     'mappings': {

From ae28cb9d6c744b6cd483f7cf8a1ffb06d860d040 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 10:38:42 -0400
Subject: [PATCH 08/22] Update order of returned URIs for tests

---
 tests/test_helpers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 8366befd..cfb9c22f 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -47,9 +47,9 @@ def test_extract_uris(self):
         assert uri_dict == {
             'canonicalUri': 'http://alloutofbubblegum.com',
             'objectUris': ['http://dx.doi.org/10.whateverwhatever',
-                           'http://viewcontent.cgi/iamacoolpdf',
                            'http://dx.doi.org/10.1680/geot.11.P.130',
-                           'http://dx.doi.org/10.10.thisisarealdoi'],
+                           'http://dx.doi.org/10.10.thisisarealdoi',
+                           'http://viewcontent.cgi/iamacoolpdf'],
             'providerUris': ['http://alloutofbubblegum.com', 'http://GETTHETABLES.com']
         }

From 4abccb4f0253a2d808d32cb8fb2b4d3d269b468f Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 10:39:03 -0400
Subject: [PATCH 09/22] Break process URIs down into smaller helper functions for better clarity

---
 scrapi/base/helpers.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index bbbcf366..bc7e370c 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -105,7 +105,7 @@ def format_tags(all_tags, sep=','):
     return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))


-def oai_process_uris(*args):
+def gather_identifiers(args):
     identifiers = []
     for arg in args:
         if isinstance(arg, list):
@@ -114,8 +114,11 @@
             elif arg:
                 identifiers.append(arg)

+    return identifiers
+
+
+def gather_object_uris(identifiers):
     object_uris = []
-    provider_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
             try:
@@ -127,7 +130,13 @@
                     object_uris.append('http://dx.doi.org/{}'.format(just_doi.replace('doi:', '').replace('DOI:', '').strip()))
                 except AttributeError:
                     pass
+    return object_uris
+
+
+def seperate_provider_object_uris(identifiers):
+    object_uris = gather_object_uris(identifiers)
+    provider_uris = []
+    for item in identifiers:
         try:
             found_url = URL_REGEX.search(item).group()
         except AttributeError:
             found_url = None
@@ -139,6 +148,14 @@
             if 'dx.doi.org' not in found_url:
                 provider_uris.append(found_url)

+    return provider_uris, object_uris
+
+
+def oai_process_uris(*args):
+    identifiers = gather_identifiers(args)
+
+    provider_uris, object_uris = seperate_provider_object_uris(identifiers)
+
     try:
         canonical_uri = (provider_uris + object_uris)[0]
     except IndexError:
         raise ValueError('No Canonical URI was returned for this record.')

From fd8e4b3069892768e76ae931080c95ee82c92593 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 11:06:03 -0400
Subject: [PATCH 10/22] Update helpers with use_doi kwarg

---
 scrapi/base/helpers.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index bc7e370c..1bed1d67 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -151,13 +151,19 @@ def seperate_provider_object_uris(identifiers):
     return provider_uris, object_uris


-def oai_process_uris(*args):
-    identifiers = gather_identifiers(args)
+def oai_process_uris(*args, **kwargs):
+    use_doi = kwargs.get('use_doi', False)
+    identifiers = gather_identifiers(args)

     provider_uris, object_uris = seperate_provider_object_uris(identifiers)

     try:
-        canonical_uri = (provider_uris + object_uris)[0]
+        if use_doi:
+            for uri in object_uris:
+                if 'dx.doi.org' in uri:
+                    canonical_uri = uri
+        else:
+            canonical_uri = (provider_uris + object_uris)[0]
     except IndexError:
         raise ValueError('No Canonical URI was returned for this record.')

From bb89f5822148f4a51d63c0046818ab3bea0f3364 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 11:06:22 -0400
Subject: [PATCH 11/22] Update dataone to use the new use_doi kwarg

---
 scrapi/harvesters/dataone.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/scrapi/harvesters/dataone.py b/scrapi/harvesters/dataone.py
index bb756753..e3cb76c9 100644
--- a/scrapi/harvesters/dataone.py
+++ b/scrapi/harvesters/dataone.py
@@ -11,6 +11,7 @@
 from datetime import timedelta, date

 from lxml import etree
+from functools import partial
 from dateutil.parser import parse
 from xml.etree import ElementTree

@@ -18,6 +19,7 @@

 from scrapi import requests
 from scrapi import settings
+from scrapi.base import helpers
 from scrapi.base import XMLHarvester
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument

@@ -145,10 +147,7 @@ class DataOneHarvester(XMLHarvester):
             'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()", lambda x, y: parse(y[0]).date().isoformat() if x else None)
         },
         'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
-        'uris': {
-            'canonicalUri': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", lambda x, y: y[0] if 'http' in single_result(y) else x[0] if 'http' in single_result(x) else ''),
-            'objectUri': ("arr[@name='resourceMap']/str/node()", compose(lambda x: x.replace('doi:', 'http://dx.doi.org/'), single_result))
-        },
+        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
         'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
         'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(date_formatter, single_result)),
         'title': ("str[@name='title']/node()", single_result),

From e6daf5a5e9127dad247c1bd758dd128f1099e643 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 11:27:24 -0400
Subject: [PATCH 12/22] Add tests for new separated URIs

---
 tests/test_helpers.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index cfb9c22f..49a3e168 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -83,3 +83,46 @@ def test_extract_doi_from_text(self):
         extracted_doi = helpers.extract_doi_from_text(text)
         assert extracted_doi == 'http://dx.doi.org/10.1021/woowoowoo'
+
+    def test_gather_identifiers(self):
+        identifiers = [['doi:10.whateverwhatever',
+                        'http://viewcontent.cgi/iamacoolpdf'],
+                       '451–462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                       ['http://bubbaray.com', 'http://devon.net']]
+
+        gathered = helpers.gather_identifiers(identifiers)
+
+        assert gathered == ['doi:10.whateverwhatever',
+                            'http://viewcontent.cgi/iamacoolpdf',
+                            '451–462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                            'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                            'http://bubbaray.com',
+                            'http://devon.net']
+
+    def test_gather_object_uris(self):
+        identifiers = ['doi:10.whateverwhatever',
+                       'http://viewcontent.cgi/iamacoolpdf',
+                       '451–462 [http://dx.doi.org/10.1680/geot.11.P.130]',
+                       'I am a bunch of text but I also have a doi:10.10.thisisarealdoi',
+                       'http://bubbaray.com',
+                       'http://devon.net']
+
+        object_uris = helpers.gather_object_uris(identifiers)
+
+        assert object_uris == [
+            'http://dx.doi.org/10.whateverwhatever',
+            'http://dx.doi.org/10.1680/geot.11.P.130',
+            'http://dx.doi.org/10.10.thisisarealdoi'
+        ]
+
+    def test_seperate_provider_object_uris(self):
+        identifiers = [
+            'http://dx.doi.org/10.whateverwhatever',
+            'http://cgi.viewcontent.apdf.pdf',
+            'http://get_the_tables.net'
+        ]
+
+        provider_uris, object_uris = helpers.seperate_provider_object_uris(identifiers)
+
+        assert provider_uris == ['http://get_the_tables.net']
+        assert object_uris == ['http://dx.doi.org/10.whateverwhatever', 'http://cgi.viewcontent.apdf.pdf']

From a8847e44a01b8fd337c1d9840b7fb2571e7e1bf5 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Wed, 16 Sep 2015 16:03:31 -0400
Subject: [PATCH 13/22] Turn off SSL cert verification for calhoun harvester

[#SHARE-55]

---
 scrapi/harvesters/calhoun.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapi/harvesters/calhoun.py b/scrapi/harvesters/calhoun.py
index 575ec1b0..2ff1cc25 100644
--- a/scrapi/harvesters/calhoun.py
+++ b/scrapi/harvesters/calhoun.py
@@ -14,6 +14,7 @@ class CalhounHarvester(OAIHarvester):
     short_name = 'calhoun'
     long_name = 'Calhoun: Institutional Archive of the Naval Postgraduate School'
     url = 'http://calhoun.nps.edu'
+    verify = False
     base_url = 'http://calhoun.nps.edu/oai/request'

     property_list = [

From 889ef2d8d04e8172461cc6e6122811a8551f9889 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 12:16:06 -0400
Subject: [PATCH 14/22] Add test for format doi as url

---
 tests/test_helpers.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index 49a3e168..eb1fbc9d 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -126,3 +126,10 @@ def test_seperate_provider_object_uris(self):

         assert provider_uris == ['http://get_the_tables.net']
         assert object_uris == ['http://dx.doi.org/10.whateverwhatever', 'http://cgi.viewcontent.apdf.pdf']
+
+    def test_format_doi_as_url(self):
+        doi1 = ' doi:10.dudleyzrule '
+        doi2 = 'DOI:10.getthetables '
+
+        assert helpers.format_doi_as_url(doi1) == 'http://dx.doi.org/10.dudleyzrule'
+        assert helpers.format_doi_as_url(doi2) == 'http://dx.doi.org/10.getthetables'

From aad2433d682c520ba312541d7965769d20e7c272 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 12:21:24 -0400
Subject: [PATCH 15/22] Remove some exception-driven logic, thanks @fabianvf

---
 scrapi/base/helpers.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 1bed1d67..ee788717 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -105,6 +105,11 @@ def format_tags(all_tags, sep=','):
     return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))


+def format_doi_as_url(doi):
+    plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
+    return 'http://dx.doi.org/{}'.format(plain_doi)
+
+
 def gather_identifiers(args):
     identifiers = []
     for arg in args:
@@ -121,15 +126,11 @@ def gather_object_uris(identifiers):
     object_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
-            try:
-                found_url_doi = URL_REGEX.search(item).group()
-                object_uris.append(found_url_doi)
-            except AttributeError:
-                try:
-                    just_doi = DOI_REGEX.search(item).group()
-                    object_uris.append('http://dx.doi.org/{}'.format(just_doi.replace('doi:', '').replace('DOI:', '').strip()))
-                except AttributeError:
-                    pass
+            url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
+            url_doi = url_doi.group() if url_doi else None
+            just_doi = format_doi_as_url(just_doi.group()) if just_doi else None
+            object_uris.append(url_doi or just_doi)
+
     return object_uris


@@ -137,10 +138,10 @@ def seperate_provider_object_uris(identifiers):
     object_uris = gather_object_uris(identifiers)
     provider_uris = []
     for item in identifiers:
-        try:
-            found_url = URL_REGEX.search(item).group()
-        except AttributeError:
-            found_url = None
+
+        found_url = URL_REGEX.search(item)
+        found_url = found_url.group() if found_url else None
+
         if found_url:
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)

From 8fc24fba99f7ea91eb652fd5dd4c1d79df4c4bed Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:25:53 -0400
Subject: [PATCH 16/22] Scale back extent of logic for parsing start times

---
 tasks.py | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/tasks.py b/tasks.py
index 0c610328..33d87b17 100644
--- a/tasks.py
+++ b/tasks.py
@@ -158,23 +158,8 @@ def harvester(harvester_name, async=False, start=None, end=None):
     if not registry.get(harvester_name):
         raise ValueError('No such harvesters {}'.format(harvester_name))

-    if end:
-        if parse(end).date() > date.today():
-            logger.info('End date is in the future, defaulting to today!')
-            end = date.today()
-        else:
-            end = parse(end).date()
-    else:
-        end = date.today()
-
-    if start:
-        if parse(start).date() > end:
-            logger.info('Start date is after end date, defaulting to standard start')
-            start = end - timedelta(settings.DAYS_BACK)
-        else:
-            start = parse(start).date()
-    else:
-        start = end - timedelta(settings.DAYS_BACK)
+    end = parse(end).date() if end else date.today()
+    start = parse(start).date() if start else end - timedelta(settings.DAYS_BACK)

     run_harvester.delay(harvester_name, start_date=start, end_date=end)

From 80e778eaac95d4cccc998e477afa25e6da4bb06f Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:33:03 -0400
Subject: [PATCH 17/22] Update to specify what will happen with just end

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index aef699b6..b79c2bef 100644
--- a/README.md
+++ b/README.md
@@ -210,8 +210,12 @@ For example, to run a harvester between the dates of March 14th and March 16th 2
 $ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```

-Either --start or --end can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+Either --start (-s) or --end (-e) can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+If --end is given with no --start, start will default to the number of days specified in ```settings.DAYS_BACK``` before the given end date.
+
+
+### Automated OAI-PMH Harvester Creation

 Writing a harvester for inclusion with scrAPI? If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.

From af54b885a406fca1dd9a84f3cc322ebe2ecaca27 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:37:23 -0400
Subject: [PATCH 18/22] Invole invoke

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b79c2bef..9122cf1f 100644
--- a/README.md
+++ b/README.md
@@ -202,7 +202,7 @@ For local development, running the ```mit``` harvester is recommended.

 Note: harvester-name is the same as the defined harvester "short name".

-Invoke a harvester for a certain start date with the ```--start``` argument. Invole a harvester for a certain end date with the ```--end``` argument.
+Invoke a harvester for a certain start date with the ```--start``` or ```-s``` argument. Invoke a harvester for a certain end date with the ```--end``` or ```-e``` argument.

 For example, to run a harvester between the dates of March 14th and March 16th 2015, run:

@@ -210,7 +210,7 @@ For example, to run a harvester between the dates of March 14th and March 16th 2
 $ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```

-Either --start (-s) or --end (-e) can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
+Either --start or --end can also be used on their own. Not supplying arguments will default to starting the number of days specified in ```settings.DAYS_BACK``` and ending on the current date.
 If --end is given with no --start, start will default to the number of days specified in ```settings.DAYS_BACK``` before the given end date.
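As a quick aside (an editor's sketch, not part of the series): the next patch
factors the repeated match-or-None checks into a `maybe_group` helper. Roughly,
it behaves like this, reusing the URL_REGEX from PATCH 03:

```python
import re

URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')

def maybe_group(match):
    # Return the matched text, or None when the regex found nothing.
    return match.group() if match else None

print(maybe_group(URL_REGEX.search('see [http://dx.doi.org/10.1680/geot.11.P.130]')))
# http://dx.doi.org/10.1680/geot.11.P.130
print(maybe_group(URL_REGEX.search('no url here')))
# None
```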
From a51b57394e65a890aa3f9c604b4b0e3f88788cac Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 13:46:03 -0400
Subject: [PATCH 19/22] Make match group or none a separate function

---
 scrapi/base/helpers.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index ee788717..60b2e6e6 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -122,14 +122,21 @@ def gather_identifiers(args):
     return identifiers


+def maybe_group(match):
+    '''
+    Evaluates a regular expression match object; returns the group or None
+    '''
+    return match.group() if match else None
+
+
 def gather_object_uris(identifiers):
     object_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
             url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
-            url_doi = url_doi.group() if url_doi else None
-            just_doi = format_doi_as_url(just_doi.group()) if just_doi else None
-            object_uris.append(url_doi or just_doi)
+            url_doi = maybe_group(url_doi)
+            just_doi = maybe_group(just_doi)
+            object_uris.append(url_doi or format_doi_as_url(just_doi))

     return object_uris

@@ -139,8 +146,7 @@ def seperate_provider_object_uris(identifiers):
     provider_uris = []
     for item in identifiers:

-        found_url = URL_REGEX.search(item)
-        found_url = found_url.group() if found_url else None
+        found_url = maybe_group(URL_REGEX.search(item))

         if found_url:
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)

From 30e4b6007b4ce6ae486b66b99e7c751e11685cf1 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 14:53:41 -0400
Subject: [PATCH 20/22] Add sublime files to gitignore

now also closes [#SHARE-53]

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index e4109c0a..a92a9650 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,5 @@ archive/*

 # Emacs edited files:
 *.*~
+
+*.sublime*

From 7d47e9c556c73ddfebf0ca8c48102750a7484520 Mon Sep 17 00:00:00 2001
From: erinspace
Date: Thu, 17 Sep 2015 15:35:35 -0400
Subject: [PATCH 21/22] Add check for doi or none

---
 scrapi/base/helpers.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
index 60b2e6e6..cf663c24 100644
--- a/scrapi/base/helpers.py
+++ b/scrapi/base/helpers.py
@@ -106,8 +106,9 @@ def format_tags(all_tags, sep=','):


 def format_doi_as_url(doi):
-    plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
-    return 'http://dx.doi.org/{}'.format(plain_doi)
+    if doi:
+        plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
+        return 'http://dx.doi.org/{}'.format(plain_doi)

From 75725c92b9a353be9a3aabb8b21adb027f5ce267 Mon Sep 17 00:00:00 2001
From: Fabian von Feilitzsch
Date: Thu, 17 Sep 2015 15:56:48 -0400
Subject: [PATCH 22/22] update CHANGELOG

---
 CHANGELOG | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index a7a90a7b..ad02c471 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,17 @@
 ChangeLog
 *********

+0.9.6 (2015-09-17)
+==================
+- Elasticsearch now retries requests after connection errors
+- Calhoun harvester now ignores that the SSL cert is invalid
+- OAI url parser now terminates the regex capture after finding an invalid DOI
+character
+- harvester invoke task now puts the default start date as settings.DAYS_BACK
+days before the end date
+- scrapi.requests now exposes the requests.exceptions module
+- Update README.md to document the new date arguments
+
 0.9.5 (2015-09-14)
 ==================
 - Clinical Trials harvester now dumps lxml elements to dictionaries in