Merge branch 'develop' into feature/postgres_api

NeuroVault · Sep 18, 2015 · e53776e · e53776e
2 parents bb36778 + cb3896e
commit e53776e
Show file tree

Hide file tree

Showing 65 changed files with 40,776 additions and 714 deletions.
diff --git a/.gitignore b/.gitignore
@@ -65,3 +65,5 @@ archive/*
 
 # Emacs edited files:
 *.*~
+
+*.sublime*
diff --git a/CHANGELOG b/CHANGELOG
@@ -2,6 +2,33 @@
 ChangeLog
 *********
 
+0.9.6 (2015-09-17)
+==================
+- Elasticsearch now retries requests after connection errors
+- Calhoun harvester now ignores that the SSL cert is invalid
+- OAI url parser now terminates the regex capture after finding an invalid DOI
+character
+- harvester invoke task now puts the default start date as settings.DAYS_BACK
+days before the end date
+- scrapi.requests now exposes the requests.exceptions module
+- Update README.md with updated date information
+
+0.9.5 (2015-09-14)
+==================
+- Clinical Trials harvester now dumps lxml elements to dictionaries in
+	the otherProperties field
+
+
+0.9.4 (2015-09-10)
+==================
+- Biomedcentral harvester now filters out results from the future
+
+0.9.3 (2015-09-01)
+==================
+- Capture more uris from pubmedcentral harvester
+- Update favicons so that all favicons are .icos (fixes IE display bug)
+- Fix longname for Portland State University harvester
+
 0.9.2 (2015-09-01)
 ==================
 - fix specification of canonicalUri requirements in schema

diff --git a/README.md b/README.md
@@ -1,14 +1,14 @@
 scrapi
 ======
 
-```master``` build status: [![Build Status](https://travis-ci.org/fabianvf/scrapi.svg?branch=master)](https://travis-ci.org/fabianvf/scrapi)
+```master``` build status: [![Build Status](https://travis-ci.org/CenterForOpenScience/scrapi.svg?branch=master)](https://travis-ci.org/CenterForOpenScience/scrapi)
 
 
-```develop``` build status: [![Build Status](https://travis-ci.org/fabianvf/scrapi.svg?branch=develop)](https://travis-ci.org/fabianvf/scrapi)
+```develop``` build status: [![Build Status](https://travis-ci.org/CenterForOpenScience/scrapi.svg?branch=develop)](https://travis-ci.org/CenterForOpenScience/scrapi)
 
 
-[![Coverage Status](https://coveralls.io/repos/fabianvf/scrapi/badge.svg?branch=develop)](https://coveralls.io/r/fabianvf/scrapi?branch=develop)
-[![Code Climate](https://codeclimate.com/github/fabianvf/scrapi/badges/gpa.svg)](https://codeclimate.com/github/fabianvf/scrapi)
+[![Coverage Status](https://coveralls.io/repos/CenterForOpenScience/scrapi/badge.svg?branch=develop)](https://coveralls.io/r/CenterForOpenScience/scrapi?branch=develop)
+[![Code Climate](https://codeclimate.com/github/CenterForOpenScience/scrapi/badges/gpa.svg)](https://codeclimate.com/github/CenterForOpenScience/scrapi)
 
 ## Getting started
 
@@ -240,12 +240,20 @@ For local development, running the ```mit``` harvester is recommended.
 
 Note: harvester-name is the same as the defined harvester "short name".
 
-Invoke a harvester a certain number of days back with the ```--days``` argument. For example, to run a harvester 5 days in the past, run:
+Invoke a harvester for a certain start date with the ```--start``` or ```-s``` argument. Invoke a harvester for a certain end date with the ```--end``` or ```-e``` argument.
+
+For example, to run a harvester between the dates of March 14th and March 16th 2015, run:
 
 ```bash
-$ invoke harvester harvester-name --days=5
+$ invoke harvester harvester-name --start 2015-03-14 --end 2015-03-16
 ```
 
+Either --start or --end can also be used on its own. Not supplying either argument will default to starting ```settings.DAYS_BACK``` days before the current date and ending on the current date.
+
+If --end is given with no --start, start will default to the number of days specified in ```settings.DAYS_BACK``` before the given end date.
+
+
+### Automated OAI PMH Harvester Creation
 Writing a harvester for inclusion with scrAPI?  If the provider makes their metadata available using the OAI-PMH standard, then [autooai](https://github.com/erinspace/autooai) is a utility that will do most of the work for you.
 
 

diff --git a/img/favicons/asu_favicon.ico b/img/favicons/asu_favicon.ico
diff --git a/img/favicons/calhoun_favicon.ico b/img/favicons/calhoun_favicon.ico
diff --git a/img/favicons/calpoly_favicon.ico b/img/favicons/calpoly_favicon.ico
diff --git a/img/favicons/dash_favicon.ico b/img/favicons/dash_favicon.ico
diff --git a/img/favicons/dryad_favicon.ico b/img/favicons/dryad_favicon.ico
diff --git a/img/favicons/hacettepe_favicon.ico b/img/favicons/hacettepe_favicon.ico
diff --git a/img/favicons/iowaresearch_favicon.ico b/img/favicons/iowaresearch_favicon.ico
diff --git a/img/favicons/kent_favicon.ico b/img/favicons/kent_favicon.ico
diff --git a/img/favicons/mblwhoilibrary_favicon.ico b/img/favicons/mblwhoilibrary_favicon.ico
diff --git a/img/favicons/mit_favicon.ico b/img/favicons/mit_favicon.ico
diff --git a/img/favicons/oaktrust_favicon.ico b/img/favicons/oaktrust_favicon.ico
diff --git a/img/favicons/osf_favicon.ico b/img/favicons/osf_favicon.ico
diff --git a/img/favicons/pcom_favicon.ico b/img/favicons/pcom_favicon.ico
diff --git a/img/favicons/pdxscholar_favicon.ico b/img/favicons/pdxscholar_favicon.ico
diff --git a/img/favicons/plos_favicon.ico b/img/favicons/plos_favicon.ico
diff --git a/img/favicons/pubmedcentral_favicon.ico b/img/favicons/pubmedcentral_favicon.ico
diff --git a/img/favicons/scholarsarchiveosu_favicon.ico b/img/favicons/scholarsarchiveosu_favicon.ico
diff --git a/img/favicons/shareok_favicon.ico b/img/favicons/shareok_favicon.ico
diff --git a/img/favicons/smithsonian_favicon.ico b/img/favicons/smithsonian_favicon.ico
diff --git a/img/favicons/springer_favicon.ico b/img/favicons/springer_favicon.ico
diff --git a/img/favicons/tdar_favicon.ico b/img/favicons/tdar_favicon.ico
diff --git a/img/favicons/texasstate_favicon.ico b/img/favicons/texasstate_favicon.ico
diff --git a/img/favicons/ucescholarship_favicon.ico b/img/favicons/ucescholarship_favicon.ico
diff --git a/img/favicons/uiuc_favicon.ico b/img/favicons/uiuc_favicon.ico
diff --git a/img/favicons/uiucideals_favicon.ico b/img/favicons/uiucideals_favicon.ico
diff --git a/img/favicons/upennsylvania_favicon.ico b/img/favicons/upennsylvania_favicon.ico
diff --git a/img/favicons/utaustin_favicon.ico b/img/favicons/utaustin_favicon.ico
diff --git a/img/favicons/utktrace_favicon.ico b/img/favicons/utktrace_favicon.ico
diff --git a/img/favicons/valposcholar_favicon.ico b/img/favicons/valposcholar_favicon.ico
diff --git a/img/favicons/vtech_favicon.ico b/img/favicons/vtech_favicon.ico
diff --git a/requirements.txt b/requirements.txt
@@ -23,3 +23,4 @@ django-cors-headers==1.1.0
 psycopg2==2.6.1
 rfc3987==1.3.4
 strict-rfc3339==0.5
+xmltodict==0.9.2
diff --git a/scrapi/base/__init__.py b/scrapi/base/__init__.py
@@ -21,7 +21,7 @@
     build_properties,
     oai_get_records_and_token,
     compose,
-    date_formatter,
+    datetime_formatter,
     null_on_error,
     coerce_to_list
 )
@@ -154,7 +154,7 @@ def formatted_properties(self):
 
     def format_property(self, property):
         if property == 'date':
-            fn = compose(lambda x: map(null_on_error(date_formatter), x), coerce_to_list, self.resolve_property)
+            fn = compose(lambda x: map(null_on_error(datetime_formatter), x), coerce_to_list, self.resolve_property)
         else:
             fn = self.resolve_property
         return (property, (

diff --git a/scrapi/base/helpers.py b/scrapi/base/helpers.py
@@ -15,12 +15,22 @@
 from scrapi import requests
 
 
-URL_REGEX = re.compile(r'(https?://\S*\.\S*)')
+URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
 
-''' Takes a value, returns a function that always returns that value
-    Useful inside schemas for defining constants '''
-CONSTANT = lambda x: lambda *_, **__: x
+
+def CONSTANT(x):
+    ''' Takes a value, returns a function that always returns that value
+        Useful inside schemas for defining constants
+
+        >>> CONSTANT(7)('my', 'name', verb='is')
+        7
+        >>> CONSTANT([123, 456])()
+        [123, 456]
+    '''
+    def inner(*y, **z):
+        return x
+    return inner
 
 
 def build_properties(*args):
@@ -48,15 +58,37 @@ def build_property(name, expr, description=None, uri=None):
 
 
 def single_result(l, default=''):
-    return l[0] if l else default
+    ''' A function that will return the first element of a list if it exists
 
+        >>> print(single_result(['hello', None]))
+        hello
+        >>> print(single_result([], default='hello'))
+        hello
+        >>> print(single_result([]))
+        <BLANKLINE>
 
-def compose(*functions):
     '''
-    evaluates functions from right to left.
-    ex. compose(f, g)(*x, **y) = f(g(*x, **y))
+    return l[0] if l else default
+
 
-    credit to sloria
+def compose(*functions):
+    ''' evaluates functions from right to left.
+
+        >>> add = lambda x, y: x + y
+        >>> add3 = lambda x: x + 3
+        >>> divide2 = lambda x: x/2
+        >>> subtract4 = lambda x: x - 4
+        >>> subtract1 = compose(add3, subtract4)
+        >>> subtract1(1)
+        0
+        >>> compose(subtract1, add3)(4)
+        6
+        >>> compose(int, add3, add3, divide2)(4)
+        8
+        >>> compose(int, divide2, add3, add3)(4)
+        5
+        >>> compose(int, divide2, compose(add3, add3), add)(7, 3)
+        8
     '''
     def inner(func1, func2):
         return lambda *x, **y: func1(func2(*x, **y))
@@ -65,8 +97,18 @@ def inner(func1, func2):
 
 def updated_schema(old, new):
     ''' Creates a dictionary resulting from adding all keys/values of the second to the first
+        The second dictionary will overwrite the first.
 
-    The second dictionary will overwrite the first.'''
+        >>> old, new = {'name': 'ric', 'job': None}, {'name': 'Rick'}
+        >>> updated = updated_schema(old, new)
+        >>> len(updated.keys())
+        2
+        >>> print(updated['name'])
+        Rick
+        >>> updated['job'] is None
+        True
+
+    '''
     d = deepcopy(old)
     for key, value in new.items():
         if isinstance(value, dict) and old.get(key) and isinstance(old[key], dict):
@@ -77,18 +119,28 @@ def updated_schema(old, new):
 
 
 def default_name_parser(names):
-    contributor_list = []
-    for person in names:
-        name = HumanName(person)
-        contributor = {
-            'name': person,
-            'givenName': name.first,
-            'additionalName': name.middle,
-            'familyName': name.last,
-        }
-        contributor_list.append(contributor)
+    ''' Takes a list of names, and attempts to parse them
+    '''
+    return list(map(maybe_parse_name, names))
 
-    return contributor_list
+
+def maybe_parse_name(name):
+    ''' Tries to parse a name. If the parsing fails, returns a dictionary
+        with just the unparsed name (as per the SHARE schema)
+    '''
+    return null_on_error(parse_name)(name) or {'name': name}
+
+
+def parse_name(name):
+    ''' Takes a human name, parses it into given/middle/last names
+    '''
+    person = HumanName(name)
+    return {
+        'name': name,
+        'givenName': person.first,
+        'additionalName': person.middle,
+        'familyName': person.last
+    }
 
 
 def format_tags(all_tags, sep=','):
@@ -105,7 +157,13 @@ def format_tags(all_tags, sep=','):
     return list(set([six.text_type(tag.lower().strip()) for tag in tags if tag.strip()]))
 
 
-def oai_process_uris(*args):
+def format_doi_as_url(doi):
+    if doi:
+        plain_doi = doi.replace('doi:', '').replace('DOI:', '').strip()
+        return 'http://dx.doi.org/{}'.format(plain_doi)
+
+
+def gather_identifiers(args):
     identifiers = []
     for arg in args:
         if isinstance(arg, list):
@@ -114,28 +172,58 @@ def oai_process_uris(*args):
         elif arg:
             identifiers.append(arg)
 
+    return identifiers
+
+
+def maybe_group(match):
+    '''
+    Evaluates a regular expression match object; returns the matched text, or None if there is no match
+    '''
+    return match.group() if match else None
+
+
+def gather_object_uris(identifiers):
     object_uris = []
-    provider_uris = []
     for item in identifiers:
         if 'doi' in item.lower():
-            doi = item.replace('doi:', '').replace('DOI:', '').strip()
-            if 'http://dx.doi.org/' in doi:
-                object_uris.append(doi)
-            else:
-                object_uris.append('http://dx.doi.org/{}'.format(doi))
+            url_doi, just_doi = URL_REGEX.search(item), DOI_REGEX.search(item)
+            url_doi = maybe_group(url_doi)
+            just_doi = maybe_group(just_doi)
+            object_uris.append(url_doi or format_doi_as_url(just_doi))
+
+    return object_uris
+
+
+def seperate_provider_object_uris(identifiers):
+    object_uris = gather_object_uris(identifiers)
+    provider_uris = []
+    for item in identifiers:
+
+        found_url = maybe_group(URL_REGEX.search(item))
 
-        try:
-            found_url = URL_REGEX.search(item).group()
-        except AttributeError:
-            found_url = None
         if found_url:
             if 'viewcontent' in found_url:
                 object_uris.append(found_url)
             else:
-                provider_uris.append(found_url)
+                if 'dx.doi.org' not in found_url:
+                    provider_uris.append(found_url)
+
+    return provider_uris, object_uris
+
+
+def oai_process_uris(*args, **kwargs):
+    use_doi = kwargs.get('use_doi', False)
+
+    identifiers = gather_identifiers(args)
+    provider_uris, object_uris = seperate_provider_object_uris(identifiers)
 
     try:
-        canonical_uri = (provider_uris + object_uris)[0]
+        if use_doi:
+            for uri in object_uris:
+                if 'dx.doi.org' in uri:
+                    canonical_uri = uri
+        else:
+            canonical_uri = (provider_uris + object_uris)[0]
     except IndexError:
         raise ValueError('No Canonical URI was returned for this record.')
 
@@ -242,17 +330,34 @@ def inner(*args, **kwargs):
 
 
 def coerce_to_list(thing):
-    ''' If a value is not already a list or tuple, puts that value in a length 1 list'''
+    ''' If a value is not already a list or tuple, puts that value in a length 1 list
+
+        >>> niceties = coerce_to_list('hello')
+        >>> len(niceties)
+        1
+        >>> print(niceties[0])
+        hello
+        >>> niceties2 = coerce_to_list(['hello'])
+        >>> niceties2 == niceties
+        True
+        >>> niceties3 = (coerce_to_list(('hello', 'goodbye')))
+        >>> len(niceties3)
+        2
+        >>> print(niceties3[0])
+        hello
+        >>> print(niceties3[1])
+        goodbye
+    '''
     if not (isinstance(thing, list) or isinstance(thing, tuple)):
         return [thing]
-    return thing
+    return list(thing)
 
 
-def date_formatter(date_string):
+def datetime_formatter(datetime_string):
     '''Takes an arbitrary date/time string and parses it, adds time
     zone information and returns a valid ISO-8601 datetime string
     '''
-    date_time = parser.parse(date_string)
+    date_time = parser.parse(datetime_string)
     if not date_time.tzinfo:
         date_time = date_time.replace(tzinfo=pytz.UTC)
     return date_time.isoformat()
diff --git a/scrapi/base/schemas.py b/scrapi/base/schemas.py
@@ -5,7 +5,7 @@
     format_tags,
     single_result,
     language_codes,
-    date_formatter,
+    datetime_formatter,
     oai_process_uris,
     build_properties,
     default_name_parser,
@@ -17,7 +17,7 @@
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
     "contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
     "title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
-    "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(date_formatter, single_result)),
+    "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
     "uris": {
         "canonicalUri": ('//dcq:identifier-citation/node()', compose(lambda x: x.strip(), single_result)),
         "objectUris": [('//dc:doi/node()', compose(lambda x: 'http://dx.doi.org/' + x, single_result))]
@@ -53,7 +53,7 @@
 OAISCHEMA = {
     "contributors": ('//dc:creator/node()', '//dc:contributor/node()', oai_process_contributors),
     "uris": ('//dc:doi/node()', '//dc:identifier/node()', oai_process_uris),
-    'providerUpdatedDateTime': ('//ns0:header/ns0:datestamp/node()', compose(date_formatter, single_result)),
+    'providerUpdatedDateTime': ('//ns0:header/ns0:datestamp/node()', compose(datetime_formatter, single_result)),
     'title': ('//dc:title/node()', single_result),
     'description': ('//dc:description/node()', single_result),
     'subjects': ('//dc:subject/node()', format_tags),
Original file line number	Diff line number	Diff line change
Expand Up		@@ -65,3 +65,5 @@ archive/*

		# Emacs edited files:
		*.*~

		*.sublime*