Skip to content

Commit

Permalink
strip nonprinting characters, including unicode, from susceptible inp…
Browse files Browse the repository at this point in the history
…ut ids

not stripping from tiids and cids right now… they seem safe. For completeness, maybe do this in the future? Not sure.
  • Loading branch information
hpiwowar committed Aug 14, 2013
1 parent 4b9a498 commit c10b6bb
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 37 deletions.
28 changes: 28 additions & 0 deletions test/unit_tests/test_unicode_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from nose.tools import raises, assert_equals, nottest

from totalimpact import unicode_helpers



class TestUnicodeHelpers():
    """Unit tests for totalimpact.unicode_helpers.remove_nonprinting_characters."""

    def setUp(self):
        pass

    def test_remove_nonprinting_characters(self):
        # plain printable unicode passes through untouched
        unicode_input = u"hi"
        response = unicode_helpers.remove_nonprinting_characters(unicode_input)
        expected = u"hi"
        assert_equals(response, expected)

    def test_remove_nonprinting_characters_utf8_byte_input(self):
        # BUG FIX: this method previously reused the name
        # test_remove_nonprinting_characters, so the passthrough test above was
        # shadowed and never ran; renamed so both tests are collected.
        byte_input = '0000-0001-8907-4150\xe2\x80\x8e'  # utf-8-encoded U+200E at the end
        response = unicode_helpers.remove_nonprinting_characters(byte_input)
        expected = "0000-0001-8907-4150"
        assert_equals(response, expected)

    def test_remove_nonprinting_characters_unicode_input(self):
        unicode_input = u'0000-0001-8907-4150\u200e'  # a nonprinting character at the end
        response = unicode_helpers.remove_nonprinting_characters(unicode_input)
        expected = u"0000-0001-8907-4150"
        assert_equals(response, expected)
8 changes: 8 additions & 0 deletions test/unit_tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,14 @@ def test_memberitems_get(self):
assert_equals(json.loads(response.data)["memberitems"], GOLD_MEMBER_ITEM_CONTENT)
assert_equals(response.mimetype, "application/json")

def test_memberitems_get_with_nonprinting_character(self):
response = self.client.get(u'/provider/dryad/memberitems/Otto\u200e%2C%20Sarah%20P.?method=sync')
print response
print response.data
assert_equals(response.status_code, 200)
assert_equals(json.loads(response.data)["memberitems"], GOLD_MEMBER_ITEM_CONTENT)
assert_equals(response.mimetype, "application/json")

def test_file_parsing(self):
datadir = os.path.join(os.path.split(__file__)[0], "../../extras/sample_provider_pages/bibtex")
path = os.path.join(datadir, "Vision.bib")
Expand Down
14 changes: 5 additions & 9 deletions totalimpact/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from totalimpact.providers.provider import ProviderFactory
from totalimpact.providers.provider import ProviderTimeout, ProviderServerError
from totalimpact import unicode_helpers

from totalimpact import default_settings
from totalimpact.utils import Retry

Expand All @@ -13,11 +15,6 @@
import logging
logger = logging.getLogger('ti.item')

# setup to remove control characters from received IDs
# from http://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
control_char_re = re.compile('[%s]' % re.escape(control_chars))

class NotAuthenticatedError(Exception):
pass

Expand All @@ -36,9 +33,8 @@ def largest_value_that_is_less_than_or_equal_to(target, collection):

def clean_id(nid):
try:
nid = control_char_re.sub('', nid)
nid = nid.replace(u'\u200b', "")
nid = nid.strip()
nid = unicode_helpers.remove_nonprinting_characters(nid)
except TypeError:
#isn't a string. That's ok, might be biblio
pass
Expand Down Expand Up @@ -399,6 +395,8 @@ def create_or_find_items_from_aliases(clean_aliases, myredis, mydao):
new_items = []
for alias in clean_aliases:
(namespace, nid) = alias
namespace = clean_id(namespace)
nid = clean_id(nid)
existing_tiid = get_tiid_by_alias(namespace, nid, mydao)
if existing_tiid:
tiids.append(existing_tiid)
Expand All @@ -411,8 +409,6 @@ def create_or_find_items_from_aliases(clean_aliases, myredis, mydao):
alias=alias
))
item = make()
namespace = clean_id(namespace)
nid = clean_id(nid)
item["aliases"][namespace] = [nid]
item["aliases"] = canonical_aliases(item["aliases"])

Expand Down
4 changes: 2 additions & 2 deletions totalimpact/providers/bibtex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from totalimpact.providers import provider
from totalimpact.providers.provider import Provider, ProviderContentMalformedError, ProviderTimeout, ProviderServerError
from totalimpact import utils
from totalimpact import unicode_helpers
from totalimpact.providers import bibtex_lookup

import logging
Expand Down Expand Up @@ -42,7 +42,7 @@ def __init__(self):
self.bibtex_to_unicode = build_bibtex_to_unicode(bibtex_lookup.unicode_to_latex)

def _to_unicode(self, text):
text = utils.to_unicode_or_bust(text)
text = unicode_helpers.to_unicode_or_bust(text)
if "{" in text:
text = text.replace("\\", "")
for i, j in self.bibtex_to_unicode.iteritems():
Expand Down
4 changes: 2 additions & 2 deletions totalimpact/providers/webpage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from totalimpact.providers import provider
from totalimpact.providers.provider import Provider, ProviderContentMalformedError
from totalimpact import utils
from totalimpact import unicode_helpers

import lxml.html
import re
Expand Down Expand Up @@ -89,7 +89,7 @@ def _extract_biblio(self, page, id=None):
if not page:
return biblio_dict

unicode_page = utils.to_unicode_or_bust(page)
unicode_page = unicode_helpers.to_unicode_or_bust(page)
try:
parsed_html = lxml.html.document_fromstring(unicode_page)

Expand Down
31 changes: 31 additions & 0 deletions totalimpact/unicode_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import unicodedata
import logging

logger = logging.getLogger('ti.unicode_helpers')

#from http://farmdev.com/talks/unicode/
#from http://farmdev.com/talks/unicode/
def to_unicode_or_bust(obj, encoding='utf-8'):
    """Return *obj* as unicode, decoding byte strings with *encoding*.

    Unicode strings and non-string objects are returned unchanged.
    """
    if not isinstance(obj, basestring):
        return obj
    if isinstance(obj, unicode):
        return obj
    return unicode(obj, encoding)


def remove_nonprinting_characters(input, encoding='utf-8'):
    """Strip nonprinting characters (e.g. U+200E, control codes) from a string.

    Accepts either a unicode string or a byte string decoded with *encoding*;
    the return value has the same type as the argument.  Non-string input
    raises TypeError (callers such as clean_id rely on that to skip
    non-string ids like biblio dicts).
    """
    input_was_unicode = True
    if isinstance(input, basestring):
        if not isinstance(input, unicode):
            input_was_unicode = False

    unicode_input = to_unicode_or_bust(input)

    # Unicode general categories to drop, matched on the first letter:
    # C (control/format), M (combining marks), Z (separators)
    # see http://www.fileformat.info/info/unicode/category/index.htm
    char_classes_to_remove = ["C", "M", "Z"]

    # BUG FIX: an ordinary space (U+0020) is category Zs but is a printing
    # character; dropping it mangled multi-word input such as the memberitems
    # query u"Otto, Sarah P.".  Keep it explicitly while still removing the
    # other separators (U+200B zero-width space is Zs, for example).
    response = u''.join(c for c in unicode_input
                        if c == u' '
                        or unicodedata.category(c)[0] not in char_classes_to_remove)

    if not input_was_unicode:
        response = response.encode(encoding)

    return response

8 changes: 0 additions & 8 deletions totalimpact/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,3 @@ def fn(*args, **kwargs):
return False # fail silently...
return fn


#from http://farmdev.com/talks/unicode/
def to_unicode_or_bust(obj, encoding='utf-8'):
if isinstance(obj, basestring):
if not isinstance(obj, unicode):
obj = unicode(obj, encoding)
return obj

35 changes: 19 additions & 16 deletions totalimpact/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from totalimpact import item as item_module
from totalimpact.models import MemberItems, UserFactory, NotAuthenticatedError
from totalimpact.providers.provider import ProviderFactory, ProviderItemNotFoundError, ProviderError, ProviderServerError, ProviderTimeout
from totalimpact import unicode_helpers
from totalimpact import default_settings
import logging

Expand Down Expand Up @@ -325,6 +326,7 @@ def provider_memberitems_get(provider_name, query):
"""
Gets aliases associated with a query from a given provider.
"""
query = unicode_helpers.remove_nonprinting_characters(query)

try:
provider = ProviderFactory.get_provider(provider_name)
Expand Down Expand Up @@ -488,6 +490,19 @@ def delete_items(cid=""):
return resp


def get_alias_strings(aliases):
    """Render (namespace, nid) alias pairs as "namespace:nid" strings.

    Both parts are cleaned first; nids that are not strings (biblio dicts)
    are serialized to json before concatenation.
    """
    alias_strings = []
    for (namespace, nid) in aliases:
        namespace = item_module.clean_id(namespace)
        nid = item_module.clean_id(nid)
        try:
            rendered = namespace + ":" + nid
        except TypeError:
            # jsonify the biblio dicts
            rendered = namespace + ":" + json.dumps(nid)
        alias_strings.append(rendered)
    return alias_strings


@app.route("/collection/<cid>/items", methods=["PUT"])
@app.route("/v1/collection/<cid>/items", methods=["PUT"])
def put_collection(cid=""):
Expand All @@ -499,12 +514,7 @@ def put_collection(cid=""):

try:
aliases = request.json["aliases"]
try:
alias_strings = [namespace+":"+nid for (namespace, nid) in aliases]
except TypeError:
# jsonify the biblio dicts
alias_strings = [namespace+":"+json.dumps(nid) for (namespace, nid) in aliases]

alias_strings = get_alias_strings(aliases)
(tiids, new_items) = item_module.create_or_update_items_from_aliases(
aliases, myredis, mydao)

Expand All @@ -531,6 +541,7 @@ def put_collection(cid=""):
return resp



""" Updates all the items in a given collection.
"""
@app.route("/collection/<cid>", methods=["POST"])
Expand All @@ -555,7 +566,6 @@ def collection_update(cid=""):
return resp



# creates a collection with aliases
@app.route('/collection', methods=['POST'])
@app.route('/v1/collection', methods=['POST'])
Expand All @@ -574,9 +584,6 @@ def collection_create():
coll["title"] = request.json["title"]
aliases = request.json["aliases"]
(tiids, new_items) = item_module.create_or_update_items_from_aliases(aliases, myredis, mydao)
for item in new_items:
namespaces = item["aliases"].keys()

if not tiids:
abort_custom(404, "POST /collection requires a list of [namespace, id] pairs.")
except (AttributeError, TypeError):
Expand All @@ -587,14 +594,10 @@ def collection_create():
json=str(request.json)))
abort_custom(404, "Missing arguments.")

try:
alias_strings = aliases_strings = [namespace+":"+nid for (namespace, nid) in aliases]
except TypeError:
# jsonify the biblio dicts
alias_strings = aliases_strings = [namespace+":"+json.dumps(nid) for (namespace, nid) in aliases]
alias_strings = get_alias_strings(aliases)

# save dict of alias:tiid
coll["alias_tiids"] = dict(zip(aliases_strings, tiids))
coll["alias_tiids"] = dict(zip(alias_strings, tiids))

logger.info(json.dumps(coll, sort_keys=True, indent=4))

Expand Down

0 comments on commit c10b6bb

Please sign in to comment.