diff --git a/test/unit_tests/test_unicode_helpers.py b/test/unit_tests/test_unicode_helpers.py new file mode 100644 index 00000000..9b65c0ae --- /dev/null +++ b/test/unit_tests/test_unicode_helpers.py @@ -0,0 +1,28 @@ +from nose.tools import raises, assert_equals, nottest + +from totalimpact import unicode_helpers + + + +class TestUnicodeHelpers(): + + def setUp(self): + pass + + def test_remove_nonprinting_characters(self): + unicode_input = u"hi" + response = unicode_helpers.remove_nonprinting_characters(unicode_input) + expected = u"hi" + assert_equals(response, expected) + + def test_remove_nonprinting_characters(self): + unicode_input = '0000-0001-8907-4150\xe2\x80\x8e' # a nonprinting character at the end + response = unicode_helpers.remove_nonprinting_characters(unicode_input) + expected = "0000-0001-8907-4150" + assert_equals(response, expected) + + def test_remove_nonprinting_characters_unicode_input(self): + unicode_input = u'0000-0001-8907-4150\u200e' # a nonprinting character at the end + response = unicode_helpers.remove_nonprinting_characters(unicode_input) + expected = u"0000-0001-8907-4150" + assert_equals(response, expected) diff --git a/test/unit_tests/test_views.py b/test/unit_tests/test_views.py index f74e883f..6f6335e2 100644 --- a/test/unit_tests/test_views.py +++ b/test/unit_tests/test_views.py @@ -219,6 +219,14 @@ def test_memberitems_get(self): assert_equals(json.loads(response.data)["memberitems"], GOLD_MEMBER_ITEM_CONTENT) assert_equals(response.mimetype, "application/json") + def test_memberitems_get_with_nonprinting_character(self): + response = self.client.get(u'/provider/dryad/memberitems/Otto\u200e%2C%20Sarah%20P.?method=sync') + print response + print response.data + assert_equals(response.status_code, 200) + assert_equals(json.loads(response.data)["memberitems"], GOLD_MEMBER_ITEM_CONTENT) + assert_equals(response.mimetype, "application/json") + def test_file_parsing(self): datadir = os.path.join(os.path.split(__file__)[0], "../../extras/sample_provider_pages/bibtex") path = os.path.join(datadir, "Vision.bib") diff --git a/totalimpact/item.py b/totalimpact/item.py index 173561c6..b432ecec 100644 --- a/totalimpact/item.py +++ b/totalimpact/item.py @@ -4,6 +4,8 @@ from totalimpact.providers.provider import ProviderFactory from totalimpact.providers.provider import ProviderTimeout, ProviderServerError +from totalimpact import unicode_helpers + from totalimpact import default_settings from totalimpact.utils import Retry @@ -13,11 +15,6 @@ import logging logger = logging.getLogger('ti.item') -# setup to remove control characters from received IDs -# from http://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python -control_chars = ''.join(map(unichr, range(0,32) + range(127,160))) -control_char_re = re.compile('[%s]' % re.escape(control_chars)) - class NotAuthenticatedError(Exception): pass @@ -36,9 +33,8 @@ def largest_value_that_is_less_than_or_equal_to(target, collection): def clean_id(nid): try: - nid = control_char_re.sub('', nid) - nid = nid.replace(u'\u200b', "") nid = nid.strip() + nid = unicode_helpers.remove_nonprinting_characters(nid) except TypeError: #isn't a string. That's ok, might be biblio pass @@ -399,6 +395,8 @@ def create_or_find_items_from_aliases(clean_aliases, myredis, mydao): new_items = [] for alias in clean_aliases: (namespace, nid) = alias + namespace = clean_id(namespace) + nid = clean_id(nid) existing_tiid = get_tiid_by_alias(namespace, nid, mydao) if existing_tiid: tiids.append(existing_tiid) @@ -411,8 +409,6 @@ def create_or_find_items_from_aliases(clean_aliases, myredis, mydao): alias=alias )) item = make() - namespace = clean_id(namespace) - nid = clean_id(nid) item["aliases"][namespace] = [nid] item["aliases"] = canonical_aliases(item["aliases"]) diff --git a/totalimpact/providers/bibtex.py b/totalimpact/providers/bibtex.py index 60eec623..f7fb6ed2 100644 --- a/totalimpact/providers/bibtex.py +++ b/totalimpact/providers/bibtex.py @@ -7,7 +7,7 @@ from totalimpact.providers import provider from totalimpact.providers.provider import Provider, ProviderContentMalformedError, ProviderTimeout, ProviderServerError -from totalimpact import utils +from totalimpact import unicode_helpers from totalimpact.providers import bibtex_lookup import logging @@ -42,7 +42,7 @@ def __init__(self): self.bibtex_to_unicode = build_bibtex_to_unicode(bibtex_lookup.unicode_to_latex) def _to_unicode(self, text): - text = utils.to_unicode_or_bust(text) + text = unicode_helpers.to_unicode_or_bust(text) if "{" in text: text = text.replace("\\", "") for i, j in self.bibtex_to_unicode.iteritems(): diff --git a/totalimpact/providers/webpage.py b/totalimpact/providers/webpage.py index 4e1b1ad3..36fc41ae 100644 --- a/totalimpact/providers/webpage.py +++ b/totalimpact/providers/webpage.py @@ -1,6 +1,6 @@ from totalimpact.providers import provider from totalimpact.providers.provider import Provider, ProviderContentMalformedError -from totalimpact import utils +from totalimpact import unicode_helpers import lxml.html import re @@ -89,7 +89,7 @@ def _extract_biblio(self, page, id=None): if not page: return biblio_dict - unicode_page = utils.to_unicode_or_bust(page) + unicode_page = unicode_helpers.to_unicode_or_bust(page) try: parsed_html = lxml.html.document_fromstring(unicode_page) diff --git a/totalimpact/unicode_helpers.py b/totalimpact/unicode_helpers.py new file mode 100644 index 00000000..53074e22 --- /dev/null +++ b/totalimpact/unicode_helpers.py @@ -0,0 +1,31 @@ +import unicodedata +import logging + +logger = logging.getLogger('ti.unicode_helpers') + +#from http://farmdev.com/talks/unicode/ +def to_unicode_or_bust(obj, encoding='utf-8'): + if isinstance(obj, basestring): + if not isinstance(obj, unicode): + obj = unicode(obj, encoding) + return obj + + +def remove_nonprinting_characters(input, encoding='utf-8'): + input_was_unicode = True + if isinstance(input, basestring): + if not isinstance(input, unicode): + input_was_unicode = False + + unicode_input = to_unicode_or_bust(input) + + # see http://www.fileformat.info/info/unicode/category/index.htm + char_classes_to_remove = ["C", "M", "Z"] + + response = u''.join(c for c in unicode_input if unicodedata.category(c)[0] not in char_classes_to_remove) + + if not input_was_unicode: + response = response.encode(encoding) + + return response + diff --git a/totalimpact/utils.py b/totalimpact/utils.py index 99953ba5..afec7e18 100644 --- a/totalimpact/utils.py +++ b/totalimpact/utils.py @@ -41,11 +41,3 @@ def fn(*args, **kwargs): return False # fail silently... return fn - -#from http://farmdev.com/talks/unicode/ -def to_unicode_or_bust(obj, encoding='utf-8'): - if isinstance(obj, basestring): - if not isinstance(obj, unicode): - obj = unicode(obj, encoding) - return obj - diff --git a/totalimpact/views.py b/totalimpact/views.py index 124eb900..8be8f50c 100755 --- a/totalimpact/views.py +++ b/totalimpact/views.py @@ -12,6 +12,7 @@ from totalimpact import item as item_module from totalimpact.models import MemberItems, UserFactory, NotAuthenticatedError from totalimpact.providers.provider import ProviderFactory, ProviderItemNotFoundError, ProviderError, ProviderServerError, ProviderTimeout +from totalimpact import unicode_helpers from totalimpact import default_settings import logging @@ -325,6 +326,7 @@ def provider_memberitems_get(provider_name, query): """ Gets aliases associated with a query from a given provider. """ + query = unicode_helpers.remove_nonprinting_characters(query) try: provider = ProviderFactory.get_provider(provider_name) @@ -488,6 +490,19 @@ def delete_items(cid=""): return resp +def get_alias_strings(aliases): + alias_strings = [] + for (namespace, nid) in aliases: + namespace = item_module.clean_id(namespace) + nid = item_module.clean_id(nid) + try: + alias_strings += [namespace+":"+nid] + except TypeError: + # jsonify the biblio dicts + alias_strings += [namespace+":"+json.dumps(nid)] + return alias_strings + + @app.route("/collection//items", methods=["PUT"]) @app.route("/v1/collection//items", methods=["PUT"]) def put_collection(cid=""): @@ -499,12 +514,7 @@ def put_collection(cid=""): try: aliases = request.json["aliases"] - try: - alias_strings = [namespace+":"+nid for (namespace, nid) in aliases] - except TypeError: - # jsonify the biblio dicts - alias_strings = [namespace+":"+json.dumps(nid) for (namespace, nid) in aliases] - + alias_strings = get_alias_strings(aliases) (tiids, new_items) = item_module.create_or_update_items_from_aliases( aliases, myredis, mydao) @@ -531,6 +541,7 @@ def put_collection(cid=""): return resp + """ Updates all the items in a given collection. """ @app.route("/collection/", methods=["POST"]) @@ -555,7 +566,6 @@ def collection_update(cid=""): return resp - # creates a collection with aliases @app.route('/collection', methods=['POST']) @app.route('/v1/collection', methods=['POST']) @@ -574,9 +584,6 @@ def collection_create(): coll["title"] = request.json["title"] aliases = request.json["aliases"] (tiids, new_items) = item_module.create_or_update_items_from_aliases(aliases, myredis, mydao) - for item in new_items: - namespaces = item["aliases"].keys() - if not tiids: abort_custom(404, "POST /collection requires a list of [namespace, id] pairs.") except (AttributeError, TypeError): @@ -587,14 +594,10 @@ def collection_create(): json=str(request.json))) abort_custom(404, "Missing arguments.") - try: - alias_strings = aliases_strings = [namespace+":"+nid for (namespace, nid) in aliases] - except TypeError: - # jsonify the biblio dicts - alias_strings = aliases_strings = [namespace+":"+json.dumps(nid) for (namespace, nid) in aliases] + alias_strings = get_alias_strings(aliases) # save dict of alias:tiid - coll["alias_tiids"] = dict(zip(aliases_strings, tiids)) + coll["alias_tiids"] = dict(zip(alias_strings, tiids)) logger.info(json.dumps(coll, sort_keys=True, indent=4))