Skip to content

Commit

Permalink
strip nonprinting characters, including unicode, from susceptible inp…
Browse files Browse the repository at this point in the history
…ut ids

not stripping from tiids and cids right now… they seem safe. For completeness, maybe do this in the future? Not sure.
  • Loading branch information
hpiwowar committed Aug 14, 2013
1 parent 4b9a498 commit c10b6bb
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 37 deletions.
28 changes: 28 additions & 0 deletions test/unit_tests/test_unicode_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from nose.tools import raises, assert_equals, nottest

from totalimpact import unicode_helpers



class TestUnicodeHelpers():
    """Unit tests for totalimpact.unicode_helpers.remove_nonprinting_characters."""

    def setUp(self):
        pass

    def test_remove_nonprinting_characters(self):
        # plain printable unicode passes through untouched
        unicode_input = u"hi"
        response = unicode_helpers.remove_nonprinting_characters(unicode_input)
        expected = u"hi"
        assert_equals(response, expected)

    def test_remove_nonprinting_characters_utf8_byte_input(self):
        # BUG FIX: this method previously reused the name
        # test_remove_nonprinting_characters, so the passthrough test above was
        # shadowed and never ran; renamed so both tests are collected.
        byte_input = '0000-0001-8907-4150\xe2\x80\x8e'  # utf-8-encoded U+200E at the end
        response = unicode_helpers.remove_nonprinting_characters(byte_input)
        expected = "0000-0001-8907-4150"
        assert_equals(response, expected)

    def test_remove_nonprinting_characters_unicode_input(self):
        unicode_input = u'0000-0001-8907-4150\u200e'  # a nonprinting character at the end
        response = unicode_helpers.remove_nonprinting_characters(unicode_input)
        expected = u"0000-0001-8907-4150"
        assert_equals(response, expected)
8 changes: 8 additions & 0 deletions test/unit_tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,14 @@ def test_memberitems_get(self):
assert_equals(json.loads(response.data)["memberitems"], GOLD_MEMBER_ITEM_CONTENT)
assert_equals(response.mimetype, "application/json")

def test_memberitems_get_with_nonprinting_character(self):
response = self.client.get(u'/provider/dryad/memberitems/Otto\u200e%2C%20Sarah%20P.?method=sync')
print response
print response.data
assert_equals(response.status_code, 200)
assert_equals(json.loads(response.data)["memberitems"], GOLD_MEMBER_ITEM_CONTENT)
assert_equals(response.mimetype, "application/json")

def test_file_parsing(self):
datadir = os.path.join(os.path.split(__file__)[0], "../../extras/sample_provider_pages/bibtex")
path = os.path.join(datadir, "Vision.bib")
Expand Down
14 changes: 5 additions & 9 deletions totalimpact/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from totalimpact.providers.provider import ProviderFactory
from totalimpact.providers.provider import ProviderTimeout, ProviderServerError
from totalimpact import unicode_helpers

from totalimpact import default_settings
from totalimpact.utils import Retry

Expand All @@ -13,11 +15,6 @@
import logging
logger = logging.getLogger('ti.item')

# setup to remove control characters from received IDs
# from http://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
control_chars = ''.join(map(unichr, range(0,32) + range(127,160)))
control_char_re = re.compile('[%s]' % re.escape(control_chars))

class NotAuthenticatedError(Exception):
pass

Expand All @@ -36,9 +33,8 @@ def largest_value_that_is_less_than_or_equal_to(target, collection):

def clean_id(nid):
try:
nid = control_char_re.sub('', nid)
nid = nid.replace(u'\u200b', "")
nid = nid.strip()
nid = unicode_helpers.remove_nonprinting_characters(nid)
except TypeError:
#isn't a string. That's ok, might be biblio
pass
Expand Down Expand Up @@ -399,6 +395,8 @@ def create_or_find_items_from_aliases(clean_aliases, myredis, mydao):
new_items = []
for alias in clean_aliases:
(namespace, nid) = alias
namespace = clean_id(namespace)
nid = clean_id(nid)
existing_tiid = get_tiid_by_alias(namespace, nid, mydao)
if existing_tiid:
tiids.append(existing_tiid)
Expand All @@ -411,8 +409,6 @@ def create_or_find_items_from_aliases(clean_aliases, myredis, mydao):
alias=alias
))
item = make()
namespace = clean_id(namespace)
nid = clean_id(nid)
item["aliases"][namespace] = [nid]
item["aliases"] = canonical_aliases(item["aliases"])

Expand Down
4 changes: 2 additions & 2 deletions totalimpact/providers/bibtex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from totalimpact.providers import provider
from totalimpact.providers.provider import Provider, ProviderContentMalformedError, ProviderTimeout, ProviderServerError
from totalimpact import utils
from totalimpact import unicode_helpers
from totalimpact.providers import bibtex_lookup

import logging
Expand Down Expand Up @@ -42,7 +42,7 @@ def __init__(self):
self.bibtex_to_unicode = build_bibtex_to_unicode(bibtex_lookup.unicode_to_latex)

def _to_unicode(self, text):
text = utils.to_unicode_or_bust(text)
text = unicode_helpers.to_unicode_or_bust(text)
if "{" in text:
text = text.replace("\\", "")
for i, j in self.bibtex_to_unicode.iteritems():
Expand Down
4 changes: 2 additions & 2 deletions totalimpact/providers/webpage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from totalimpact.providers import provider
from totalimpact.providers.provider import Provider, ProviderContentMalformedError
from totalimpact import utils
from totalimpact import unicode_helpers

import lxml.html
import re
Expand Down Expand Up @@ -89,7 +89,7 @@ def _extract_biblio(self, page, id=None):
if not page:
return biblio_dict

unicode_page = utils.to_unicode_or_bust(page)
unicode_page = unicode_helpers.to_unicode_or_bust(page)
try:
parsed_html = lxml.html.document_fromstring(unicode_page)

Expand Down
31 changes: 31 additions & 0 deletions totalimpact/unicode_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import unicodedata
import logging

logger = logging.getLogger('ti.unicode_helpers')

#from http://farmdev.com/talks/unicode/
#from http://farmdev.com/talks/unicode/
def to_unicode_or_bust(obj, encoding='utf-8'):
    """Return *obj* as unicode, decoding byte strings with *encoding*.

    Unicode strings and non-string objects are returned unchanged.
    """
    if not isinstance(obj, basestring):
        return obj
    if isinstance(obj, unicode):
        return obj
    return unicode(obj, encoding)


def remove_nonprinting_characters(input, encoding='utf-8'):
    """Strip nonprinting characters (e.g. U+200E, control codes) from a string.

    Accepts either a unicode string or a byte string decoded with *encoding*;
    the return value has the same type as the argument.  Non-string input
    raises TypeError (callers such as clean_id rely on that to skip
    non-string ids like biblio dicts).
    """
    input_was_unicode = True
    if isinstance(input, basestring):
        if not isinstance(input, unicode):
            input_was_unicode = False

    unicode_input = to_unicode_or_bust(input)

    # Unicode general categories to drop, matched on the first letter:
    # C (control/format), M (combining marks), Z (separators)
    # see http://www.fileformat.info/info/unicode/category/index.htm
    char_classes_to_remove = ["C", "M", "Z"]

    # BUG FIX: an ordinary space (U+0020) is category Zs but is a printing
    # character; dropping it mangled multi-word input such as the memberitems
    # query u"Otto, Sarah P.".  Keep it explicitly while still removing the
    # other separators (U+200B zero-width space is Zs, for example).
    response = u''.join(c for c in unicode_input
                        if c == u' '
                        or unicodedata.category(c)[0] not in char_classes_to_remove)

    if not input_was_unicode:
        response = response.encode(encoding)

    return response

8 changes: 0 additions & 8 deletions totalimpact/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,3 @@ def fn(*args, **kwargs):
return False # fail silently...
return fn


#from http://farmdev.com/talks/unicode/
def to_unicode_or_bust(obj, encoding='utf-8'):
if isinstance(obj, basestring):
if not isinstance(obj, unicode):
obj = unicode(obj, encoding)
return obj

35 changes: 19 additions & 16 deletions totalimpact/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from totalimpact import item as item_module
from totalimpact.models import MemberItems, UserFactory, NotAuthenticatedError
from totalimpact.providers.provider import ProviderFactory, ProviderItemNotFoundError, ProviderError, ProviderServerError, ProviderTimeout
from totalimpact import unicode_helpers
from totalimpact import default_settings
import logging

Expand Down Expand Up @@ -325,6 +326,7 @@ def provider_memberitems_get(provider_name, query):
"""
Gets aliases associated with a query from a given provider.
"""
query = unicode_helpers.remove_nonprinting_characters(query)

try:
provider = ProviderFactory.get_provider(provider_name)
Expand Down Expand Up @@ -488,6 +490,19 @@ def delete_items(cid=""):
return resp


def get_alias_strings(aliases):
    """Render (namespace, nid) alias pairs as "namespace:nid" strings.

    Both parts are cleaned first; nids that are not strings (biblio dicts)
    are serialized to json before concatenation.
    """
    alias_strings = []
    for (namespace, nid) in aliases:
        namespace = item_module.clean_id(namespace)
        nid = item_module.clean_id(nid)
        try:
            rendered = namespace + ":" + nid
        except TypeError:
            # jsonify the biblio dicts
            rendered = namespace + ":" + json.dumps(nid)
        alias_strings.append(rendered)
    return alias_strings


@app.route("/collection/<cid>/items", methods=["PUT"])
@app.route("/v1/collection/<cid>/items", methods=["PUT"])
def put_collection(cid=""):
Expand All @@ -499,12 +514,7 @@ def put_collection(cid=""):

try:
aliases = request.json["aliases"]
try:
alias_strings = [namespace+":"+nid for (namespace, nid) in aliases]
except TypeError:
# jsonify the biblio dicts
alias_strings = [namespace+":"+json.dumps(nid) for (namespace, nid) in aliases]

alias_strings = get_alias_strings(aliases)
(tiids, new_items) = item_module.create_or_update_items_from_aliases(
aliases, myredis, mydao)

Expand All @@ -531,6 +541,7 @@ def put_collection(cid=""):
return resp



""" Updates all the items in a given collection.
"""
@app.route("/collection/<cid>", methods=["POST"])
Expand All @@ -555,7 +566,6 @@ def collection_update(cid=""):
return resp



# creates a collection with aliases
@app.route('/collection', methods=['POST'])
@app.route('/v1/collection', methods=['POST'])
Expand All @@ -574,9 +584,6 @@ def collection_create():
coll["title"] = request.json["title"]
aliases = request.json["aliases"]
(tiids, new_items) = item_module.create_or_update_items_from_aliases(aliases, myredis, mydao)
for item in new_items:
namespaces = item["aliases"].keys()

if not tiids:
abort_custom(404, "POST /collection requires a list of [namespace, id] pairs.")
except (AttributeError, TypeError):
Expand All @@ -587,14 +594,10 @@ def collection_create():
json=str(request.json)))
abort_custom(404, "Missing arguments.")

try:
alias_strings = aliases_strings = [namespace+":"+nid for (namespace, nid) in aliases]
except TypeError:
# jsonify the biblio dicts
alias_strings = aliases_strings = [namespace+":"+json.dumps(nid) for (namespace, nid) in aliases]
alias_strings = get_alias_strings(aliases)

# save dict of alias:tiid
coll["alias_tiids"] = dict(zip(aliases_strings, tiids))
coll["alias_tiids"] = dict(zip(alias_strings, tiids))

logger.info(json.dumps(coll, sort_keys=True, indent=4))

Expand Down

0 comments on commit c10b6bb

Please sign in to comment.