-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Strip nonprinting characters, including Unicode ones, from susceptible input IDs.
Not stripping them from tiids and cids right now; they seem safe. For completeness, maybe do this in the future? Not sure.
- Loading branch information
Showing
8 changed files
with
95 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from nose.tools import raises, assert_equals, nottest | ||
|
||
from totalimpact import unicode_helpers | ||
|
||
|
||
|
||
class TestUnicodeHelpers():
    """Tests for unicode_helpers.remove_nonprinting_characters."""

    def setUp(self):
        pass

    def test_remove_nonprinting_characters(self):
        # Plain text with nothing to strip must come back unchanged.
        unicode_input = u"hi"
        response = unicode_helpers.remove_nonprinting_characters(unicode_input)
        expected = u"hi"
        assert_equals(response, expected)

    # Previously this method reused the name test_remove_nonprinting_characters,
    # which replaced the test above in the class namespace so it never ran.
    def test_remove_nonprinting_characters_bytestring_input(self):
        # a nonprinting character at the end (UTF-8 bytes of U+200E)
        unicode_input = '0000-0001-8907-4150\xe2\x80\x8e'
        response = unicode_helpers.remove_nonprinting_characters(unicode_input)
        expected = "0000-0001-8907-4150"
        assert_equals(response, expected)

    def test_remove_nonprinting_characters_unicode_input(self):
        # a nonprinting character at the end (U+200E)
        unicode_input = u'0000-0001-8907-4150\u200e'
        response = unicode_helpers.remove_nonprinting_characters(unicode_input)
        expected = u"0000-0001-8907-4150"
        assert_equals(response, expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import unicodedata | ||
import logging | ||
|
||
logger = logging.getLogger('ti.unicode_helpers') | ||
|
||
#from http://farmdev.com/talks/unicode/ | ||
def to_unicode_or_bust(obj, encoding='utf-8'):
    """Return *obj* as unicode text, decoding byte strings with *encoding*.

    Byte strings are decoded; text that is already unicode, and any
    non-string object (e.g. ``None`` or a number), is returned unchanged.

    Raises UnicodeDecodeError if a byte string is not valid in *encoding*.
    """
    # ``bytes`` is an alias of ``str`` on Python 2.6+ and the byte-string type
    # on Python 3, so this check works on both (unlike basestring/unicode,
    # which only exist on Python 2).
    if isinstance(obj, bytes):
        obj = obj.decode(encoding)
    return obj
|
||
|
||
def remove_nonprinting_characters(input, encoding='utf-8'):
    """Strip control, mark and separator characters from a string.

    Every character whose Unicode general category starts with "C"
    (control, format, unassigned, ...), "M" (combining marks) or "Z"
    (separators, which includes ordinary spaces) is dropped.

    *input* is expected to be a byte string or a unicode string. Byte strings
    are decoded with *encoding*, filtered, and re-encoded with the same
    *encoding*, so the return type matches the input type.

    Raises UnicodeDecodeError if a byte string is not valid in *encoding*.
    """
    # see http://www.fileformat.info/info/unicode/category/index.htm
    char_classes_to_remove = ("C", "M", "Z")

    # Remember the input type so the caller gets the same type back.
    input_was_bytes = isinstance(input, bytes)
    if input_was_bytes:
        # Decode with the caller's encoding (the same one used to re-encode
        # below) rather than always assuming UTF-8.
        unicode_input = input.decode(encoding)
    else:
        unicode_input = input

    response = u''.join(
        c for c in unicode_input
        if unicodedata.category(c)[0] not in char_classes_to_remove
    )

    if input_was_bytes:
        response = response.encode(encoding)

    return response
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters