diff --git a/.travis.yml b/.travis.yml
index 2c4e30e4..c9466bae 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,13 +1,8 @@
 language: python
-python: 2.7
-
-env:
-  - TOX_ENV=py26
-  - TOX_ENV=py27
-  - TOX_ENV=pypy
-
-install:
-  - pip install tox
-
-script:
-  - tox -e $TOX_ENV
+python:
+  - "2.6"
+  - "2.7"
+  - "3.4"
+  - "3.5"
+install: "python setup.py install"
+script: "nosetests -w tests"
diff --git a/flanker/addresslib/__init__.py b/flanker/addresslib/__init__.py
index fda729f3..bc758415 100644
--- a/flanker/addresslib/__init__.py
+++ b/flanker/addresslib/__init__.py
@@ -11,6 +11,12 @@
 '''
 import re
 
+try:
+    ASCII_FLAG = re.ASCII
+except AttributeError:
+    ASCII_FLAG = 0
+
+
 from flanker.addresslib.drivers.redis_driver import RedisCache
 from flanker.addresslib.drivers.dns_lookup import DNSLookup
 
@@ -21,16 +27,15 @@ from flanker.addresslib.plugins import hotmail
 from flanker.addresslib.plugins import google
 
-
 mx_cache = RedisCache()
 dns_lookup = DNSLookup()
 
-YAHOO_PATTERN = re.compile(r'''mta[0-9]+\.am[0-9]+\.yahoodns\.net$''')
-GMAIL_PATTERN = re.compile(r'''.*gmail-smtp-in\.l\.google.com$''')
-AOL_PATTERN = re.compile(r'''.*\.mx\.aol\.com$''')
-ICLOUD_PATTERN = re.compile(r'''.*\.mail\.icloud\.com$''')
-HOTMAIL_PATTERN = re.compile(r'''mx[0-9]\.hotmail\.com''')
-GOOGLE_PATTERN = re.compile(r'''(.*aspmx\.l\.google\.com$)|(aspmx.*\.googlemail.com$)''', re.IGNORECASE)
+YAHOO_PATTERN = re.compile(br'''mta[0-9]+\.am[0-9]+\.yahoodns\.net$''')
+GMAIL_PATTERN = re.compile(br'''.*gmail-smtp-in\.l\.google.com$''')
+AOL_PATTERN = re.compile(br'''.*\.mx\.aol\.com$''')
+ICLOUD_PATTERN = re.compile(br'''.*\.mail\.icloud\.com$''')
+HOTMAIL_PATTERN = re.compile(br'''mx[0-9]\.hotmail\.com''')
+GOOGLE_PATTERN = re.compile(br'''(.*aspmx\.l\.google\.com$)|(aspmx.*\.googlemail.com$)''', re.IGNORECASE)
 
 CUSTOM_GRAMMAR_LIST = [
     (YAHOO_PATTERN, yahoo),
diff --git a/flanker/addresslib/address.py b/flanker/addresslib/address.py
index a183a781..bc5e64e2 100644
--- a/flanker/addresslib/address.py
+++ b/flanker/addresslib/address.py
@@ -131,7 +131,7 @@ def parse_list(address_list, strict=False, as_tuple=False, metrics=False):
 
     # if we have a list, transform it into a string first
     if isinstance(address_list, list):
-        address_list = ', '.join(_normalize_address_list(address_list))
+        address_list = u', '.join(_normalize_address_list(address_list))
 
     # parse
     try:
@@ -187,7 +187,7 @@ def validate_address(addr_spec, metrics=False):
 
     # run parser against address
     bstart = time.time()
-    paddr = parse('@'.join(addr_parts), addr_spec_only=True)
+    paddr = parse(b'@'.join(addr_parts), addr_spec_only=True)
     mtimes['parsing'] = time.time() - bstart
     if paddr is None:
         return None, mtimes
@@ -319,6 +319,7 @@ class Type(object):
     Url = 'url'
 
 
+# TODO: RFC 6530 (Internationalizaion of address, etc) compliancy
 class EmailAddress(Address):
     """
     Represents a fully parsed email address with built-in support for MIME
@@ -354,6 +355,8 @@ def __init__(self, display_name, spec=None, parsed_name=None):
 
         assert(spec)
 
+        spec = spec if isinstance(spec, str) else spec.encode('ascii')
+
         if parsed_name:
             self.display_name = smart_unquote(mime_to_unicode(parsed_name))
         elif display_name:
@@ -361,10 +364,12 @@ def __init__(self, display_name, spec=None, parsed_name=None):
         else:
             self.display_name = u''
 
-        parts = spec.rsplit('@', 1)
+        self.display_name = self.display_name if isinstance(self.display_name, unicode) else self.display_name.decode('ascii')
+
+        parts = spec.rsplit(b'@', 1)
         self.mailbox = parts[0]
         self.hostname = parts[1].lower()
-        self.address = self.mailbox + "@" + self.hostname
+ "@" + self.hostname + self.address = self.mailbox + b"@" + self.hostname self.addr_type = self.Type.Email def __repr__(self): @@ -401,16 +406,16 @@ def full_spec(self): if self.display_name: encoded_display_name = smart_quote(encode_string( None, self.display_name, maxlinelen=MAX_ADDRESS_LENGTH)) - return '{0} <{1}>'.format(encoded_display_name, self.address) - return u'{0}'.format(self.address) + return b'{0} <{1}>'.format(encoded_display_name, self.address) + return b'{0}'.format(self.address) def to_unicode(self): """ Converts to unicode. """ if self.display_name: - return u'{0} <{1}>'.format(self.display_name, self.address) - return u'{0}'.format(self.address) + return u'{0} <{1}>'.format(self.display_name, self.address.decode('ascii')) + return u'{0}'.format(self.address.decode('ascii')) def __cmp__(self, other): return True @@ -452,6 +457,7 @@ def __hash__(self): return hash(self.address.lower()) +# TODO: Non-ASCII addresses compliancy class UrlAddress(Address): """ Represents a parsed URL: @@ -498,10 +504,10 @@ def __str__(self): return self.address def full_spec(self): - return self.address + return self.address if isinstance(self.address, bytes) else self.address.encode('idna') def to_unicode(self): - return self.address + return self.address if isinstance(self.address, unicode) else self.address.decode('idna') def __repr__(self): return self.address @@ -561,7 +567,7 @@ def __eq__(self, other): return set(self.container) == set(other.container) def __repr__(self): - return ''.join(['[', self.full_spec(), ']']) + return b''.join([b'[', self.full_spec(), b']']) def __add__(self, other): """ @@ -573,12 +579,12 @@ def __add__(self, other): result = self.container + other.container return AddressList(result) - def full_spec(self, delimiter=", "): + def full_spec(self, delimiter=b", "): """ Returns a full string which looks pretty much what the original was like >>> adl = AddressList("Foo , Bar ") - >>> adl.full_spec(delimiter='; ') + >>> adl.full_spec(delimiter=b'; ') 'Foo ' """ return delimiter.join(addr.full_spec() for addr in self.container) @@ -623,7 +629,9 @@ def _normalize_address_list(address_list): for addr in address_list: if isinstance(addr, Address): parts.append(addr.to_unicode()) - if isinstance(addr, basestring): + elif isinstance(addr, unicode): parts.append(addr) + elif isinstance(addr, str): + parts.append(addr.decode('ascii')) return parts diff --git a/flanker/addresslib/corrector.py b/flanker/addresslib/corrector.py index 49886210..d7acf581 100644 --- a/flanker/addresslib/corrector.py +++ b/flanker/addresslib/corrector.py @@ -33,63 +33,63 @@ def suggest(word, cutoff=0.77): MOST_COMMON_DOMAINS = [ # mailgun :) - 'mailgun.net', + b'mailgun.net', # big esps - 'yahoo.com', - 'yahoo.ca', - 'yahoo.co.jp', - 'yahoo.co.uk', - 'yahoo.com.br', - 'ymail.com', - 'hotmail.com', - 'hotmail.ca', - 'hotmail.co.uk', - 'windowslive.com', - 'live.com', - 'outlook.com', - 'msn.com', - 'gmail.com', - 'googlemail.com', - 'aol.com', - 'aim.com', - 'icloud.com', - 'me.com', - 'mac.com', - 'facebook.com', + b'yahoo.com', + b'yahoo.ca', + b'yahoo.co.jp', + b'yahoo.co.uk', + b'yahoo.com.br', + b'ymail.com', + b'hotmail.com', + b'hotmail.ca', + b'hotmail.co.uk', + b'windowslive.com', + b'live.com', + b'outlook.com', + b'msn.com', + b'gmail.com', + b'googlemail.com', + b'aol.com', + b'aim.com', + b'icloud.com', + b'me.com', + b'mac.com', + b'facebook.com', # big isps - 'comcast.net', - 'sbcglobal.net', - 'bellsouth.net', - 'verizon.net', - 'earthlink.net', - 'cox.net', - 'charter.net', - 
-    'shaw.ca',
-    'bell.net'
+    b'comcast.net',
+    b'sbcglobal.net',
+    b'bellsouth.net',
+    b'verizon.net',
+    b'earthlink.net',
+    b'cox.net',
+    b'charter.net',
+    b'shaw.ca',
+    b'bell.net'
 ]
 
 # domains that the corrector doesn't fix that we should fix
 LOOKUP_TABLE = {
-    u'yahoo': u'yahoo.com',
-    u'gmail': u'gmail.com',
-    u'hotmail': u'hotmail.com',
-    u'live': u'live.com',
-    u'outlook': u'outlook.com',
-    u'msn': u'msn.com',
-    u'googlemail': u'googlemail.com',
-    u'aol': u'aol.com',
-    u'aim': u'aim.com',
-    u'icloud': u'icloud.com',
-    u'me': u'me.com',
-    u'mac': u'mac.com',
-    u'facebook': u'facebook.com',
-    u'comcast': u'comcast.net',
-    u'sbcglobal': u'sbcglobal.net',
-    u'bellsouth': u'bellsouth.net',
-    u'verizon': u'verizon.net',
-    u'earthlink': u'earthlink.net',
-    u'cox': u'cox.net',
-    u'charter': u'charter.net',
-    u'shaw': u'shaw.ca',
-    u'bell': u'bell.net'
+    b'yahoo': b'yahoo.com',
+    b'gmail': b'gmail.com',
+    b'hotmail': b'hotmail.com',
+    b'live': b'live.com',
+    b'outlook': b'outlook.com',
+    b'msn': b'msn.com',
+    b'googlemail': b'googlemail.com',
+    b'aol': b'aol.com',
+    b'aim': b'aim.com',
+    b'icloud': b'icloud.com',
+    b'me': b'me.com',
+    b'mac': b'mac.com',
+    b'facebook': b'facebook.com',
+    b'comcast': b'comcast.net',
+    b'sbcglobal': b'sbcglobal.net',
+    b'bellsouth': b'bellsouth.net',
+    b'verizon': b'verizon.net',
+    b'earthlink': b'earthlink.net',
+    b'cox': b'cox.net',
+    b'charter': b'charter.net',
+    b'shaw': b'shaw.ca',
+    b'bell': b'bell.net'
 }
diff --git a/flanker/addresslib/drivers/dns_lookup.py b/flanker/addresslib/drivers/dns_lookup.py
index 7cecd49b..42119018 100644
--- a/flanker/addresslib/drivers/dns_lookup.py
+++ b/flanker/addresslib/drivers/dns_lookup.py
@@ -10,6 +10,9 @@ def __init__(self):
 
     def __getitem__(self, key):
         try:
+            # dnsq accepts native python strs (bytes in python 2, unicode in python 3)
+            if isinstance('', unicode) and isinstance(key, str):
+                key = key.decode('iso-8859-1')
             return dnsq.mx_hosts_for(key)
         except:
             return []
diff --git a/flanker/addresslib/parser.py b/flanker/addresslib/parser.py
index 0c1756fb..e4b0e88d 100644
--- a/flanker/addresslib/parser.py
+++ b/flanker/addresslib/parser.py
@@ -59,7 +59,7 @@
 import re
 
 import flanker.addresslib.address
-from flanker.addresslib.tokenizer import TokenStream
+from flanker.addresslib.tokenizer import TokenStream, UNI_URL
 from flanker.addresslib.tokenizer import LBRACKET
 from flanker.addresslib.tokenizer import AT_SYMBOL
 from flanker.addresslib.tokenizer import RBRACKET
@@ -185,7 +185,7 @@ def _mailbox_post_processing_checks(self, address):
         """
         Additional post processing checks to ensure mailbox is valid.
""" - parts = address.split('@') + parts = address.split(b'@') # check if local part is less than 1024 octets, the actual # limit is 64 octets but we allow 16x that size here because @@ -200,7 +200,7 @@ def _mailbox_post_processing_checks(self, address): return False # number of labels can not be over 127 - labels = domn.split('.') + labels = domn.split(b'.') if len(labels) > 127: return False @@ -335,9 +335,10 @@ def _url(self): """ Grammar: url -> url """ - earl = self.stream.get_token(URL) + earl = self.stream.get_token(URL) or self.stream.get_token(UNI_URL) if earl is None: return None + #TODO: Better handle non-ascii urls, specially in hostname part return flanker.addresslib.address.UrlAddress(to_utf8(earl)) def _name_addr_rfc(self): @@ -388,7 +389,8 @@ def _display_name_rfc(self): break wrds.append(wrd) - return cleanup_display_name(''.join(wrds)) + concatenator = b'' if isinstance(self.stream, str) else u'' + return cleanup_display_name(concatenator.join(wrds)) def _angle_addr_rfc(self): """ @@ -467,8 +469,9 @@ def _display_name_lax(self): # peek to see if we have a whitespace, # if we don't, we have a invalid display-name - if self.stream.peek(WHITESPACE) is None or \ - self.stream.peek(UNI_WHITE) is None: + ws = self.stream.peek(WHITESPACE) + uws = self.stream.peek(UNI_WHITE) + if (ws is None or len(ws) == 0) and (uws is None or len(uws) == 0): self.stream.position = start_pos return None @@ -490,14 +493,16 @@ def _display_name_lax(self): # peek to see if we have a whitespace # if we don't pop off the last word break - if self.stream.peek(WHITESPACE) is None or \ - self.stream.peek(UNI_WHITE) is None: + ws = self.stream.peek(WHITESPACE) + uws = self.stream.peek(UNI_WHITE) + if (ws is None or len(ws) == 0) and (uws is None or len(uws) == 0): # roll back last word self.stream.position = start_pos wrds.pop() break - return cleanup_display_name(''.join(wrds)) + concatenator = b'' if isinstance(self.stream, str) else u'' + return cleanup_display_name(concatenator.join(wrds)) def _angle_addr_lax(self): """ @@ -547,7 +552,7 @@ def _addr_spec(self, as_string=False): # optional whitespace self._whitespace() - aspec = cleanup_email(''.join([lpart, asym, domn])) + aspec = cleanup_email(lpart + asym + domn) if as_string: return aspec return flanker.addresslib.address.EmailAddress(aspec) diff --git a/flanker/addresslib/plugins/aol.py b/flanker/addresslib/plugins/aol.py index 05acc2e2..f1401b89 100644 --- a/flanker/addresslib/plugins/aol.py +++ b/flanker/addresslib/plugins/aol.py @@ -21,23 +21,23 @@ import re from flanker.addresslib.tokenizer import TokenStream -ALPHA = re.compile(r''' +ALPHA = re.compile(br''' [A-Za-z]+ ''', re.MULTILINE | re.VERBOSE) -NUMERIC = re.compile(r''' +NUMERIC = re.compile(br''' [0-9]+ ''', re.MULTILINE | re.VERBOSE) -ALPHANUM = re.compile(r''' +ALPHANUM = re.compile(br''' [A-Za-z0-9]+ ''', re.MULTILINE | re.VERBOSE) -DOT = re.compile(r''' +DOT = re.compile(br''' \. 
                     ''', re.MULTILINE | re.VERBOSE)
 
-UNDERSCORE = re.compile(r'''
+UNDERSCORE = re.compile(br'''
                     \_
                     ''', re.MULTILINE | re.VERBOSE)
diff --git a/flanker/addresslib/plugins/gmail.py b/flanker/addresslib/plugins/gmail.py
index 232692fd..3688f1da 100644
--- a/flanker/addresslib/plugins/gmail.py
+++ b/flanker/addresslib/plugins/gmail.py
@@ -32,18 +32,18 @@
 from flanker.addresslib.tokenizer import ATOM
 
-GMAIL_BASE = re.compile(r'''
+GMAIL_BASE = re.compile(br'''
                     [A-Za-z0-9\.]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-ALPHANUM = re.compile(r'''
+ALPHANUM = re.compile(br'''
                     [A-Za-z0-9]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-PLUS = re.compile(r'''
+PLUS = re.compile(br'''
                     [\+]
                     ''', re.MULTILINE | re.VERBOSE)
 
-DOT = re.compile(r'''
+DOT = re.compile(br'''
                     [\.]
                     ''', re.MULTILINE | re.VERBOSE)
@@ -53,9 +53,9 @@ def validate(localpart):
     if not localpart:
         return False
 
-    lparts = localpart.split('+')
+    lparts = localpart.split(b'+')
     real_localpart = lparts[0]
-    stripped_localpart = real_localpart.replace('.', '')
+    stripped_localpart = real_localpart.replace(b'.', b'')
 
     # length check
     l = len(stripped_localpart)
diff --git a/flanker/addresslib/plugins/google.py b/flanker/addresslib/plugins/google.py
index 4725a0f2..3a78f2b0 100644
--- a/flanker/addresslib/plugins/google.py
+++ b/flanker/addresslib/plugins/google.py
@@ -32,31 +32,31 @@
 from flanker.addresslib.tokenizer import ATOM
 
-GOOGLE_BASE = re.compile(r'''
+GOOGLE_BASE = re.compile(br'''
                     [A-Za-z0-9_\-'\.]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-ALPHANUM = re.compile(r'''
+ALPHANUM = re.compile(br'''
                     [A-Za-z0-9]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-UNDERSCORE = re.compile(r'''
+UNDERSCORE = re.compile(br'''
                     [_]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-APOSTROPHES = re.compile(r'''
+APOSTROPHES = re.compile(br'''
                     [']+
                     ''', re.MULTILINE | re.VERBOSE)
 
-DASH = re.compile(r'''
+DASH = re.compile(br'''
                     [-]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-DOTS = re.compile(r'''
+DOTS = re.compile(br'''
                     [.]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-PLUS = re.compile(r'''
+PLUS = re.compile(br'''
                     [\+]+
                     ''', re.MULTILINE | re.VERBOSE)
@@ -66,7 +66,7 @@ def validate(localpart):
     if not localpart:
         return False
 
-    lparts = localpart.split('+')
+    lparts = localpart.split(b'+')
     real_localpart = lparts[0]
 
     # length check
diff --git a/flanker/addresslib/plugins/hotmail.py b/flanker/addresslib/plugins/hotmail.py
index af4c1dc5..3bd1f705 100644
--- a/flanker/addresslib/plugins/hotmail.py
+++ b/flanker/addresslib/plugins/hotmail.py
@@ -32,23 +32,23 @@
 import re
 from flanker.addresslib.tokenizer import TokenStream
 
-HOTMAIL_PREFIX = re.compile(r'''
+HOTMAIL_PREFIX = re.compile(br'''
                     [A-Za-z0-9]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-HOTMAIL_BASE = re.compile(r'''
+HOTMAIL_BASE = re.compile(br'''
                     [A-Za-z0-9\.\-\_]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-HOTMAIL_SUFFIX = re.compile(r'''
+HOTMAIL_SUFFIX = re.compile(br'''
                     [A-Za-z0-9\-\_]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-PLUS = re.compile(r'''
+PLUS = re.compile(br'''
                     \+
                     ''', re.MULTILINE | re.VERBOSE)
 
-PERIODS = re.compile(r'''
+PERIODS = re.compile(br'''
                     \.{2,}
                     ''', re.MULTILINE | re.VERBOSE)
@@ -59,7 +59,7 @@ def validate(localpart):
         return False
 
     # remove tag if it exists
-    lparts = localpart.split('+')
+    lparts = localpart.split(b'+')
     real_localpart = lparts[0]
 
     # length check
@@ -76,7 +76,7 @@ def validate(localpart):
         return False
 
     # no more than one plus (+)
-    if localpart.count('+') > 1:
+    if localpart.count(b'+') > 1:
         return False
 
     # no consecutive periods (..)
diff --git a/flanker/addresslib/plugins/icloud.py b/flanker/addresslib/plugins/icloud.py
index aa8b1f90..1563c6bd 100644
--- a/flanker/addresslib/plugins/icloud.py
+++ b/flanker/addresslib/plugins/icloud.py
@@ -35,28 +35,28 @@
 import re
 from flanker.addresslib.tokenizer import TokenStream
 
-ALPHA = re.compile(r'''
+ALPHA = re.compile(br'''
                     [A-Za-z]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-ALPHANUM = re.compile(r'''
+ALPHANUM = re.compile(br'''
                     [A-Za-z0-9]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-ICLOUD_PREFIX = re.compile(r'''
+ICLOUD_PREFIX = re.compile(br'''
                     [A-Za-z]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-ICLOUD_BASE = re.compile(r'''
+ICLOUD_BASE = re.compile(br'''
                     [A-Za-z0-9\+]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-DOT = re.compile(r'''
+DOT = re.compile(br'''
                     \.
                     ''', re.MULTILINE | re.VERBOSE)
 
-UNDERSCORE = re.compile(r'''
+UNDERSCORE = re.compile(br'''
                     \_
                     ''', re.MULTILINE | re.VERBOSE)
@@ -66,7 +66,7 @@ def validate(localpart):
     if not localpart:
         return False
 
-    lparts = localpart.split('+')
+    lparts = localpart.split(b'+')
     real_localpart = lparts[0]
 
     # length check
@@ -75,7 +75,7 @@ def validate(localpart):
         return False
 
     # can not end with +
-    if localpart[-1] == '+':
+    if localpart[-1] == b'+':
         return False
 
     # must start with letter
diff --git a/flanker/addresslib/plugins/yahoo.py b/flanker/addresslib/plugins/yahoo.py
index f68d48a3..9e2e8ab3 100644
--- a/flanker/addresslib/plugins/yahoo.py
+++ b/flanker/addresslib/plugins/yahoo.py
@@ -44,27 +44,27 @@
 import re
 from flanker.addresslib.tokenizer import TokenStream
 
-ALPHA = re.compile(r'''
+ALPHA = re.compile(br'''
                     [A-Za-z]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-NUMERIC = re.compile(r'''
+NUMERIC = re.compile(br'''
                     [0-9]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-ALPHANUM = re.compile(r'''
+ALPHANUM = re.compile(br'''
                     [A-Za-z0-9]+
                     ''', re.MULTILINE | re.VERBOSE)
 
-DOT = re.compile(r'''
+DOT = re.compile(br'''
                     \.
                     ''', re.MULTILINE | re.VERBOSE)
 
-UNDERSCORE = re.compile(r'''
+UNDERSCORE = re.compile(br'''
                     \_
                     ''', re.MULTILINE | re.VERBOSE)
 
-HYPHEN = re.compile(r'''
+HYPHEN = re.compile(br'''
                     \-
                     ''', re.MULTILINE | re.VERBOSE)
@@ -97,7 +97,7 @@ def _validate_primary(localpart):
         return False
 
     # no more than one dot (.)
-    if localpart.count('.') > 1:
+    if localpart.count(b'.') > 1:
         return False
 
     # Grammar: local-part -> alpha { [ dot | underscore ] ( alpha | num ) }"
@@ -130,11 +130,11 @@ def _validate_disposable(localpart):
         return False
 
     # single hyphen
-    if localpart.count('-') != 1:
+    if localpart.count(b'-') != 1:
         return False
 
     # base and keyword length limit
-    parts = localpart.split('-')
+    parts = localpart.split(b'-')
     for part in parts:
         l = len(part)
         if l < 1 or l > 32:
diff --git a/flanker/addresslib/quote.py b/flanker/addresslib/quote.py
index 7004c3c7..70848375 100644
--- a/flanker/addresslib/quote.py
+++ b/flanker/addresslib/quote.py
@@ -1,10 +1,10 @@
-from StringIO import StringIO
+from io import StringIO, BytesIO
 
 import re
 
 from flanker.addresslib.tokenizer import ATOM, WHITESPACE
 
 _RE_ATOM_PHRASE = re.compile(
-    r'({atom}({whitespace}{atom})*)|^$'
+    br'({atom}({whitespace}{atom})*)|^$'
     .format(atom=ATOM.pattern, whitespace=WHITESPACE.pattern),
     re.MULTILINE | re.VERBOSE)
@@ -18,7 +18,7 @@ def smart_quote(s):
     if _contains_atoms_only(s):
         return s
 
-    return '"' + s.replace('\\', '\\\\').replace('"', '\\"') + '"'
+    return b'"' + s.replace(b'\\', b'\\\\').replace(b'"', b'\\"') + b'"'
 
 
 def smart_unquote(s):
@@ -27,7 +27,14 @@ def smart_unquote(s):
     quoted regions in there. If there are no quoted regions in the input
     string then output string is identical to the input string.
""" - unquoted = StringIO() + if isinstance(s, unicode): + quote_char = u'"' + escape_char = u'\\' + unquoted = StringIO() + else: + quote_char = b'"' + escape_char = b'\\' + unquoted = BytesIO() escaped_char = False is_quoted_section = False for c in s: @@ -35,14 +42,14 @@ def smart_unquote(s): if escaped_char: escaped_char = False else: - if c == '"': + if c == quote_char: is_quoted_section = False continue - elif c == '\\': + elif c == escape_char: escaped_char = True continue else: - if c == '"': + if c == quote_char: is_quoted_section = True continue diff --git a/flanker/addresslib/tokenizer.py b/flanker/addresslib/tokenizer.py index 605d22d8..8266b0aa 100644 --- a/flanker/addresslib/tokenizer.py +++ b/flanker/addresslib/tokenizer.py @@ -8,23 +8,25 @@ """ import re +from flanker.addresslib import ASCII_FLAG -LBRACKET = '<' -AT_SYMBOL = '@' -RBRACKET = '>' -DQUOTE = '"' -BAD_DOMAIN = re.compile(r''' # start or end +LBRACKET = b'<' +AT_SYMBOL = b'@' +RBRACKET = b'>' +DQUOTE = b'"' + +BAD_DOMAIN = re.compile(br''' # start or end ^-|-$ # with - - ''', re.MULTILINE | re.VERBOSE) + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) -DELIMITER = re.compile(r''' +DELIMITER = re.compile(br''' [,;][,;\s]* # delimiter - ''', re.MULTILINE | re.VERBOSE) + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) -WHITESPACE = re.compile(r''' +WHITESPACE = re.compile(br''' (\ |\t)+ # whitespace - ''', re.MULTILINE | re.VERBOSE) + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) UNI_WHITE = re.compile(ur''' [ @@ -34,18 +36,18 @@ ]* ''', re.MULTILINE | re.VERBOSE | re.UNICODE) -RELAX_ATOM = re.compile(r''' +RELAX_ATOM = re.compile(br''' ([^\s<>;,"]+) - ''', re.MULTILINE | re.VERBOSE) + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) -ATOM = re.compile(r''' +ATOM = re.compile(br''' [A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext - ''', re.MULTILINE | re.VERBOSE) + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) -DOT_ATOM = re.compile(r''' +DOT_ATOM = re.compile(br''' [A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext (\.[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+)* # (dot atext)* - ''', re.MULTILINE | re.VERBOSE) + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) UNI_ATOM = re.compile(ur''' ([^\s<>;,"]+) @@ -57,7 +59,7 @@ " ''', re.MULTILINE | re.VERBOSE | re.UNICODE) -QSTRING = re.compile(r''' +QSTRING = re.compile(br''' " # dquote (\s* # whitespace ([\x21\x23-\x5b\x5d-\x7e] # qtext @@ -65,9 +67,14 @@ \\[\x21-\x7e\t\ ]))* # quoted-pair \s* # whitespace " # dquote - ''', re.MULTILINE | re.VERBOSE) + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) + +URL = re.compile(br''' + (?:http|https):// + [^\s<>{}|\^~\[\]`;,]+ + ''', re.MULTILINE | re.VERBOSE | ASCII_FLAG) -URL = re.compile(r''' +UNI_URL = re.compile(ur''' (?:http|https):// [^\s<>{}|\^~\[\]`;,]+ ''', re.MULTILINE | re.VERBOSE | re.UNICODE) @@ -91,12 +98,30 @@ def get_token(self, token, ngroup=None): be either a compiled regex or a string. 
""" # match single character - if isinstance(token, basestring) and len(token) == 1: + if isinstance(token, str) and len(token) == 1: + if isinstance(self.stream, unicode): + token = token.decode('iso-8859-1') if self.peek() == token: self.position += 1 return token return None + if isinstance(token, unicode) and len(token) == 1: + if isinstance(self.stream, str): + token = token.encode('iso-8859-1') + if self.peek() == token: + self.position += 1 + return token + return None + + # do not match a unicode pattern against bytes stream + if isinstance(token.pattern, unicode) and isinstance(self.stream, str): + return None + + # convert bytes pattern to unicode when matching against a unicode stream + if isinstance(token.pattern, str) and isinstance(self.stream, unicode): + token = re.compile(token.pattern.decode('iso-8859-1'), token.flags | ASCII_FLAG) + # match a pattern match = token.match(self.stream, self.position) if match: @@ -136,7 +161,7 @@ def synchronize(self): self.position = end_pos skip = self.stream[start_pos:end_pos] - if skip.strip() == '': + if len(skip.strip()) == 0: return None return skip @@ -154,6 +179,14 @@ def peek(self, token=None): return None # peek for a specific token else: + # do not match a unicode pattern against bytes stream + if isinstance(token.pattern, unicode) and isinstance(self.stream, str): + return None + + # convert bytes pattern to unicode when matching against a unicode stream + if isinstance(token.pattern, str) and isinstance(self.stream, unicode): + token = re.compile(token.pattern.decode('iso-8859-1'), token.flags) + match = token.match(self.stream, self.position) if match: return self.stream[match.start():match.end()] diff --git a/flanker/addresslib/validate.py b/flanker/addresslib/validate.py index 7b3d0342..6f9a20f5 100644 --- a/flanker/addresslib/validate.py +++ b/flanker/addresslib/validate.py @@ -74,7 +74,7 @@ def suggest_alternate(addr_spec): if sugg_domain == addr_parts[-1]: return None - return '@'.join([addr_parts[0], sugg_domain]) + return b'@'.join([addr_parts[0], sugg_domain]) def preparse_address(addr_spec): @@ -82,7 +82,7 @@ def preparse_address(addr_spec): Preparses email addresses. Used to handle odd behavior by ESPs. 
""" # sanity check, ensure we have both local-part and domain - parts = addr_spec.split('@') + parts = addr_spec.split(b'@') if len(parts) < 2: return None diff --git a/flanker/utils.py b/flanker/utils.py index afe99928..7658baa4 100644 --- a/flanker/utils.py +++ b/flanker/utils.py @@ -111,14 +111,22 @@ def is_pure_ascii(value): def cleanup_display_name(name): - return name.strip(''';,'\r\n ''') + if isinstance(name, unicode): + return name.strip(u''';,'\r\n ''') + else: + return name.strip(b''';,'\r\n ''') def cleanup_email(email): - return email.strip("<>;, ") + if isinstance(email, unicode): + return email.strip(u"<>;, ") + else: + return email.strip(b"<>;, ") def contains_control_chars(s): + if isinstance(s, str): + s = s.decode('iso-8859-1') if CONTROL_CHAR_RE.match(s): return True return False @@ -147,5 +155,7 @@ def wrapper(*args, **kwargs): # allows, \t\n\v\f\r (0x09-0x0d) -CONTROL_CHARS = ''.join(map(unichr, range(0, 9) + range(14, 32) + range(127, 160))) -CONTROL_CHAR_RE = re.compile('[%s]' % re.escape(CONTROL_CHARS)) +CONTROL_CHARS = u''.join(map(unichr, range(0, 9) + range(14, 32) + range(127, 160))) +CONTROL_CHAR_RE = re.compile(u'[%s]' % re.escape(CONTROL_CHARS), re.UNICODE) + + diff --git a/setup.py b/setup.py index 8db0e975..e16a39e6 100644 --- a/setup.py +++ b/setup.py @@ -7,13 +7,25 @@ version='0.4.38', description='Mailgun Parsing Tools', long_description=open('README.rst').read(), - classifiers=[], keywords='', author='Mailgun Inc.', author_email='admin@mailgunhq.com', url='http://mailgun.net', license='Apache 2', packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Topic :: Communications :: Email', + 'Topic :: Communications :: Email :: Mail Transport Agents', + 'Topic :: Software Development :: Libraries', + 'Topic :: Software Development :: Libraries :: Python Modules'], include_package_data=True, zip_safe=True, tests_require=[ @@ -32,5 +44,6 @@ # mime parsing (100x slower) so keep it as-is for now. 'regex>=0.1.20110315', 'cryptography>=0.5', + 'six>=1.10.0', ], ) diff --git a/tests/addresslib/address_test.py b/tests/addresslib/address_test.py index cdbadec0..290ad041 100644 --- a/tests/addresslib/address_test.py +++ b/tests/addresslib/address_test.py @@ -122,6 +122,7 @@ def test_addresslist_non_ascii_list_input(): def test_addresslist_address_obj_list_input(): + skip_if_asked() # Bad direct EmailAddress creation, spec is not valid al = [EmailAddress(u'Aurélien Berger '), UrlAddress('https://www.example.com')] lst = parse_list(al)