From a25db31dcf2dfc3ed0064beba076b0b4f44473d4 Mon Sep 17 00:00:00 2001 From: Adam Kariv Date: Mon, 1 Apr 2024 21:57:49 +0300 Subject: [PATCH] v1.20.11 Performance improvements --- tableschema/VERSION | 2 +- tableschema/field.py | 3 ++- tableschema/types/integer.py | 4 +-- tableschema/types/number.py | 49 ++++++++++++++++++++++-------------- tableschema/types/string.py | 3 +++ 5 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tableschema/VERSION b/tableschema/VERSION index 62773a4..49ff017 100644 --- a/tableschema/VERSION +++ b/tableschema/VERSION @@ -1 +1 @@ -1.20.10 \ No newline at end of file +1.20.11 \ No newline at end of file diff --git a/tableschema/field.py b/tableschema/field.py index a38e554..cf01cb1 100644 --- a/tableschema/field.py +++ b/tableschema/field.py @@ -46,6 +46,7 @@ def __init__(self, descriptor, missing_values=config.DEFAULT_MISSING_VALUES, self.__schema = schema self.__cast_function = self.__get_cast_function() self.__check_functions = self.__get_check_functions() + self.__preserve_missing_values = os.environ.get('TABLESCHEMA_PRESERVE_MISSING_VALUES') @cached_property def schema(self): @@ -155,7 +156,7 @@ def cast_value(self, value, constraints=True): # Null value if value in self.__missing_values: # Whether missing_values should be preserved without being cast - if os.environ.get('TABLESCHEMA_PRESERVE_MISSING_VALUES'): + if self.__preserve_missing_values: return value value = None diff --git a/tableschema/types/integer.py b/tableschema/types/integer.py index 60c6ed5..1f4390f 100644 --- a/tableschema/types/integer.py +++ b/tableschema/types/integer.py @@ -21,7 +21,7 @@ def cast_integer(format, value, **options): elif isinstance(value, six.string_types): if not options.get('bareNumber', _DEFAULT_BARE_NUMBER): - value = re.sub(r'((^\D*)|(\D*$))', '', value) + value = _RE_BARE_NUMBER.sub('', value) try: value = int(value) @@ -41,5 +41,5 @@ def cast_integer(format, value, **options): # Internal - +_RE_BARE_NUMBER = re.compile(r'((^\D*)|(\D*$))') _DEFAULT_BARE_NUMBER = True diff --git a/tableschema/types/number.py b/tableschema/types/number.py index 24be3e8..de4d4e0 100644 --- a/tableschema/types/number.py +++ b/tableschema/types/number.py @@ -13,31 +13,42 @@ # Module API def cast_number(format, value, **options): - group_char = options.get('groupChar', _DEFAULT_GROUP_CHAR) - decimal_char = options.get('decimalChar', _DEFAULT_DECIMAL_CHAR) - if not isinstance(value, Decimal): - if isinstance(value, six.string_types): - value = re.sub(r'\s', '', value) - value = value.replace(decimal_char, '__decimal_char__') + if isinstance(value, six.string_types): + group_char = options.get('groupChar', _DEFAULT_GROUP_CHAR) + decimal_char = options.get('decimalChar', _DEFAULT_DECIMAL_CHAR) + value = _RE_WHITESPACE.sub('', value) + if decimal_char != '.': + if group_char: + value = value.replace(decimal_char, '__decimal_char__') + value = value.replace(group_char, '') + value = value.replace('__decimal_char__', '.') + else: + value = value.replace(decimal_char, '__decimal_char__') + value = value.replace('__decimal_char__', '.') + elif group_char: value = value.replace(group_char, '') - value = value.replace('__decimal_char__', '.') - if not options.get('bareNumber', _DEFAULT_BARE_NUMBER): - value = re.sub(r'((^\D*)|(\D*$))', '', value) - elif not isinstance(value, six.integer_types + (float,)): - return ERROR - elif value is True or value is False: - return ERROR - try: - if isinstance(value, float): - value = str(value) - value = Decimal(value) - except Exception: - return ERROR + + if not options.get('bareNumber', _DEFAULT_BARE_NUMBER): + value = _RE_BARE_NUMBER.sub('', value) + elif isinstance(value, Decimal): + return value + elif not isinstance(value, six.integer_types + (float,)): + return ERROR + elif value is True or value is False: + return ERROR + else: + value = str(value) + try: + value = Decimal(value) + except Exception: + return ERROR return value # Internal +_RE_WHITESPACE = re.compile(r'\s') +_RE_BARE_NUMBER = re.compile(r'((^\D*)|(\D*$))') _DEFAULT_GROUP_CHAR = '' _DEFAULT_DECIMAL_CHAR = '.' _DEFAULT_BARE_NUMBER = True diff --git a/tableschema/types/string.py b/tableschema/types/string.py index 928043d..75e43af 100644 --- a/tableschema/types/string.py +++ b/tableschema/types/string.py @@ -19,6 +19,8 @@ def cast_string(format, value, **options): if not isinstance(value, six.string_types): return ERROR + if format in _SIMPLE_FORMATS: + return value if format == 'uri': uri = _uri_from_string(value) try: @@ -43,6 +45,7 @@ def cast_string(format, value, **options): # Internal +_SIMPLE_FORMATS = {'default', None} _EMAIL_PATTERN = re.compile(r'[^@]+@[^@]+\.[^@]+') _uri_from_string = rfc3986.uri.URIReference.from_string _uri_validator = rfc3986.validators.Validator().require_presence_of('scheme')