From 79790e0ed3cded901a5e22ad7adeaac0ce9e3d67 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 31 Jul 2024 14:37:42 +0200
Subject: [PATCH 01/40] compute stats for datetimes

---
 .../worker/src/worker/statistics_utils.py | 109 +++++++++++++++++-
 1 file changed, 107 insertions(+), 2 deletions(-)

diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index f2651bb091..ccb28ace6b 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The HuggingFace Authors.
+import datetime
 import enum
 import io
 import logging
@@ -50,11 +51,12 @@ class ColumnType(str, enum.Enum):
     STRING_TEXT = "string_text"
     AUDIO = "audio"
     IMAGE = "image"
+    DATETIME = "datetime"
 
 
 class Histogram(TypedDict):
     hist: list[int]
-    bin_edges: list[Union[int, float]]
+    bin_edges: list[Union[int, float, str]]
 
 
 class NumericalStatisticsItem(TypedDict):
@@ -68,6 +70,17 @@ class NumericalStatisticsItem(TypedDict):
     histogram: Optional[Histogram]
 
 
+class DatetimeStatisticsItem(TypedDict):
+    nan_count: int
+    nan_proportion: float
+    min: Optional[str]  # might be None in very rare cases when the whole column is only None values
+    max: Optional[str]
+    mean: Optional[str]
+    median: Optional[str]
+    std: Optional[str]  # string representation of timedelta
+    histogram: Optional[Histogram]
+
+
 class CategoricalStatisticsItem(TypedDict):
     nan_count: int
     nan_proportion: float
@@ -83,7 +96,9 @@ class BoolStatisticsItem(TypedDict):
     frequencies: dict[str, int]
 
 
-SupportedStatistics = Union[NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem]
+SupportedStatistics = Union[
+    NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem, DatetimeStatisticsItem
+]
 
 
 class StatisticsPerColumnItem(TypedDict):
@@ -699,3 +714,93 @@ def get_shape(example: Optional[Union[bytes, dict[str, Any]]]) -> Union[tuple[No
 
     @classmethod
     def transform(cls, example: Optional[Union[bytes, dict[str, Any]]]) -> Optional[int]:
         return cls.get_width(example)
+
+
+class DatetimeColumn(Column):
+    transform_column = IntColumn
+
+    @classmethod
+    def compute_transformed_data(
+        cls,
+        data: pl.DataFrame,
+        column_name: str,
+        transformed_column_name: str,
+        min_date: datetime.datetime,
+    ) -> pl.DataFrame:
+        return data.select((pl.col(column_name) - min_date).dt.total_seconds().alias(transformed_column_name))
+
+    @staticmethod
+    def shift_and_convert_to_string(min_date, seconds) -> str:
+        return datetime_to_string(min_date + datetime.timedelta(seconds=seconds))
+
+    @classmethod
+    def _compute_statistics(
+        cls,
+        data: pl.DataFrame,
+        column_name: str,
+        n_samples: int,
+    ) -> DatetimeStatisticsItem:
+        nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
+        if nan_count == n_samples:  # all values are None
+            return DatetimeStatisticsItem(
+                nan_count=n_samples,
+                nan_proportion=1.0,
+                min=None,
+                max=None,
+                mean=None,
+                median=None,
+                std=None,
+                histogram=None,
+            )
+
+        min_date = data[column_name].min()
+        timedelta_column_name = f"{column_name}_timedelta"
+        # compute distribution of time passed from min date in **seconds**
+        timedelta_df = cls.compute_transformed_data(data, column_name, timedelta_column_name, min_date)
+        timedelta_stats: NumericalStatisticsItem = cls.transform_column.compute_statistics(
+            timedelta_df,
+            column_name=timedelta_column_name,
+            n_samples=n_samples,
+        )
+        for stat in ("max", "mean", "median"):
+            timedelta_stats[stat] = cls.shift_and_convert_to_string(min_date, timedelta_stats[stat])
+
+        bin_edges = [
+            cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
+        ]
+
+        return DatetimeStatisticsItem(
+            nan_count=nan_count,
+            nan_proportion=nan_proportion,
+            min=datetime_to_string(min_date),
+            max=timedelta_stats["max"],
+            mean=timedelta_stats["mean"],
+            median=timedelta_stats["median"],
+            std=str(timedelta_stats["std"]),
+            histogram=Histogram(
+                hist=timedelta_stats["histogram"]["hist"],
+                bin_edges=bin_edges,
+            ),
+        )
+
+    def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
+        stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
+        return StatisticsPerColumnItem(
+            column_name=self.name,
+            column_type=ColumnType.DATETIME,
+            column_statistics=stats,
+        )
+
+
+def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S") -> str:
+    """
+    Convert a datetime.datetime object to a string.
+
+    Args:
+        dt (datetime): The datetime object to convert.
+        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S".
+
+    Returns:
+        str: The datetime object as a string.
+    """
+    return dt.strftime(format)

From 851ec1b434a586e92e04de90e3ad4967ca674bc2 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 31 Jul 2024 16:42:59 +0200
Subject: [PATCH 02/40] fix typing

---
 .../worker/src/worker/statistics_utils.py | 43 +++++++++++--------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index ccb28ace6b..dfd2599164 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -56,14 +56,19 @@ class ColumnType(str, enum.Enum):
 
 class Histogram(TypedDict):
     hist: list[int]
-    bin_edges: list[Union[int, float, str]]
+    bin_edges: list[Union[int, float]]
+
+
+class DatetimeHistogram(TypedDict):
+    hist: list[int]
+    bin_edges: list[str]  # edges are string representations of dates
 
 
 class NumericalStatisticsItem(TypedDict):
     nan_count: int
     nan_proportion: float
-    min: Optional[float]  # might be None in very rare cases when the whole column is only None values
-    max: Optional[float]
+    min: Optional[Union[int, float]]  # might be None in very rare cases when the whole column is only None values
+    max: Optional[Union[int, float]]
     mean: Optional[float]
     median: Optional[float]
     std: Optional[float]
@@ -78,7 +83,7 @@ class DatetimeStatisticsItem(TypedDict):
     mean: Optional[str]
     median: Optional[str]
     std: Optional[str]  # string representation of timedelta
-    histogram: Optional[Histogram]
+    histogram: Optional[DatetimeHistogram]
 
 
 class CategoricalStatisticsItem(TypedDict):
@@ -730,8 +735,8 @@ def compute_transformed_data(
         return data.select((pl.col(column_name) - min_date).dt.total_seconds().alias(transformed_column_name))
 
     @staticmethod
-    def shift_and_convert_to_string(min_date, seconds) -> str:
-        return datetime_to_string(min_date + datetime.timedelta(seconds=seconds))
+    def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str:
+        return datetime_to_string(base_date + datetime.timedelta(seconds=seconds))
 
     @classmethod
     def _compute_statistics(
@@ -753,7 +758,7 @@ def _compute_statistics(
                 histogram=None,
             )
 
-        min_date = data[column_name].min()
+        min_date: datetime.datetime = data[column_name].min()  # type: ignore  # mypy infers type of datetime column .min() incorrectly
         timedelta_column_name = f"{column_name}_timedelta"
         # compute distribution of time passed from min date in **seconds**
         timedelta_df = cls.compute_transformed_data(data, column_name, timedelta_column_name, min_date)
@@ -762,10 +767,14 @@ def _compute_statistics(
             column_name=timedelta_column_name,
             n_samples=n_samples,
         )
-        for stat in ("max", "mean", "median"):
-            timedelta_stats[stat] = cls.shift_and_convert_to_string(min_date, timedelta_stats[stat])
-
-        bin_edges = [
+        # to assure mypy that there values are not None to pass to conversion functions:
+        assert timedelta_stats["histogram"] is not None
+        assert timedelta_stats["max"] is not None
+        assert timedelta_stats["mean"] is not None
+        assert timedelta_stats["median"] is not None
+        assert timedelta_stats["std"] is not None
+
+        datetime_bin_edges = [
             cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
         ]
 
@@ -773,13 +782,13 @@ def _compute_statistics(
             nan_count=nan_count,
             nan_proportion=nan_proportion,
             min=datetime_to_string(min_date),
-            max=timedelta_stats["max"],
-            mean=timedelta_stats["mean"],
-            median=timedelta_stats["median"],
-            std=str(timedelta_stats["std"]),
-            histogram=Histogram(
+            max=cls.shift_and_convert_to_string(min_date, timedelta_stats["max"]),
+            mean=cls.shift_and_convert_to_string(min_date, timedelta_stats["mean"]),
+            median=cls.shift_and_convert_to_string(min_date, timedelta_stats["median"]),
+            std=str(datetime.timedelta(seconds=timedelta_stats["std"])),
+            histogram=DatetimeHistogram(
                 hist=timedelta_stats["histogram"]["hist"],
-                bin_edges=bin_edges,
+                bin_edges=datetime_bin_edges,
             ),
         )

From 3347c134fa2d062a9d4e4844f14118758d428838 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 1 Aug 2024 17:11:12 +0200
Subject: [PATCH 03/40] add testcase

---
 services/worker/tests/fixtures/datasets.py    |  2 +
 .../tests/fixtures/statistics_dataset.py      | 25 ++++++++
 .../worker/tests/test_statistics_utils.py     | 57 ++++++++++++++++++-
 3 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py
index 77e41e2ae4..2b471a9861 100644
--- a/services/worker/tests/fixtures/datasets.py
+++ b/services/worker/tests/fixtures/datasets.py
@@ -28,6 +28,7 @@
 from .statistics_dataset import (
     audio_dataset,
+    datetime_dataset,
     image_dataset,
     null_column,
     statistics_dataset,
@@ -238,4 +239,5 @@ def datasets() -> Mapping[str, Dataset]:
         "descriptive_statistics_not_supported": statistics_not_supported_dataset,
         "audio_statistics": audio_dataset,
         "image_statistics": image_dataset,
+        "datetime_statistics": datetime_dataset,
     }
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index f32e404131..7d60fd100c 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The HuggingFace Authors.
+from datetime import datetime
 from pathlib import Path
 from typing import Optional
 
@@ -1698,3 +1699,27 @@ def null_column(n_samples: int) -> list[None]:
         }
     ),
 )
+
+
+datetime_dataset = Dataset.from_dict(
+    {
+        "datetime": [
+            datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
+        ]
+    },
+    features=Features(
+        {
+            "datetime": Value("timestamp[s]"),
+        }
+    ),
+)
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 80f41f317f..29abdfb3eb 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The HuggingFace Authors.
+import datetime
 from collections.abc import Mapping
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -22,6 +23,7 @@
     BoolColumn,
     ClassLabelColumn,
     ColumnType,
+    DatetimeColumn,
     FloatColumn,
     ImageColumn,
     IntColumn,
@@ -470,3 +472,56 @@ def test_image_statistics(
         n_samples=4,
     )
     assert computed == expected
+
+
+def count_expected_statistics_for_datetime() -> dict[str, Any]:
+    seconds_in_day = 24 * 60 * 60
+    timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+    std = timedeltas.std()
+    std_str = str(datetime.timedelta(seconds=std))
+    std_str = std_str.split(".")[0]  # check precision up to seconds
+    return {
+        "nan_count": 0,
+        "nan_proportion": 0.0,
+        "min": "2024-01-01 00:00:00",
+        "max": "2024-01-11 00:00:00",
+        "mean": "2024-01-06 00:00:00",
+        "median": "2024-01-06 00:00:00",
+        "std": std_str,
+        "histogram": {
+            "hist": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+            "bin_edges": [
+                "2024-01-01 00:00:00",
+                "2024-01-02 00:00:01",
+                "2024-01-03 00:00:02",
+                "2024-01-04 00:00:03",
+                "2024-01-05 00:00:04",
+                "2024-01-06 00:00:05",
+                "2024-01-07 00:00:06",
+                "2024-01-08 00:00:07",
+                "2024-01-09 00:00:08",
+                "2024-01-10 00:00:09",
+                "2024-01-11 00:00:00",
+            ],
+        },
+    }
+
+
+@pytest.mark.parametrize(
+    "column_name",
+    ["datetime_column"],
+)
+def test_datetime_statistics(
+    column_name: str,
+    datasets: Mapping[str, Dataset],
+) -> None:
+    column_name = "datetime"
+    expected = count_expected_statistics_for_datetime()
+    data = datasets["datetime_statistics"].to_pandas()
+    computed = DatetimeColumn.compute_statistics(
+        data=pl.from_pandas(data),
+        column_name=column_name,
+        n_samples=len(data[column_name]),
+    )
+    assert computed.pop("std").split(".")[0] == expected.pop("std")
+    assert computed == expected

From 0340b54c25bd47d2af51b4eaf553139c8023fe1b Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 5 Aug 2024 15:37:00 +0200
Subject: [PATCH 04/40] moar tests: column with nulls and all nulls column

---
 .../worker/src/worker/statistics_utils.py |  4 +-
 .../tests/fixtures/statistics_dataset.py   | 18 +++-
 .../worker/tests/test_statistics_utils.py  | 89 +++++++++++++------
 3 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index dfd2599164..9b4a2302b2 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -801,13 +801,13 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
     )
 
 
-def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S") -> str:
+def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
     """
     Convert a datetime.datetime object to a string.
 
     Args:
         dt (datetime): The datetime object to convert.
-        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S".
+        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S%z".
 
     Returns:
         str: The datetime object as a string.
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index 7d60fd100c..c00c63afc5 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1715,11 +1715,27 @@ def null_column(n_samples: int) -> list[None]:
             datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
             datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
             datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
-        ]
+        ],
+        "datetime_null": [
+            datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
+        ],
+        "datetime_all_null": [None] * 11,
     },
     features=Features(
         {
             "datetime": Value("timestamp[s]"),
+            "datetime_null": Value("timestamp[s]"),
+            "datetime_all_null": Value("timestamp[s]"),
         }
     ),
 )
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 29abdfb3eb..84eee81448 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -2,7 +2,7 @@
 # Copyright 2024 The HuggingFace Authors.
 import datetime
 from collections.abc import Mapping
-from typing import Any, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -474,54 +474,87 @@ def test_image_statistics(
     assert computed == expected
 
 
-def count_expected_statistics_for_datetime() -> dict[str, Any]:
+def count_expected_statistics_for_datetime(column: pd.Series, column_name: str) -> dict:  # type: ignore
+    n_samples = column.shape[0]
+    nan_count = column.isna().sum()
+    if nan_count == n_samples:
+        return {
+            "nan_count": n_samples,
+            "nan_proportion": 1.0,
+            "min": None,
+            "max": None,
+            "mean": None,
+            "median": None,
+            "std": None,
+            "histogram": None,
+        }
+
+    # hardcode expected values
+    minv = "2024-01-01 00:00:00"
+    maxv = "2024-01-11 00:00:00"
+    mean = "2024-01-06 00:00:00"
+    median = "2024-01-06 00:00:00"
+    bin_edges = [
+        "2024-01-01 00:00:00",
+        "2024-01-02 00:00:01",
+        "2024-01-03 00:00:02",
+        "2024-01-04 00:00:03",
+        "2024-01-05 00:00:04",
+        "2024-01-06 00:00:05",
+        "2024-01-07 00:00:06",
+        "2024-01-08 00:00:07",
+        "2024-01-09 00:00:08",
+        "2024-01-10 00:00:09",
+        "2024-01-11 00:00:00",
+    ]
+
+    # compute std
     seconds_in_day = 24 * 60 * 60
-    timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+    if column_name == "datetime":
+        timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+        hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    elif column_name == "datetime_null":
+        timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day))  # take every second day
+        hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+    else:
+        raise ValueError("Incorrect column")
+
     std = timedeltas.std()
     std_str = str(datetime.timedelta(seconds=std))
-    std_str = std_str.split(".")[0]  # check precision up to seconds
+
     return {
-        "nan_count": 0,
-        "nan_proportion": 0.0,
-        "min": "2024-01-01 00:00:00",
-        "max": "2024-01-11 00:00:00",
-        "mean": "2024-01-06 00:00:00",
-        "median": "2024-01-06 00:00:00",
+        "nan_count": nan_count,
+        "nan_proportion": np.round(nan_count / n_samples, DECIMALS).item() if nan_count else 0.0,
+        "min": minv,
+        "max": maxv,
+        "mean": mean,
+        "median": median,
         "std": std_str,
         "histogram": {
-            "hist": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-            "bin_edges": [
-                "2024-01-01 00:00:00",
-                "2024-01-02 00:00:01",
-                "2024-01-03 00:00:02",
-                "2024-01-04 00:00:03",
-                "2024-01-05 00:00:04",
-                "2024-01-06 00:00:05",
-                "2024-01-07 00:00:06",
-                "2024-01-08 00:00:07",
-                "2024-01-09 00:00:08",
-                "2024-01-10 00:00:09",
-                "2024-01-11 00:00:00",
-            ],
+            "hist": hist,
+            "bin_edges": bin_edges,
         },
     }
 
 
 @pytest.mark.parametrize(
     "column_name",
-    ["datetime_column"],
+    ["datetime", "datetime_null", "datetime_all_null"],
 )
 def test_datetime_statistics(
     column_name: str,
     datasets: Mapping[str, Dataset],
 ) -> None:
-    column_name = "datetime"
-    expected = count_expected_statistics_for_datetime()
     data = datasets["datetime_statistics"].to_pandas()
+    expected = count_expected_statistics_for_datetime(data[column_name], column_name)
     computed = DatetimeColumn.compute_statistics(
         data=pl.from_pandas(data),
         column_name=column_name,
         n_samples=len(data[column_name]),
     )
-    assert computed.pop("std").split(".")[0] == expected.pop("std")
+    computed_std, expected_std = computed.pop("std"), expected.pop("std")
+    if computed_std:
+        assert computed_std.split(".")[0] == expected_std.split(".")[0]  # check with precision up to seconds
+    else:
+        assert computed_std == expected_std
     assert computed == expected

From 434b2d8a0d487425d0e8078f9ff3c9392de69a3c Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 8 Aug 2024 14:07:01 +0200
Subject: [PATCH 05/40] add datetime to worker

---
 .../job_runners/split/descriptive_statistics.py | 15 ++++++++++++++-
 services/worker/src/worker/statistics_utils.py  | 11 ++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
index ed8bb6aa17..06d485d5bb 100644
--- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py
+++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
@@ -32,6 +32,7 @@
 from worker.dtos import CompleteJobResult
 from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache
 from worker.statistics_utils import (
+    DATETIME_DTYPES,
     FLOAT_DTYPES,
    INTEGER_DTYPES,
     NUMERICAL_DTYPES,
@@ -39,6 +40,7 @@
     AudioColumn,
     BoolColumn,
     ClassLabelColumn,
+    DatetimeColumn,
     FloatColumn,
     ImageColumn,
     IntColumn,
@@ -57,7 +59,15 @@ class SplitDescriptiveStatisticsResponse(TypedDict):
 
 
 SupportedColumns = Union[
-    ClassLabelColumn, IntColumn, FloatColumn, StringColumn, BoolColumn, ListColumn, AudioColumn, ImageColumn
+    ClassLabelColumn,
+    IntColumn,
+    FloatColumn,
+    StringColumn,
+    BoolColumn,
+    ListColumn,
+    AudioColumn,
+    ImageColumn,
+    DatetimeColumn,
 ]
 
 
@@ -238,6 +248,9 @@ def _column_from_feature(
 
             if dataset_feature.get("dtype") == "bool":
                 return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)
+
+            if dataset_feature.get("dtype") in DATETIME_DTYPES:
+                return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
         return None
 
     columns: list[SupportedColumns] = []
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 9b4a2302b2..23e80ab775 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -39,6 +39,7 @@
 FLOAT_DTYPES = ["float16", "float32", "float64"]
 NUMERICAL_DTYPES = INTEGER_DTYPES + FLOAT_DTYPES
 STRING_DTYPES = ["string", "large_string"]
+DATETIME_DTYPES = ["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"]
 
 
 class ColumnType(str, enum.Enum):
@@ -768,11 +769,11 @@ def _compute_statistics(
             n_samples=n_samples,
         )
         # to assure mypy that there values are not None to pass to conversion functions:
-        assert timedelta_stats["histogram"] is not None
-        assert timedelta_stats["max"] is not None
-        assert timedelta_stats["mean"] is not None
-        assert timedelta_stats["median"] is not None
-        assert timedelta_stats["std"] is not None
+        assert timedelta_stats["histogram"] is not None  # nosec
+        assert timedelta_stats["max"] is not None  # nosec
+        assert timedelta_stats["mean"] is not None  # nosec
+        assert timedelta_stats["median"] is not None  # nosec
+        assert timedelta_stats["std"] is not None  # nosec
 
         datetime_bin_edges = [

From 260458758c3f64c7af1bd17eef7475444a3414ce Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 8 Aug 2024 14:07:12 +0200
Subject: [PATCH 06/40] add test

---
 services/worker/tests/fixtures/hub.py           | 20 ++++++
 .../split/test_descriptive_statistics.py        | 67 ++++++++++++++-----
 .../worker/tests/test_statistics_utils.py       |  4 +-
 3 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py
index 62046b66c7..d5b890dbcc 100644
--- a/services/worker/tests/fixtures/hub.py
+++ b/services/worker/tests/fixtures/hub.py
@@ -354,6 +354,13 @@ def hub_public_image_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str
     delete_hub_dataset_repo(repo_id=repo_id)
 
 
+@pytest.fixture(scope="session")
+def hub_public_datetime_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str]:
+    repo_id = create_hub_dataset_repo(prefix="datetime_statistics", dataset=datasets["datetime_statistics"])
+    yield repo_id
+    delete_hub_dataset_repo(repo_id=repo_id)
+
+
 @pytest.fixture(scope="session")
 def hub_public_n_configs_with_default(datasets: Mapping[str, Dataset]) -> Iterator[str]:
     default_config_name, _ = get_default_config_split()
@@ -1207,6 +1214,19 @@ def hub_responses_image_statistics(
     }
 
 
+@pytest.fixture
+def hub_responses_datetime_statistics(
+    hub_public_datetime_statistics: str,
+) -> HubDatasetTest:
+    return {
+        "name": hub_public_datetime_statistics,
+        "config_names_response": create_config_names_response(hub_public_datetime_statistics),
+        "splits_response": create_splits_response(hub_public_datetime_statistics),
+        "first_rows_response": None,
+        "parquet_and_info_response": None,
+    }
+
+
 @pytest.fixture
 def hub_responses_descriptive_statistics_parquet_builder(
     hub_public_descriptive_statistics_parquet_builder: str,
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 54f1f53954..a95932d67a 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -3,7 +3,7 @@
 from collections.abc import Callable, Mapping
 from dataclasses import replace
 from http import HTTPStatus
-from typing import Optional
+from typing import Any, Optional
 
 import pandas as pd
 import polars as pl
@@ -30,6 +30,7 @@
 from ...test_statistics_utils import (
     count_expected_statistics_for_bool_column,
     count_expected_statistics_for_categorical_column,
+    count_expected_statistics_for_datetime_column,
     count_expected_statistics_for_list_column,
     count_expected_statistics_for_numerical_column,
     count_expected_statistics_for_string_column,
@@ -215,7 +216,7 @@ def _get_job_runner(
 
 
 @pytest.fixture
-def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict:  # type: ignore
+def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
     ds = datasets["descriptive_statistics"]
     df = ds.to_pandas()
     expected_statistics = {}
@@ -253,7 +254,7 @@
 
 
 @pytest.fixture
-def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset]) -> dict:  # type: ignore
+def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
     ds = datasets["descriptive_statistics_string_text"]
     df = ds.to_pandas()
     expected_statistics = {}
@@ -270,7 +271,7 @@
 
 
 @pytest.fixture
-def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, Dataset]) -> dict:  # type: ignore
+def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
     ds = datasets["descriptive_statistics_string_text"]
     df = ds.to_pandas()[:50]  # see `fixtures.hub.hub_public_descriptive_statistics_parquet_builder`
     expected_statistics = {}
@@ -287,7 +288,7 @@
 
 
 @pytest.fixture
-def audio_statistics_expected() -> dict:  # type: ignore
+def audio_statistics_expected() -> dict[str, Any]:
     column_names_to_durations = [
         ("audio", [1.0, 2.0, 3.0, 4.0]),  # datasets consists of 4 audio files of 1, 2, 3, 4 seconds lengths
         ("audio_null", [1.0, None, 3.0, None]),  # take first and third audio file for this testcase
@@ -312,7 +313,7 @@
 
 
 @pytest.fixture
-def image_statistics_expected() -> dict:  # type: ignore
+def image_statistics_expected() -> dict[str, Any]:
     column_names_to_widths = [
         ("image", [640, 1440, 520, 1240]),  # datasets consists of 4 image files
         ("image_null", [640, None, 520, None]),  # take first and third image file for this testcase
@@ -334,6 +335,21 @@
     }
 
 
+@pytest.fixture
+def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
+    ds = datasets["datetime_statistics"]
+    df = ds.to_pandas()
+    expected_statistics = {}
+    for column_name in df.columns:
+        statistics = count_expected_statistics_for_datetime_column(column=df[column_name], column_name=column_name)
+        expected_statistics[column_name] = {
+            "column_name": column_name,
+            "column_type": ColumnType.DATETIME,
+            "column_statistics": statistics,
+        }
+    return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False}
+
+
 @pytest.fixture
 def struct_thread_panic_error_parquet_file(tmp_path_factory: pytest.TempPathFactory) -> str:
     repo_id = "__DUMMY_TRANSFORMERS_USER__/test_polars_panic_error"
@@ -369,13 +385,14 @@ def test_polars_struct_thread_panic_error(struct_thread_panic_error_parquet_file
 @pytest.mark.parametrize(
     "hub_dataset_name,expected_error_code",
     [
-        ("descriptive_statistics", None),
-        ("descriptive_statistics_string_text", None),
-        ("descriptive_statistics_string_text_partial", None),
-        ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
-        ("audio_statistics", None),
-        ("image_statistics", None),
-        ("gated", None),
+        # ("descriptive_statistics", None),
+        # ("descriptive_statistics_string_text", None),
+        # ("descriptive_statistics_string_text_partial", None),
+        # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+        # ("audio_statistics", None),
+        # ("image_statistics", None),
+        ("datetime_statistics", None),
+        # ("gated", None),
     ],
 )
 def test_compute(
@@ -391,13 +408,15 @@ def test_compute(
     hub_responses_descriptive_statistics_not_supported: HubDatasetTest,
     hub_responses_audio_statistics: HubDatasetTest,
     hub_responses_image_statistics: HubDatasetTest,
+    hub_responses_datetime_statistics: HubDatasetTest,
     hub_dataset_name: str,
     expected_error_code: Optional[str],
-    descriptive_statistics_expected: dict,  # type: ignore
-    descriptive_statistics_string_text_expected: dict,  # type: ignore
-    descriptive_statistics_string_text_partial_expected: dict,  # type: ignore
-    audio_statistics_expected: dict,  # type: ignore
-    image_statistics_expected: dict,  # type: ignore
+    descriptive_statistics_expected: dict[str, Any],
+    descriptive_statistics_string_text_expected: dict[str, Any],
+    descriptive_statistics_string_text_partial_expected: dict[str, Any],
+    audio_statistics_expected: dict[str, Any],
+    image_statistics_expected: dict[str, Any],
+    datetime_statistics_expected: dict[str, Any],
 ) -> None:
     hub_datasets = {
         "descriptive_statistics": hub_responses_descriptive_statistics,
@@ -407,6 +426,7 @@ def test_compute(
         "gated": hub_responses_gated_descriptive_statistics,
         "audio_statistics": hub_responses_audio_statistics,
"image_statistics": hub_responses_image_statistics, + "datetime_statistics": hub_responses_datetime_statistics, } expected = { "descriptive_statistics": descriptive_statistics_expected, @@ -416,6 +436,7 @@ def test_compute( "descriptive_statistics_string_text_partial": descriptive_statistics_string_text_partial_expected, "audio_statistics": audio_statistics_expected, "image_statistics": image_statistics_expected, + "datetime_statistics": datetime_statistics_expected, } dataset = hub_datasets[hub_dataset_name]["name"] splits_response = hub_datasets[hub_dataset_name]["splits_response"] @@ -534,5 +555,15 @@ def test_compute( column_response_stats.pop("nan_proportion") ) == expected_column_response_stats.pop("nan_proportion") assert column_response_stats == expected_column_response_stats + elif column_response["column_type"] is ColumnType.DATETIME: + std, expected_std = ( + column_response_stats.pop("std"), + expected_column_response_stats.pop("std"), + ) + if std: + assert std.split(".")[0] == expected_std.split(".")[0] + else: + assert std == expected_std + assert column_response_stats == expected_column_response_stats else: raise ValueError("Incorrect data type") diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index 84eee81448..377cb47c86 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -474,7 +474,7 @@ def test_image_statistics( assert computed == expected -def count_expected_statistics_for_datetime(column: pd.Series, column_name: str) -> dict: # type: ignore +def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict: # type: ignore n_samples = column.shape[0] nan_count = column.isna().sum() if nan_count == n_samples: @@ -546,7 +546,7 @@ def test_datetime_statistics( datasets: Mapping[str, Dataset], ) -> None: data = datasets["datetime_statistics"].to_pandas() - expected = count_expected_statistics_for_datetime(data[column_name], column_name) + expected = count_expected_statistics_for_datetime_column(data[column_name], column_name) computed = DatetimeColumn.compute_statistics( data=pl.from_pandas(data), column_name=column_name, From 913f812f472e30ca1eca102ac0eaa5eecb7814b3 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 9 Aug 2024 13:23:39 +0200 Subject: [PATCH 07/40] include timezone aware --- .../split/descriptive_statistics.py | 23 ++++++++++--------- .../worker/src/worker/statistics_utils.py | 1 - .../tests/fixtures/statistics_dataset.py | 14 +++++++++++ .../split/test_descriptive_statistics.py | 14 +++++------ .../worker/tests/test_statistics_utils.py | 7 ++++-- 5 files changed, 38 insertions(+), 21 deletions(-) diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py index 06d485d5bb..3c3886d703 100644 --- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py +++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py @@ -32,7 +32,6 @@ from worker.dtos import CompleteJobResult from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache from worker.statistics_utils import ( - DATETIME_DTYPES, FLOAT_DTYPES, INTEGER_DTYPES, NUMERICAL_DTYPES, @@ -225,31 +224,33 @@ def _column_from_feature( return ListColumn(feature_name=dataset_feature_name, n_samples=num_examples) if isinstance(dataset_feature, dict): - if dataset_feature.get("_type") == "ClassLabel": + _type = 
dataset_feature.get("_type") + if _type == "ClassLabel": return ClassLabelColumn( feature_name=dataset_feature_name, n_samples=num_examples, feature_dict=dataset_feature ) - if dataset_feature.get("_type") == "Audio": + if _type == "Audio": return AudioColumn(feature_name=dataset_feature_name, n_samples=num_examples) - if dataset_feature.get("_type") == "Image": + if _type == "Image": return ImageColumn(feature_name=dataset_feature_name, n_samples=num_examples) - if dataset_feature.get("_type") == "Value": - if dataset_feature.get("dtype") in INTEGER_DTYPES: + if _type == "Value": + dtype = dataset_feature.get("dtype", "") + if dtype in INTEGER_DTYPES: return IntColumn(feature_name=dataset_feature_name, n_samples=num_examples) - if dataset_feature.get("dtype") in FLOAT_DTYPES: + if dtype in FLOAT_DTYPES: return FloatColumn(feature_name=dataset_feature_name, n_samples=num_examples) - if dataset_feature.get("dtype") in STRING_DTYPES: + if dtype in STRING_DTYPES: return StringColumn(feature_name=dataset_feature_name, n_samples=num_examples) - if dataset_feature.get("dtype") == "bool": + if dtype == "bool": return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples) - if dataset_feature.get("dtype") in DATETIME_DTYPES: + if dtype.startswith("timestamp"): return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples) return None @@ -262,7 +263,7 @@ def _column_from_feature( if not columns: raise NoSupportedFeaturesError( "No columns for statistics computation found. Currently supported feature types are: " - f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, list/Sequence and bool. " + f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, Image, Audio, list/Sequence, datetime and bool. " ) column_names_str = ", ".join([column.name for column in columns]) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 23e80ab775..d514cec931 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -39,7 +39,6 @@ FLOAT_DTYPES = ["float16", "float32", "float64"] NUMERICAL_DTYPES = INTEGER_DTYPES + FLOAT_DTYPES STRING_DTYPES = ["string", "large_string"] -DATETIME_DTYPES = ["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"] class ColumnType(str, enum.Enum): diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py index c00c63afc5..c233e61639 100644 --- a/services/worker/tests/fixtures/statistics_dataset.py +++ b/services/worker/tests/fixtures/statistics_dataset.py @@ -1716,6 +1716,19 @@ def null_column(n_samples: int) -> list[None]: datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"), datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"), ], + "datetime_tz": [ + datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-03 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-04 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-05 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-06 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-07 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-08 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-09 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + datetime.strptime("2024-01-10 00:00:00+0200", "%Y-%m-%d 
%H:%M:%S%z"), + datetime.strptime("2024-01-11 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), + ], "datetime_null": [ datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), None, @@ -1734,6 +1747,7 @@ def null_column(n_samples: int) -> list[None]: features=Features( { "datetime": Value("timestamp[s]"), + "datetime_tz": Value("timestamp[s, tz=+02:00]"), "datetime_null": Value("timestamp[s]"), "datetime_all_null": Value("timestamp[s]"), } diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py index a95932d67a..7cdd785def 100644 --- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py +++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py @@ -385,14 +385,14 @@ def test_polars_struct_thread_panic_error(struct_thread_panic_error_parquet_file @pytest.mark.parametrize( "hub_dataset_name,expected_error_code", [ - # ("descriptive_statistics", None), - # ("descriptive_statistics_string_text", None), - # ("descriptive_statistics_string_text_partial", None), - # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"), - # ("audio_statistics", None), - # ("image_statistics", None), + ("descriptive_statistics", None), + ("descriptive_statistics_string_text", None), + ("descriptive_statistics_string_text_partial", None), + ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"), + ("audio_statistics", None), + ("image_statistics", None), ("datetime_statistics", None), - # ("gated", None), + ("gated", None), ], ) def test_compute( diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index 377cb47c86..1a34fadce1 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -507,10 +507,13 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name "2024-01-10 00:00:09", "2024-01-11 00:00:00", ] + if column_name == "datetime_tz": + bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges] + minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200" # compute std seconds_in_day = 24 * 60 * 60 - if column_name == "datetime": + if column_name in ["datetime", "datetime_tz"]: timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day)) hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1] elif column_name == "datetime_null": @@ -539,7 +542,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name @pytest.mark.parametrize( "column_name", - ["datetime", "datetime_null", "datetime_all_null"], + ["datetime", "datetime_tz", "datetime_null", "datetime_all_null"], ) def test_datetime_statistics( column_name: str, From d51739356a2834ef2df49fb8f8ae86dd1c9561e6 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 14 Oct 2024 15:10:19 +0200 Subject: [PATCH 08/40] refactor --- libs/libcommon/src/libcommon/utils.py | 14 +++++++ .../worker/src/worker/statistics_utils.py | 38 +++++++++++-------- .../split/test_descriptive_statistics.py | 1 - .../worker/tests/test_statistics_utils.py | 2 +- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index c85079b697..3a08ebf8d1 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -2,12 +2,14 @@ # Copyright 2022 The HuggingFace Authors. 
 
 import base64
+import datetime
 import functools
 import logging
 import mimetypes
 import time
 from collections.abc import Callable, Sequence
 from datetime import datetime, timedelta, timezone
+from dateutil import parser
 from fnmatch import fnmatch
 from pathlib import Path
 from typing import Any, Optional, TypeVar, Union, cast
@@ -93,6 +95,18 @@ def get_datetime(days: Optional[float] = None) -> datetime:
     return date
 
 
+def is_datetime(string: str):
+    try:
+        parser.parse(string)
+        return True
+    except ValueError:
+        return False
+
+
+def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
+    return dt.strftime(format)
+
+
 def get_duration(started_at: datetime) -> float:
     """
     Get time in seconds that has passed from `started_at` until now.
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index d514cec931..28d340faa5 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -15,6 +15,7 @@
 from libcommon.exceptions import (
     StatisticsComputationError,
 )
+from libcommon.utils import datetime_to_string, is_datetime
 from PIL import Image
 from tqdm.contrib.concurrent import thread_map
 
@@ -476,6 +477,13 @@ def is_class(n_unique: int, n_samples: int) -> bool:
             n_unique / n_samples <= MAX_PROPORTION_STRING_LABELS and n_unique <= MAX_NUM_STRING_LABELS
         ) or n_unique <= NUM_BINS
 
+    @staticmethod
+    def is_datetime(data: pl.DataFrame, column_name: str) -> bool:
+        """Check if first 1000 non-null samples in a column match datetime format."""
+
+        values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()
+        return all(is_datetime(value) for value in values)
+
     @classmethod
     def compute_transformed_data(
         cls,
@@ -493,7 +501,7 @@ def _compute_statistics(
         data: pl.DataFrame,
         column_name: str,
         n_samples: int,
-    ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem]:
+    ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
         nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
         n_unique = data[column_name].n_unique()
         if cls.is_class(n_unique, n_samples):
@@ -509,6 +517,13 @@ def _compute_statistics(
                 n_unique=len(labels2counts),
                 frequencies=labels2counts,
             )
+        if cls.is_datetime(data, column_name):
+            datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
+                data.select(pl.col(column_name).cast(pl.Datetime)),
+                column_name=column_name,
+                n_samples=n_samples,
+            )
+            return datetime_stats
 
         lengths_column_name = f"{column_name}_len"
         lengths_df = cls.compute_transformed_data(data, column_name, transformed_column_name=lengths_column_name)
@@ -519,7 +534,12 @@ def _compute_statistics(
 
     def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
         stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
-        string_type = ColumnType.STRING_LABEL if "frequencies" in stats else ColumnType.STRING_TEXT
+        if "frequencies" in stats:
+            string_type = ColumnType.STRING_LABEL
+        elif isinstance(stats["histogram"], DatetimeHistogram):  # type: ignore
+            string_type = ColumnType.DATETIME
+        else:
+            string_type = ColumnType.STRING_TEXT
         return StatisticsPerColumnItem(
             column_name=self.name,
             column_type=string_type,
@@ -799,17 +819,3 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
             column_type=ColumnType.DATETIME,
             column_statistics=stats,
         )
-
-
-def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
-    """
-    Convert a datetime.datetime object to a string.
-
-    Args:
-        dt (datetime): The datetime object to convert.
-        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S%z".
-
-    Returns:
-        str: The datetime object as a string.
-    """
-    return dt.strftime(format)
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 14fb9dbf3a..4aa1c68900 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -347,7 +347,6 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
     return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False}
 
 
-
 @pytest.mark.parametrize(
     "hub_dataset_name,expected_error_code",
     [
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 1a34fadce1..dc74d9a31c 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -517,7 +517,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
         timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
         hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     elif column_name == "datetime_null":
-        timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day))  # take every second day
+        timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day))  # take every other day
         hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
     else:
         raise ValueError("Incorrect column")

From 7046d8b7d67d1926d2e3e41b80420395b1f0f647 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 14 Oct 2024 15:21:10 +0200
Subject: [PATCH 09/40] fix

---
 libs/libcommon/src/libcommon/utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index 3a08ebf8d1..b81ff70fff 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -2,14 +2,12 @@
 # Copyright 2022 The HuggingFace Authors.
 
 import base64
-import datetime
 import functools
 import logging
 import mimetypes
 import time
 from collections.abc import Callable, Sequence
 from datetime import datetime, timedelta, timezone
-from dateutil import parser
 from fnmatch import fnmatch
 from pathlib import Path
 from typing import Any, Optional, TypeVar, Union, cast
@@ -17,6 +15,7 @@
 import orjson
 import pandas as pd
 import pytz
+from dateutil import parser
 from huggingface_hub import constants, hf_hub_download
 from requests.exceptions import ReadTimeout
 
@@ -95,7 +94,7 @@ def get_datetime(days: Optional[float] = None) -> datetime:
     return date
 
 
-def is_datetime(string: str):
+def is_datetime(string: str) -> bool:
     try:
         parser.parse(string)
         return True

From 945dff0378043a9ae4ab79ef56fa82f1b8abab44 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 14 Oct 2024 15:37:04 +0200
Subject: [PATCH 10/40] do not typecheck dateutil

---
 libs/libcommon/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libs/libcommon/pyproject.toml b/libs/libcommon/pyproject.toml
index 48cc7629bb..c6c0b9e679 100644
--- a/libs/libcommon/pyproject.toml
+++ b/libs/libcommon/pyproject.toml
@@ -76,6 +76,7 @@ module = [
     "moto.*",
     "aiobotocore.*",
     "requests.*",
+    "dateutil.*"
 ]
 # ^ huggingface_hub is not typed since version 0.13.0
 ignore_missing_imports = true

From bdec2e475b0bfc4c3a47ba3a6a1b2ca17cfa4d1d Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 23 Dec 2024 13:29:13 +0100
Subject: [PATCH 11/40] fix

---
 services/worker/src/worker/statistics_utils.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 28d340faa5..d53597fff8 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -504,6 +504,14 @@ def _compute_statistics(
     ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
         nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
         n_unique = data[column_name].n_unique()
+        if cls.is_datetime(data, column_name):
+            datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
+                data.select(pl.col(column_name).cast(pl.Datetime)),
+                column_name=column_name,
+                n_samples=n_samples,
+            )
+            return datetime_stats
+
         if cls.is_class(n_unique, n_samples):
             labels2counts: dict[str, int] = value_counts(data, column_name) if nan_count != n_samples else {}
             logging.debug(f"{n_unique=} {nan_count=} {nan_proportion=} {labels2counts=}")
@@ -517,13 +525,6 @@ def _compute_statistics(
                 n_unique=len(labels2counts),
                 frequencies=labels2counts,
             )
-        if cls.is_datetime(data, column_name):
-            datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
-                data.select(pl.col(column_name).cast(pl.Datetime)),
-                column_name=column_name,
-                n_samples=n_samples,
-            )
-            return datetime_stats
 
         lengths_column_name = f"{column_name}_len"
         lengths_df = cls.compute_transformed_data(data, column_name, transformed_column_name=lengths_column_name)
@@ -536,7 +537,7 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
         stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
         if "frequencies" in stats:
             string_type = ColumnType.STRING_LABEL
-        elif isinstance(stats["histogram"], DatetimeHistogram):  # type: ignore
+        elif isinstance(stats["histogram"]["bin_edges"][0], str):
             string_type = ColumnType.DATETIME
         else:
             string_type = ColumnType.STRING_TEXT

From f9ffe82d9a7f9ac018fa9aa436d783fe05115f1b Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 23 Dec 2024 13:29:22 +0100
Subject: [PATCH 12/40] more tests

---
 .../tests/fixtures/statistics_dataset.py   | 14 +++++++++++
 .../worker/tests/test_statistics_utils.py  | 21 ++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index c233e61639..acbec4858a 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1716,6 +1716,19 @@ def null_column(n_samples: int) -> list[None]:
             datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
             datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
         ],
+        "datetime_string": [
+            "2024-01-01 00:00:00",
+            "2024-01-02 00:00:00",
+            "2024-01-03 00:00:00",
+            "2024-01-04 00:00:00",
+            "2024-01-05 00:00:00",
+            "2024-01-06 00:00:00",
+            "2024-01-07 00:00:00",
+            "2024-01-08 00:00:00",
+            "2024-01-09 00:00:00",
+            "2024-01-10 00:00:00",
+            "2024-01-11 00:00:00",
+        ],
         "datetime_tz": [
             datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
             datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
@@ -1747,6 +1760,7 @@ def null_column(n_samples: int) -> list[None]:
     features=Features(
         {
             "datetime": Value("timestamp[s]"),
+            "datetime_string": Value("string"),
             "datetime_tz": Value("timestamp[s, tz=+02:00]"),
             "datetime_null": Value("timestamp[s]"),
             "datetime_all_null": Value("timestamp[s]"),
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index dc74d9a31c..e2ff01f120 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -513,7 +513,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
 
     # compute std
     seconds_in_day = 24 * 60 * 60
-    if column_name in ["datetime", "datetime_tz"]:
+    if column_name in ["datetime", "datetime_string", "datetime_tz"]:
         timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
         hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     elif column_name == "datetime_null":
@@ -542,7 +542,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
 
 @pytest.mark.parametrize(
     "column_name",
-    ["datetime", "datetime_tz", "datetime_null", "datetime_all_null"],
+    ["datetime", "datetime_string", "datetime_tz", "datetime_null", "datetime_all_null"],
 )
 def test_datetime_statistics(
     column_name: str,
@@ -550,11 +550,18 @@ def test_datetime_statistics(
 ) -> None:
     data = datasets["datetime_statistics"].to_pandas()
     expected = count_expected_statistics_for_datetime_column(data[column_name], column_name)
-    computed = DatetimeColumn.compute_statistics(
-        data=pl.from_pandas(data),
-        column_name=column_name,
-        n_samples=len(data[column_name]),
-    )
+    if column_name == "datetime_string":
+        computed = StringColumn.compute_statistics(
+            data=pl.from_pandas(data),
+            column_name=column_name,
+            n_samples=len(data[column_name]),
+        )
+    else:
+        computed = DatetimeColumn.compute_statistics(
+            data=pl.from_pandas(data),
+            column_name=column_name,
+            n_samples=len(data[column_name]),
+        )
     computed_std, expected_std = computed.pop("std"), expected.pop("std")
     if computed_std:
         assert computed_std.split(".")[0] == expected_std.split(".")[0]  # check with precision up to seconds
     else:
         assert computed_std == expected_std
     assert computed == expected

From d2c37c6b71f4b0f457353d1f0a87ad8a5033f35d Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 26 Dec 2024 14:02:56 +0100
Subject: [PATCH 13/40] fix string to datetime conversion: add format inferring

---
 libs/libcommon/src/libcommon/utils.py      | 44 +++++++++++++++++++
 .../worker/src/worker/statistics_utils.py  | 25 ++++++++---
 .../split/test_descriptive_statistics.py   | 10 ++---
 3 files changed, 68 insertions(+), 11 deletions(-)

diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index b81ff70fff..a89f49d980 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -106,6 +106,50 @@ def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str
     return dt.strftime(format)
 
 
+def identify_datetime_format(datetime_string: str) -> Optional[str]:
+    # Common datetime formats
+    common_formats = [
+        "%Y-%m-%dT%H:%M:%S%z",
+        "%Y-%m-%dT%H:%M:%S",
+        "%Y-%m-%dT%H:%M:%S.%f",
+        "%Y-%m-%d %H:%M:%S%z",
+        "%Y-%m-%d %H:%M:%S",
+        "%Y-%m-%d %H:%M",
+        "%Y-%m-%d",
+        "%d-%m-%Y %H:%M:%S%z",
+        "%d-%m-%Y %H:%M:%S",
+        "%d-%m-%Y %H:%M",
+        "%d-%m-%Y",
+        "%m-%d-%Y %H:%M:%S%z",
+        "%m-%d-%Y %H:%M:%S",
+        "%m-%d-%Y %H:%M",
+        "%m-%d-%Y",
+
+        "%Y/%m/%d %H:%M:%S%z",
+        "%Y/%m/%d %H:%M:%S",
+        "%Y/%m/%d %H:%M",
+        "%Y/%m/%d",
+        "%d/%m/%Y %H:%M:%S%z",
+        "%d/%m/%Y %H:%M:%S",
+        "%d/%m/%Y %H:%M",
+        "%d/%m/%Y",
+        "%m/%d/%Y %H:%M:%S%z",
+        "%m/%d/%Y %H:%M:%S",
+        "%m/%d/%Y %H:%M",
+        "%m/%d/%Y",
+
+        "%B %d, %Y",
+        "%d %B %Y",
+    ]
+
+    for fmt in common_formats:
+        try:
+            datetime.strptime(datetime_string, fmt)
+            return fmt
+        except ValueError:
+            continue
+
+
 def get_duration(started_at: datetime) -> float:
     """
     Get time in seconds that has passed from `started_at` until now.
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index d53597fff8..cb5e19f8ca 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -15,7 +15,7 @@
 from libcommon.exceptions import (
     StatisticsComputationError,
 )
-from libcommon.utils import datetime_to_string, is_datetime
+from libcommon.utils import datetime_to_string, identify_datetime_format, is_datetime
 from PIL import Image
 from tqdm.contrib.concurrent import thread_map
 
@@ -478,11 +478,19 @@ def is_class(n_unique: int, n_samples: int) -> bool:
         ) or n_unique <= NUM_BINS
 
     @staticmethod
-    def is_datetime(data: pl.DataFrame, column_name: str) -> bool:
-        """Check if first 1000 non-null samples in a column match datetime format."""
+    def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[str]]:
+        """Check if first 1000 non-null samples in a column match datetime format. If true, also return datetime format"""
 
         values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()
-        return all(is_datetime(value) for value in values)
+        _is_datetime = all(is_datetime(value) for value in values)
+
+        if _is_datetime:
+            formats = [identify_datetime_format(value) for value in values]
+            if len(set(formats)) == 1:
+                return True, formats[0]
+            raise StatisticsComputationError("Multiple datetime formats detected. ")
+
+        return False, None
 
     @classmethod
     def compute_transformed_data(
@@ -504,11 +512,13 @@ def _compute_statistics(
         nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
         n_unique = data[column_name].n_unique()
-        if cls.is_datetime(data, column_name):
+        _is_datetime, datetime_format = cls.is_datetime(data, column_name)
+        if _is_datetime:
             datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
-                data.select(pl.col(column_name).cast(pl.Datetime)),
+                data,
                 column_name=column_name,
                 n_samples=n_samples,
+                format=datetime_format,
             )
             return datetime_stats
 
@@ -765,6 +775,7 @@ def _compute_statistics(
         data: pl.DataFrame,
         column_name: str,
         n_samples: int,
+        format: Optional[str] = None,
     ) -> DatetimeStatisticsItem:
         nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
         if nan_count == n_samples:  # all values are None
@@ -778,6 +789,8 @@ def _compute_statistics(
                 histogram=None,
             )
+        if isinstance(data[column_name].dtype, pl.String):
+            data = data.with_columns(pl.col(column_name).str.to_datetime(format=format))
 
         min_date: datetime.datetime = data[column_name].min()  # type: ignore  # mypy infers type of datetime column .min() incorrectly
         timedelta_column_name = f"{column_name}_timedelta"
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 4aa1c68900..20ca8b369f 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -353,11 +353,11 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
         ("descriptive_statistics", None),
         ("descriptive_statistics_string_text", None),
         ("descriptive_statistics_string_text_partial", None),
-        ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
-        ("audio_statistics", None),
-        ("image_statistics", None),
-        ("datetime_statistics", None),
-        ("gated", None),
+        # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+        # ("audio_statistics", None),
+        # ("image_statistics", None),
+        # ("datetime_statistics", None),
+        # ("gated", None),
     ],
 )
 def test_compute(

From 658719e1fc585f55dbefd0d64e840144f5708fb7 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 26 Dec 2024 14:45:02 +0100
Subject: [PATCH 14/40] fix style

---
 libs/libcommon/src/libcommon/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index a89f49d980..90c181ae21 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -124,7 +124,6 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
         "%m-%d-%Y %H:%M:%S",
         "%m-%d-%Y %H:%M",
         "%m-%d-%Y",
-
         "%Y/%m/%d %H:%M:%S%z",
         "%Y/%m/%d %H:%M:%S",
         "%Y/%m/%d %H:%M",
         "%Y/%m/%d",
@@ -137,7 +136,6 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
         "%m/%d/%Y %H:%M:%S",
         "%m/%d/%Y %H:%M",
         "%m/%d/%Y",
-
         "%B %d, %Y",
         "%d %B %Y",
     ]
@@ -148,6 +146,7 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
             return fmt
         except ValueError:
             continue
+    return None
 
 
 def get_duration(started_at: datetime) -> float:

From 5c2d94a546d07355938ee4c45b53c3fca96fb899 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 27 Dec 2024 12:45:23 +0100
Subject: [PATCH 15/40] fix check for datetime
--- services/worker/src/worker/statistics_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index cb5e19f8ca..e35e04a266 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -482,7 +482,7 @@ def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[st """Check if first 1000 non-null samples in a column match datetime format. If true, also return datetime format""" values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list() - _is_datetime = all(is_datetime(value) for value in values) + _is_datetime = all(is_datetime(value) for value in values) if len(values) > 0 else False if _is_datetime: formats = [identify_datetime_format(value) for value in values] From 359a30bf1b625e93fb64f786e75844882d27d28e Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 27 Dec 2024 13:41:06 +0100 Subject: [PATCH 16/40] minor --- .../split/test_descriptive_statistics.py | 10 +++++----- services/worker/tests/test_statistics_utils.py | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py index 20ca8b369f..4aa1c68900 100644 --- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py +++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py @@ -353,11 +353,11 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A ("descriptive_statistics", None), ("descriptive_statistics_string_text", None), ("descriptive_statistics_string_text_partial", None), - # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"), - # ("audio_statistics", None), - # ("image_statistics", None), - # ("datetime_statistics", None), - # ("gated", None), + ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"), + ("audio_statistics", None), + ("image_statistics", None), + ("datetime_statistics", None), + ("gated", None), ], ) def test_compute( diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index e2ff01f120..d693acf501 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -2,7 +2,7 @@ # Copyright 2024 The HuggingFace Authors. 
import datetime from collections.abc import Mapping -from typing import Optional, Union +from typing import Any, Optional, Union import numpy as np import pandas as pd @@ -68,7 +68,7 @@ def test_generate_bins( def count_expected_statistics_for_numerical_column( column: pd.Series, # type: ignore dtype: ColumnType, -) -> dict: # type: ignore +) -> dict[str, Any]: minimum, maximum, mean, median, std = ( column.min(), column.max(), @@ -125,7 +125,7 @@ def count_expected_statistics_for_numerical_column( } -def count_expected_statistics_for_list_column(column: pd.Series) -> dict: # type: ignore +def count_expected_statistics_for_list_column(column: pd.Series) -> dict[str, Any]: if column.isnull().all(): lengths_column = pd.Series([None] * column.shape[0]) return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT) @@ -141,7 +141,7 @@ def count_expected_statistics_for_list_column(column: pd.Series) -> dict: # typ def count_expected_statistics_for_categorical_column( column: pd.Series, # type: ignore class_label_feature: ClassLabel, -) -> dict: # type: ignore +) -> dict[str, Any]: n_samples = column.shape[0] nan_count = column.isna().sum() value_counts = column.value_counts(dropna=True).to_dict() @@ -160,7 +160,7 @@ def count_expected_statistics_for_categorical_column( } -def count_expected_statistics_for_string_column(column: pd.Series) -> dict: # type: ignore +def count_expected_statistics_for_string_column(column: pd.Series) -> dict[str, Any]: n_samples = column.shape[0] nan_count = column.isna().sum() value_counts = column.value_counts(dropna=True).to_dict() @@ -183,7 +183,7 @@ def count_expected_statistics_for_string_column(column: pd.Series) -> dict: # t return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT) -def count_expected_statistics_for_bool_column(column: pd.Series) -> dict: # type: ignore +def count_expected_statistics_for_bool_column(column: pd.Series) -> dict[str, Any]: n_samples = column.shape[0] nan_count = column.isna().sum() value_counts = column.value_counts(dropna=True).to_dict() @@ -474,7 +474,7 @@ def test_image_statistics( assert computed == expected -def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict: # type: ignore +def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict[str, Any]: n_samples = column.shape[0] nan_count = column.isna().sum() if nan_count == n_samples: From 0744e075b4ec8d41a0d84004fd96b7cfb83f1e57 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 27 Dec 2024 14:01:50 +0100 Subject: [PATCH 17/40] mypy --- services/worker/tests/test_statistics_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index d693acf501..648cf6791b 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -125,7 +125,7 @@ def count_expected_statistics_for_numerical_column( } -def count_expected_statistics_for_list_column(column: pd.Series) -> dict[str, Any]: +def count_expected_statistics_for_list_column(column: pd.Series) -> dict[str, Any]: # type: ignore if column.isnull().all(): lengths_column = pd.Series([None] * column.shape[0]) return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT) @@ -160,7 +160,7 @@ def count_expected_statistics_for_categorical_column( } -def 
count_expected_statistics_for_string_column(column: pd.Series) -> dict[str, Any]: +def count_expected_statistics_for_string_column(column: pd.Series) -> dict[str, Any]: # type: ignore n_samples = column.shape[0] nan_count = column.isna().sum() value_counts = column.value_counts(dropna=True).to_dict() @@ -183,7 +183,7 @@ def count_expected_statistics_for_string_column(column: pd.Series) -> dict[str, return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT) -def count_expected_statistics_for_bool_column(column: pd.Series) -> dict[str, Any]: +def count_expected_statistics_for_bool_column(column: pd.Series) -> dict[str, Any]: # type: ignore n_samples = column.shape[0] nan_count = column.isna().sum() value_counts = column.value_counts(dropna=True).to_dict() @@ -474,7 +474,7 @@ def test_image_statistics( assert computed == expected -def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict[str, Any]: +def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict[str, Any]: # type: ignore n_samples = column.shape[0] nan_count = column.isna().sum() if nan_count == n_samples: From 53e210085722afbda4e50b762edbabb13f5cd941 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 6 Jan 2025 17:50:36 +0100 Subject: [PATCH 18/40] add testcase currently not passing, bug on polars side? --- services/worker/src/worker/statistics_utils.py | 6 ++++-- .../worker/tests/fixtures/statistics_dataset.py | 14 ++++++++++++++ services/worker/tests/test_statistics_utils.py | 6 +++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index e35e04a266..b1322ce66a 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -826,8 +826,10 @@ def _compute_statistics( ), ) - def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem: - stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples) + def compute_and_prepare_response( + self, data: pl.DataFrame, format: Optional[str] = None + ) -> StatisticsPerColumnItem: + stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples, format=format) return StatisticsPerColumnItem( column_name=self.name, column_type=ColumnType.DATETIME, diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py index acbec4858a..8768f854f0 100644 --- a/services/worker/tests/fixtures/statistics_dataset.py +++ b/services/worker/tests/fixtures/statistics_dataset.py @@ -1729,6 +1729,19 @@ def null_column(n_samples: int) -> list[None]: "2024-01-10 00:00:00", "2024-01-11 00:00:00", ], + "datetime_string_z": [ + "2024-01-01 00:00:00Z", + "2024-01-02 00:00:00Z", + "2024-01-03 00:00:00Z", + "2024-01-04 00:00:00Z", + "2024-01-05 00:00:00Z", + "2024-01-06 00:00:00Z", + "2024-01-07 00:00:00Z", + "2024-01-08 00:00:00Z", + "2024-01-09 00:00:00Z", + "2024-01-10 00:00:00Z", + "2024-01-11 00:00:00Z", + ], "datetime_tz": [ datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), @@ -1761,6 +1774,7 @@ def null_column(n_samples: int) -> list[None]: { "datetime": Value("timestamp[s]"), "datetime_string": Value("string"), + "datetime_string_z": Value("string"), "datetime_tz": Value("timestamp[s, tz=+02:00]"), "datetime_null": 
Value("timestamp[s]"), "datetime_all_null": Value("timestamp[s]"), diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index 648cf6791b..5b85ddc340 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -513,7 +513,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name # compute std seconds_in_day = 24 * 60 * 60 - if column_name in ["datetime", "datetime_string", "datetime_tz"]: + if column_name in ["datetime", "datetime_string", "datetime_string_z", "datetime_tz"]: timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day)) hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1] elif column_name == "datetime_null": @@ -542,7 +542,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name @pytest.mark.parametrize( "column_name", - ["datetime", "datetime_string", "datetime_tz", "datetime_null", "datetime_all_null"], + ["datetime", "datetime_string", "datetime_string_z", "datetime_tz", "datetime_null", "datetime_all_null"], ) def test_datetime_statistics( column_name: str, @@ -550,7 +550,7 @@ def test_datetime_statistics( ) -> None: data = datasets["datetime_statistics"].to_pandas() expected = count_expected_statistics_for_datetime_column(data[column_name], column_name) - if column_name == "datetime_string": + if "_string" in column_name: computed = StringColumn.compute_statistics( data=pl.from_pandas(data), column_name=column_name, From 3df62647a417addfd79856473836d8b08b97a2e7 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 8 Jan 2025 16:22:35 +0100 Subject: [PATCH 19/40] fix? --- libs/libcommon/src/libcommon/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index 90c181ae21..837d595d7c 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -142,7 +142,9 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]: for fmt in common_formats: try: - datetime.strptime(datetime_string, fmt) + _ = datetime.strptime(datetime_string, fmt) + if fmt.endswith("%z") and any(datetime_string.endswith(timezone) for timezone in ["Z", "UTC", "ACST"]): + fmt = f"{fmt.rstrip('%z')}%Z" return fmt except ValueError: continue From 812bf36d5011fc1eb632ab1ea00b47eac6e6ec77 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 8 Jan 2025 17:53:41 +0100 Subject: [PATCH 20/40] add example to docs --- docs/source/statistics.md | 55 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/docs/source/statistics.md b/docs/source/statistics.md index 3b061c010c..a849a11638 100644 --- a/docs/source/statistics.md +++ b/docs/source/statistics.md @@ -178,6 +178,7 @@ Currently, statistics are supported for strings, float and integer numbers, list * `list` - for lists of any other data types (including lists) * `audio` - for audio data * `image` - for image data +* `datetime` - for datetime data ### `class_label` @@ -591,3 +592,57 @@ For image data, the distribution of images widths is computed. The following mea

+ +### datetime + +The distribution of datetime is computed. + +
<details><summary>Example</summary> +<p>

+ +```json +{ + "column_name": "date", + "column_type": "datetime", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": "2013-05-18 04:54:11", + "max": "2013-06-20 10:01:41", + "mean": "2013-05-27 18:03:39", + "median": "2013-05-23 11:55:50", + "std": "11 days, 4:57:32.322450", + "histogram": { + "hist": [ + 318776, + 393036, + 173904, + 0, + 0, + 0, + 0, + 0, + 0, + 206284 + ], + "bin_edges": [ + "2013-05-18 04:54:11", + "2013-05-21 12:36:57", + "2013-05-24 20:19:43", + "2013-05-28 04:02:29", + "2013-05-31 11:45:15", + "2013-06-03 19:28:01", + "2013-06-07 03:10:47", + "2013-06-10 10:53:33", + "2013-06-13 18:36:19", + "2013-06-17 02:19:05", + "2013-06-20 10:01:41" + ] + } + } +} +``` + +
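The `min`, `max`, `mean`, `median`, and `bin_edges` values are plain strings. A consumer can parse them back into datetime objects (a sketch; the `"%Y-%m-%d %H:%M:%S"` format is an assumption based on the strings shown above):

```python
from datetime import datetime

# first and last edges copied from the example response above
bin_edges = ["2013-05-18 04:54:11", "2013-06-20 10:01:41"]
parsed = [datetime.strptime(edge, "%Y-%m-%d %H:%M:%S") for edge in bin_edges]
print(parsed[-1] - parsed[0])  # 33 days, 5:07:30 -- the span covered by the histogram
```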

+</p> +</details>
+ From c68efb7f744f8428b039281e7de847e03b999700 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 9 Jan 2025 13:21:49 +0100 Subject: [PATCH 21/40] fix + add tz string (%Z) to formats --- libs/libcommon/src/libcommon/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index 837d595d7c..b2052f0265 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -109,29 +109,36 @@ def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str def identify_datetime_format(datetime_string: str) -> Optional[str]: # Common datetime formats common_formats = [ + "%Y-%m-%dT%H:%M:%S%Z", "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%d %H:%M:%S%Z", "%Y-%m-%d %H:%M:%S%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d", + "%d-%m-%Y %H:%M:%S%Z", "%d-%m-%Y %H:%M:%S%z", "%d-%m-%Y %H:%M:%S", "%d-%m-%Y %H:%M", "%d-%m-%Y", + "%m-%d-%Y %H:%M:%S%Z", "%m-%d-%Y %H:%M:%S%z", "%m-%d-%Y %H:%M:%S", "%m-%d-%Y %H:%M", "%m-%d-%Y", + "%Y/%m/%d %H:%M:%S%Z", "%Y/%m/%d %H:%M:%S%z", "%Y/%m/%d %H:%M:%S", "%Y/%m/%d %H:%M", "%Y/%m/%d", + "%d/%m/%Y %H:%M:%S%Z", "%d/%m/%Y %H:%M:%S%z", "%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M", "%d/%m/%Y", + "%m/%d/%Y %H:%M:%S%Z", "%m/%d/%Y %H:%M:%S%z", "%m/%d/%Y %H:%M:%S", "%m/%d/%Y %H:%M", @@ -143,7 +150,7 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]: for fmt in common_formats: try: _ = datetime.strptime(datetime_string, fmt) - if fmt.endswith("%z") and any(datetime_string.endswith(timezone) for timezone in ["Z", "UTC", "ACST"]): + if fmt.endswith("%z") and any(datetime_string.endswith(tz) for tz in ["Z", "ACST"]): fmt = f"{fmt.rstrip('%z')}%Z" return fmt except ValueError: From 351ef5cfeb0b47d3e504b1632db524a56e4c8a30 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 9 Jan 2025 14:22:29 +0100 Subject: [PATCH 22/40] test for string timezone not sure it works as expected --- .../tests/fixtures/statistics_dataset.py | 56 ++++++++++++++----- .../worker/tests/test_statistics_utils.py | 38 +++++++++++-- 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py index 8768f854f0..7215472ca2 100644 --- a/services/worker/tests/fixtures/statistics_dataset.py +++ b/services/worker/tests/fixtures/statistics_dataset.py @@ -1703,19 +1703,6 @@ def null_column(n_samples: int) -> list[None]: datetime_dataset = Dataset.from_dict( { - "datetime": [ - datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"), - datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"), - ], "datetime_string": [ "2024-01-01 00:00:00", "2024-01-02 00:00:00", @@ -1742,6 +1729,45 @@ def null_column(n_samples: int) -> list[None]: "2024-01-10 00:00:00Z", "2024-01-11 00:00:00Z", ], + "datetime_string_cet": [ + "2024-01-01 
00:00:00CET", + "2024-01-02 00:00:00CET", + "2024-01-03 00:00:00CET", + "2024-01-04 00:00:00CET", + "2024-01-05 00:00:00CET", + "2024-01-06 00:00:00CET", + "2024-01-07 00:00:00CET", + "2024-01-08 00:00:00CET", + "2024-01-09 00:00:00CET", + "2024-01-10 00:00:00CET", + "2024-01-11 00:00:00CET", + ], + "datetime_string_tz": [ + "2024-01-01 00:00:00+0200", + "2024-01-02 00:00:00+0200", + "2024-01-03 00:00:00+0200", + "2024-01-04 00:00:00+0200", + "2024-01-05 00:00:00+0200", + "2024-01-06 00:00:00+0200", + "2024-01-07 00:00:00+0200", + "2024-01-08 00:00:00+0200", + "2024-01-09 00:00:00+0200", + "2024-01-10 00:00:00+0200", + "2024-01-11 00:00:00+0200", + ], + "datetime": [ + datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"), + datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"), + ], "datetime_tz": [ datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"), @@ -1772,9 +1798,11 @@ def null_column(n_samples: int) -> list[None]: }, features=Features( { - "datetime": Value("timestamp[s]"), "datetime_string": Value("string"), "datetime_string_z": Value("string"), + "datetime_string_cet": Value("string"), + "datetime_string_tz": Value("string"), + "datetime": Value("timestamp[s]"), "datetime_tz": Value("timestamp[s, tz=+02:00]"), "datetime_null": Value("timestamp[s]"), "datetime_all_null": Value("timestamp[s]"), diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index 5b85ddc340..86ec8167f8 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -510,17 +510,34 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name if column_name == "datetime_tz": bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges] minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200" + elif column_name == "datetime_string_tz": + # switch everything to two hours earlier in UTC timezone + minv = "2023-12-31 22:00:00+0000" + maxv = "2024-01-10 22:00:00+0000" + mean = "2024-01-05 22:00:00+0000" + median = "2024-01-05 22:00:00+0000" + bin_edges = [ + "2023-12-31 22:00:00+0000", + "2024-01-01 22:00:01+0000", + "2024-01-02 22:00:02+0000", + "2024-01-03 22:00:03+0000", + "2024-01-04 22:00:04+0000", + "2024-01-05 22:00:05+0000", + "2024-01-06 22:00:06+0000", + "2024-01-07 22:00:07+0000", + "2024-01-08 22:00:08+0000", + "2024-01-09 22:00:09+0000", + "2024-01-10 22:00:00+0000", + ] # compute std seconds_in_day = 24 * 60 * 60 - if column_name in ["datetime", "datetime_string", "datetime_string_z", "datetime_tz"]: - timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day)) - hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1] - elif column_name == "datetime_null": + if column_name == "datetime_null": timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day)) # take 
every other day hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1] else: - raise ValueError("Incorrect column") + timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day)) + hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1] std = timedeltas.std() std_str = str(datetime.timedelta(seconds=std)) @@ -542,7 +559,16 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name @pytest.mark.parametrize( "column_name", - ["datetime", "datetime_string", "datetime_string_z", "datetime_tz", "datetime_null", "datetime_all_null"], + [ + "datetime", + "datetime_string", + "datetime_string_z", + "datetime_string_cet", + "datetime_string_tz", + "datetime_tz", + "datetime_null", + "datetime_all_null", + ], ) def test_datetime_statistics( column_name: str, From 787ad3bbdb36735d4a81e8f204fff8e05860c756 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 10 Jan 2025 13:23:41 +0100 Subject: [PATCH 23/40] try to debug test fail is not reproduced locally --- services/worker/src/worker/statistics_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index b1322ce66a..6711c61a9c 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -487,6 +487,9 @@ def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[st if _is_datetime: formats = [identify_datetime_format(value) for value in values] if len(set(formats)) == 1: + datetime_format = formats[0] + if not datetime_format: + raise ValueError("Values are datetime but format is not identified") return True, formats[0] raise StatisticsComputationError("Multiple datetime formats detected. ") From 5163500b092b0950e29b82ab76634d386df2553a Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 10 Jan 2025 16:24:10 +0100 Subject: [PATCH 24/40] test identify_datetime_format to debug why test is not passed in CI but works locally --- libs/libcommon/tests/test_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py index d929e8d7de..fef9784e56 100644 --- a/libs/libcommon/tests/test_utils.py +++ b/libs/libcommon/tests/test_utils.py @@ -16,6 +16,7 @@ get_datetime, get_duration, get_expires, + identify_datetime_format, inputs_to_string, is_image_url, orjson_dumps, @@ -153,3 +154,13 @@ def test_serialize_and_truncate_raises(obj: Any, max_bytes: int) -> None: def test_get_duration() -> None: assert get_duration(get_datetime() - timedelta(seconds=10)) == pytest.approx(10, rel=0.01) + + +@pytest.mark.parametrize( + "datetime_string,expected_format", + [ + ("2024-01-01 00:00:00CET", "%Y-%m-%d %H:%M:%S%Z"), + ], +) +def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None: + assert identify_datetime_format(datetime_string) == expected_format From 033e29e5b9672599ed7562a11fa23e4fdaebf4ce Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 13 Jan 2025 12:19:52 +0100 Subject: [PATCH 25/40] test datetime.strptime --- libs/libcommon/tests/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py index fef9784e56..67bf33cee0 100644 --- a/libs/libcommon/tests/test_utils.py +++ b/libs/libcommon/tests/test_utils.py @@ -163,4 +163,5 @@ def test_get_duration() -> None: ], ) def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None: + assert datetime.strptime(datetime_string, 
expected_format), "datetime error" assert identify_datetime_format(datetime_string) == expected_format From 349b65185010eb98bc927c00dc842f0c4b16cf04 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Mon, 13 Jan 2025 13:14:31 +0100 Subject: [PATCH 26/40] test --- libs/libcommon/tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py index 67bf33cee0..05b294b064 100644 --- a/libs/libcommon/tests/test_utils.py +++ b/libs/libcommon/tests/test_utils.py @@ -159,7 +159,7 @@ def test_get_duration() -> None: @pytest.mark.parametrize( "datetime_string,expected_format", [ - ("2024-01-01 00:00:00CET", "%Y-%m-%d %H:%M:%S%Z"), + ("2024-01-01 00:00:00 CET", "%Y-%m-%d %H:%M:%S %Z"), ], ) def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None: From 6c60c273499468dcb306d6ae69f796188746effc Mon Sep 17 00:00:00 2001 From: Polina Kazakova Date: Wed, 15 Jan 2025 12:17:31 +0100 Subject: [PATCH 27/40] Update services/worker/src/worker/statistics_utils.py Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- services/worker/src/worker/statistics_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 6711c61a9c..3514771166 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -479,9 +479,9 @@ def is_class(n_unique: int, n_samples: int) -> bool: @staticmethod def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[str]]: - """Check if first 1000 non-null samples in a column match datetime format. If true, also return datetime format""" + """Check if first 100 non-null samples in a column match datetime format. 
If true, also return datetime format""" - values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list() + values = data.filter(pl.col(column_name).is_not_null()).head(100)[column_name].to_list() _is_datetime = all(is_datetime(value) for value in values) if len(values) > 0 else False if _is_datetime: From db10500ad471e39b61c98c5b72b945b62d276c7a Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 15 Jan 2025 14:37:40 +0100 Subject: [PATCH 28/40] keep original timezone for string dates polars switches dates to utc when casting from string while we want to preserve original dates --- libs/libcommon/src/libcommon/utils.py | 6 ++++- .../worker/src/worker/statistics_utils.py | 9 +++++-- .../tests/fixtures/statistics_dataset.py | 26 +++++++++---------- .../worker/tests/test_statistics_utils.py | 23 ++-------------- 4 files changed, 27 insertions(+), 37 deletions(-) diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index b2052f0265..3e8134da64 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -7,7 +7,7 @@ import mimetypes import time from collections.abc import Callable, Sequence -from datetime import datetime, timedelta, timezone +from datetime import datetime, timedelta, timezone, tzinfo from fnmatch import fnmatch from pathlib import Path from typing import Any, Optional, TypeVar, Union, cast @@ -102,6 +102,10 @@ def is_datetime(string: str) -> bool: return False +def get_timezone(string: str) -> Optional[tzinfo]: + return parser.parse(string).tzinfo + + def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str: return dt.strftime(format) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 3514771166..9bc5a2a343 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -15,7 +15,7 @@ from libcommon.exceptions import ( StatisticsComputationError, ) -from libcommon.utils import datetime_to_string, identify_datetime_format, is_datetime +from libcommon.utils import datetime_to_string, get_timezone, identify_datetime_format, is_datetime from PIL import Image from tqdm.contrib.concurrent import thread_map @@ -490,7 +490,7 @@ def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[st datetime_format = formats[0] if not datetime_format: raise ValueError("Values are datetime but format is not identified") - return True, formats[0] + return True, datetime_format raise StatisticsComputationError("Multiple datetime formats detected. 
") return False, None @@ -792,7 +792,9 @@ def _compute_statistics( std=None, histogram=None, ) + original_timezone = None if isinstance(data[column_name].dtype, pl.String): + original_timezone = get_timezone(data[column_name][0]) data = data.with_columns(pl.col(column_name).str.to_datetime(format=format)) min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly @@ -811,6 +813,9 @@ def _compute_statistics( assert timedelta_stats["median"] is not None # nosec assert timedelta_stats["std"] is not None # nosec + if original_timezone: + min_date = min_date.astimezone(original_timezone) + datetime_bin_edges = [ cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"] ] diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py index 7215472ca2..9033a9c470 100644 --- a/services/worker/tests/fixtures/statistics_dataset.py +++ b/services/worker/tests/fixtures/statistics_dataset.py @@ -1729,18 +1729,18 @@ def null_column(n_samples: int) -> list[None]: "2024-01-10 00:00:00Z", "2024-01-11 00:00:00Z", ], - "datetime_string_cet": [ - "2024-01-01 00:00:00CET", - "2024-01-02 00:00:00CET", - "2024-01-03 00:00:00CET", - "2024-01-04 00:00:00CET", - "2024-01-05 00:00:00CET", - "2024-01-06 00:00:00CET", - "2024-01-07 00:00:00CET", - "2024-01-08 00:00:00CET", - "2024-01-09 00:00:00CET", - "2024-01-10 00:00:00CET", - "2024-01-11 00:00:00CET", + "datetime_string_t_z": [ + "2024-01-01T00:00:00Z", + "2024-01-02T00:00:00Z", + "2024-01-03T00:00:00Z", + "2024-01-04T00:00:00Z", + "2024-01-05T00:00:00Z", + "2024-01-06T00:00:00Z", + "2024-01-07T00:00:00Z", + "2024-01-08T00:00:00Z", + "2024-01-09T00:00:00Z", + "2024-01-10T00:00:00Z", + "2024-01-11T00:00:00Z", ], "datetime_string_tz": [ "2024-01-01 00:00:00+0200", @@ -1800,7 +1800,7 @@ def null_column(n_samples: int) -> list[None]: { "datetime_string": Value("string"), "datetime_string_z": Value("string"), - "datetime_string_cet": Value("string"), + "datetime_string_t_z": Value("string"), "datetime_string_tz": Value("string"), "datetime": Value("timestamp[s]"), "datetime_tz": Value("timestamp[s, tz=+02:00]"), diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index 86ec8167f8..a4bd281fe7 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -507,28 +507,9 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name "2024-01-10 00:00:09", "2024-01-11 00:00:00", ] - if column_name == "datetime_tz": + if column_name in ["datetime_tz", "datetime_string_tz"]: bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges] minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200" - elif column_name == "datetime_string_tz": - # switch everything to two hours earlier in UTC timezone - minv = "2023-12-31 22:00:00+0000" - maxv = "2024-01-10 22:00:00+0000" - mean = "2024-01-05 22:00:00+0000" - median = "2024-01-05 22:00:00+0000" - bin_edges = [ - "2023-12-31 22:00:00+0000", - "2024-01-01 22:00:01+0000", - "2024-01-02 22:00:02+0000", - "2024-01-03 22:00:03+0000", - "2024-01-04 22:00:04+0000", - "2024-01-05 22:00:05+0000", - "2024-01-06 22:00:06+0000", - "2024-01-07 22:00:07+0000", - "2024-01-08 22:00:08+0000", - "2024-01-09 22:00:09+0000", - "2024-01-10 22:00:00+0000", - ] # compute std seconds_in_day = 24 * 60 * 60 @@ -563,7 +544,7 @@ 
def count_expected_statistics_for_datetime_column(column: pd.Series, column_name "datetime", "datetime_string", "datetime_string_z", - "datetime_string_cet", + "datetime_string_t_z", "datetime_string_tz", "datetime_tz", "datetime_null", From 8794b7ae3b17329f6e73b0e35083f2d26e5a54b9 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 15 Jan 2025 15:24:22 +0100 Subject: [PATCH 29/40] let polars identify datetime format by itself provide manually only in case of failure --- services/worker/src/worker/statistics_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 9bc5a2a343..7afa2e996c 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -795,7 +795,11 @@ def _compute_statistics( original_timezone = None if isinstance(data[column_name].dtype, pl.String): original_timezone = get_timezone(data[column_name][0]) - data = data.with_columns(pl.col(column_name).str.to_datetime(format=format)) + # let polars identify format itself. provide manually in case of error + try: + data = data.with_columns(pl.col(column_name).str.to_datetime()) + except pl.ComputeError: + data = data.with_columns(pl.col(column_name).str.to_datetime(format=format)) min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly timedelta_column_name = f"{column_name}_timedelta" @@ -806,7 +810,7 @@ def _compute_statistics( column_name=timedelta_column_name, n_samples=n_samples, ) - # to assure mypy that there values are not None to pass to conversion functions: + # to assure mypy that these values are not None to pass to conversion functions: assert timedelta_stats["histogram"] is not None # nosec assert timedelta_stats["max"] is not None # nosec assert timedelta_stats["mean"] is not None # nosec From e0e7c91989c9018615683265e4a35647bb700b16 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 15 Jan 2025 15:24:49 +0100 Subject: [PATCH 30/40] do not display +0000 in timestamps (if timezone is UTC) --- libs/libcommon/src/libcommon/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index 3e8134da64..c540d77e48 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -7,7 +7,7 @@ import mimetypes import time from collections.abc import Callable, Sequence -from datetime import datetime, timedelta, timezone, tzinfo +from datetime import datetime, timedelta, timezone from fnmatch import fnmatch from pathlib import Path from typing import Any, Optional, TypeVar, Union, cast @@ -102,11 +102,13 @@ def is_datetime(string: str) -> bool: return False -def get_timezone(string: str) -> Optional[tzinfo]: +def get_timezone(string: str) -> Any: return parser.parse(string).tzinfo def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str: + if dt.utcoffset() == timedelta(0): + format = "%Y-%m-%d %H:%M:%S" # do not display +0000 return dt.strftime(format) @@ -154,8 +156,6 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]: for fmt in common_formats: try: _ = datetime.strptime(datetime_string, fmt) - if fmt.endswith("%z") and any(datetime_string.endswith(tz) for tz in ["Z", "ACST"]): - fmt = f"{fmt.rstrip('%z')}%Z" return fmt except ValueError: continue From 8afade1c9bbaf1502e70b5567d7f1c50d20c78a6 
Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 15 Jan 2025 15:59:23 +0100 Subject: [PATCH 31/40] remove utils test --- libs/libcommon/tests/test_utils.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py index 05b294b064..67e909f1c6 100644 --- a/libs/libcommon/tests/test_utils.py +++ b/libs/libcommon/tests/test_utils.py @@ -154,14 +154,3 @@ def test_serialize_and_truncate_raises(obj: Any, max_bytes: int) -> None: def test_get_duration() -> None: assert get_duration(get_datetime() - timedelta(seconds=10)) == pytest.approx(10, rel=0.01) - - -@pytest.mark.parametrize( - "datetime_string,expected_format", - [ - ("2024-01-01 00:00:00 CET", "%Y-%m-%d %H:%M:%S %Z"), - ], -) -def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None: - assert datetime.strptime(datetime_string, expected_format), "datetime error" - assert identify_datetime_format(datetime_string) == expected_format From 341676c5c0001d5afda908192689000e5aba653d Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 15 Jan 2025 15:59:29 +0100 Subject: [PATCH 32/40] refactor: identify datetime format manually only when polars failed --- .../worker/src/worker/statistics_utils.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 7afa2e996c..653eb79aac 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -478,22 +478,11 @@ def is_class(n_unique: int, n_samples: int) -> bool: ) or n_unique <= NUM_BINS @staticmethod - def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[str]]: - """Check if first 100 non-null samples in a column match datetime format. If true, also return datetime format""" + def is_datetime(data: pl.DataFrame, column_name: str) -> bool: + """Check if first 100 non-null samples in a column match datetime format.""" values = data.filter(pl.col(column_name).is_not_null()).head(100)[column_name].to_list() - _is_datetime = all(is_datetime(value) for value in values) if len(values) > 0 else False - - if _is_datetime: - formats = [identify_datetime_format(value) for value in values] - if len(set(formats)) == 1: - datetime_format = formats[0] - if not datetime_format: - raise ValueError("Values are datetime but format is not identified") - return True, datetime_format - raise StatisticsComputationError("Multiple datetime formats detected. 
") - - return False, None + return all(is_datetime(value) for value in values) if len(values) > 0 else False @classmethod def compute_transformed_data( @@ -515,13 +504,11 @@ def _compute_statistics( ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]: nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples) n_unique = data[column_name].n_unique() - _is_datetime, datetime_format = cls.is_datetime(data, column_name) - if _is_datetime: + if cls.is_datetime(data, column_name): datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics( data, column_name=column_name, n_samples=n_samples, - format=datetime_format, ) return datetime_stats @@ -772,13 +759,27 @@ def compute_transformed_data( def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str: return datetime_to_string(base_date + datetime.timedelta(seconds=seconds)) + @staticmethod + def get_format(data: pl.DataFrame, column_name: str) -> str: + values = data.filter(pl.col(column_name).is_not_null()).head(100)[column_name].to_list() + formats = [identify_datetime_format(value) for value in values] + if len(set(formats)) == 1: + datetime_format = formats[0] + if not datetime_format: + raise StatisticsComputationError( + f"Values are datetime but format is not identified. Example: {values[0]}. " + ) + else: + raise StatisticsComputationError("Multiple datetime formats detected. ") + + return datetime_format + @classmethod def _compute_statistics( cls, data: pl.DataFrame, column_name: str, n_samples: int, - format: Optional[str] = None, ) -> DatetimeStatisticsItem: nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples) if nan_count == n_samples: # all values are None @@ -799,7 +800,8 @@ def _compute_statistics( try: data = data.with_columns(pl.col(column_name).str.to_datetime()) except pl.ComputeError: - data = data.with_columns(pl.col(column_name).str.to_datetime(format=format)) + datetime_format = cls.get_format(data, column_name) + data = data.with_columns(pl.col(column_name).str.to_datetime(format=datetime_format)) min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly timedelta_column_name = f"{column_name}_timedelta" @@ -838,10 +840,8 @@ def _compute_statistics( ), ) - def compute_and_prepare_response( - self, data: pl.DataFrame, format: Optional[str] = None - ) -> StatisticsPerColumnItem: - stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples, format=format) + def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem: + stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples) return StatisticsPerColumnItem( column_name=self.name, column_type=ColumnType.DATETIME, From 3b5d9506df1a439f1c23bca26a06f2164b8f4481 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 16 Jan 2025 12:34:50 +0100 Subject: [PATCH 33/40] style --- libs/libcommon/tests/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py index 67e909f1c6..d929e8d7de 100644 --- a/libs/libcommon/tests/test_utils.py +++ b/libs/libcommon/tests/test_utils.py @@ -16,7 +16,6 @@ get_datetime, get_duration, get_expires, - identify_datetime_format, inputs_to_string, is_image_url, orjson_dumps, From 21977db52bb36c97f4a4983ec0ebefe5df2f18f5 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 16 Jan 2025 12:35:01 +0100 
Subject: [PATCH 34/40] log formats in error message --- services/worker/src/worker/statistics_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 653eb79aac..3453ff825e 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -770,7 +770,7 @@ def get_format(data: pl.DataFrame, column_name: str) -> str: f"Values are datetime but format is not identified. Example: {values[0]}. " ) else: - raise StatisticsComputationError("Multiple datetime formats detected. ") + raise StatisticsComputationError(f"Multiple datetime formats detected: {set(formats)}. ") return datetime_format From 0ee76bfe623c36cca66d466e951b7b7c5a532992 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 16 Jan 2025 14:19:40 +0100 Subject: [PATCH 35/40] update openapi specs --- docs/source/openapi.json | 427 +++++++++++++++++++++++++- libs/libcommon/src/libcommon/utils.py | 5 + 2 files changed, 430 insertions(+), 2 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index 5844fda06a..c51a928009 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1176,7 +1176,8 @@ "bool", "list", "audio", - "image" + "image", + "datetime" ] }, "Histogram": { @@ -1197,6 +1198,24 @@ } } }, + "DatetimeHistogram": { + "type": "object", + "required": ["hist", "bin_edges"], + "properties": { + "hist": { + "type": "array", + "items": { + "type": "string" + } + }, + "bin_edges": { + "type": "array", + "items": { + "type": "number" + } + } + } + }, "NumericalStatisticsItem": { "type": "object", "required": [ @@ -1229,6 +1248,38 @@ } } }, + "DatetimeStatisticsItem": { + "type": "object", + "required": [ + "nan_count", + "nan_proportion", + "min", + "max", + "mean", + "median", + "std", + "histogram" + ], + "properties": { + "nan_count": { + "type": "integer" + }, + "nan_proportion": { + "type": "number" + }, + "min": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, + "max": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, + "mean": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, + "median": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, + "std": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, + "histogram": { + "oneOf": [ + { "$ref": "#/components/schemas/DatetimeHistogram" }, + { "type": "null" } + ] + } + } + }, "CategoricalStatisticsItem": { "type": "object", "description": "note that fields 'no_label_count' and 'no_label_proportion' are not required, because some old entries still miss them, and we don't want to recompute all of them. See https://github.com/huggingface/dataset-viewer/issues/2573.", @@ -1280,12 +1331,15 @@ { "$ref": "#/components/schemas/NumericalStatisticsItem" }, + { + "$ref": "#/components/schemas/DatetimeStatisticsItem" + }, { "$ref": "#/components/schemas/CategoricalStatisticsItem" }, { "$ref": "#/components/schemas/BoolStatisticsItem" - } + }, ] }, "StatisticsPerColumnItem": { @@ -5925,6 +5979,375 @@ "partial": false } }, + "A split (CL-ETM/datetimeevents) with a datetime column": { + "summary": "Statistics on a split with datetime columns 'charttime', 'storetime' and 'value'. 
", + "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=CL-ETM/datetimeevents&config=mnist&split=train.", + "value": { + "num_examples": 6653174, + "statistics": [ + { + "column_name": "caregiver_id", + "column_type": "int", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 45, + "max": 99872, + "mean": 49146.20367, + "median": 46354.0, + "std": 28893.09204, + "histogram": { + "hist": [ + 586864, + 696061, + 882127, + 627295, + 759981, + 594546, + 544977, + 653948, + 507192, + 800183 + ], + "bin_edges": [ + 45, + 10028, + 20011, + 29994, + 39977, + 49960, + 59943, + 69926, + 79909, + 89892, + 99872 + ] + } + } + }, + { + "column_name": "charttime", + "column_type": "datetime", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": "2110-01-13 09:39:00", + "max": "2214-07-26 08:00:00", + "mean": "2153-03-20 23:15:24", + "median": "2153-01-19 04:19:30", + "std": "8691 days, 20:22:21.464930", + "histogram": { + "hist": [ + 644662, + 824869, + 883173, + 884980, + 861445, + 863916, + 838647, + 664347, + 156213, + 30922 + ], + "bin_edges": [ + "2110-01-13 09:39:00", + "2120-06-27 07:05:07", + "2130-12-10 04:31:14", + "2141-05-24 01:57:21", + "2151-11-05 23:23:28", + "2162-04-19 20:49:35", + "2172-10-01 18:15:42", + "2183-03-16 15:41:49", + "2193-08-28 13:07:56", + "2204-02-11 10:34:03", + "2214-07-26 08:00:00" + ] + } + } + }, + { + "column_name": "hadm_id", + "column_type": "int", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 20000094, + "max": 29999828, + "mean": 25027899.88926, + "median": 25052613.0, + "std": 2869146.55704, + "histogram": { + "hist": [ + 638196, + 656157, + 656168, + 661133, + 678335, + 693220, + 676587, + 653053, + 674626, + 665699 + ], + "bin_edges": [ + 20000094, + 21000068, + 22000042, + 23000016, + 23999990, + 24999964, + 25999938, + 26999912, + 27999886, + 28999860, + 29999828 + ] + } + } + }, + { + "column_name": "itemid", + "column_type": "int", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 224183, + "max": 230120, + "mean": 225487.4805, + "median": 224290.0, + "std": 1820.04267, + "histogram": { + "hist": [ + 3742726, + 568047, + 1012645, + 75427, + 21011, + 41780, + 311155, + 100074, + 249544, + 530765 + ], + "bin_edges": [ + 224183, + 224777, + 225371, + 225965, + 226559, + 227153, + 227747, + 228341, + 228935, + 229529, + 230120 + ] + } + } + }, + { + "column_name": "stay_id", + "column_type": "int", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 30000153, + "max": 39999858, + "mean": 34988877.57506, + "median": 34997302.0, + "std": 2873138.27766, + "histogram": { + "hist": [ + 669019, + 638622, + 695479, + 665010, + 659205, + 659496, + 696313, + 662500, + 671230, + 636300 + ], + "bin_edges": [ + 30000153, + 31000124, + 32000095, + 33000066, + 34000037, + 35000008, + 35999979, + 36999950, + 37999921, + 38999892, + 39999858 + ] + } + } + }, + { + "column_name": "storetime", + "column_type": "datetime", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": "2110-01-13 13:13:00", + "max": "2214-07-26 09:20:00", + "mean": "2153-03-20 23:57:17", + "median": "2153-01-19 03:42:00", + "std": "8691 days, 20:22:32.902370", + "histogram": { + "hist": [ + 644728, + 824803, + 883215, + 884951, + 861438, + 863915, + 838652, + 664336, + 156214, + 30922 + ], + "bin_edges": [ + "2110-01-13 13:13:00", + "2120-06-27 10:25:43", + "2130-12-10 07:38:26", + "2141-05-24 04:51:09", + "2151-11-06 
02:03:52", + "2162-04-19 23:16:35", + "2172-10-01 20:29:18", + "2183-03-16 17:42:01", + "2193-08-28 14:54:44", + "2204-02-11 12:07:27", + "2214-07-26 09:20:00" + ] + } + } + }, + { + "column_name": "subject_id", + "column_type": "int", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 10000032, + "max": 16657691, + "mean": 13340551.62433, + "median": 13334004.0, + "std": 1927957.39956, + "histogram": { + "hist": [ + 638347, + 684908, + 691450, + 631212, + 672810, + 659625, + 641987, + 654011, + 702989, + 675835 + ], + "bin_edges": [ + 10000032, + 10665798, + 11331564, + 11997330, + 12663096, + 13328862, + 13994628, + 14660394, + 15326160, + 15991926, + 16657691 + ] + } + } + }, + { + "column_name": "value", + "column_type": "datetime", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": "2109-08-02 00:00:00", + "max": "2214-07-24 09:57:00", + "mean": "2153-03-17 00:32:04", + "median": "2153-01-15 00:00:00", + "std": "8691 days, 20:07:56.642090", + "histogram": { + "hist": [ + 611811, + 820557, + 897262, + 880309, + 876200, + 860348, + 845238, + 673106, + 157352, + 30991 + ], + "bin_edges": [ + "2109-08-02 00:00:00", + "2120-01-31 03:23:43", + "2130-07-31 06:47:26", + "2141-01-28 10:11:09", + "2151-07-29 13:34:52", + "2162-01-26 16:58:35", + "2172-07-26 20:22:18", + "2183-01-24 23:46:01", + "2193-07-25 03:09:44", + "2204-01-24 06:33:27", + "2214-07-24 09:57:00" + ] + } + } + }, + { + "column_name": "valueuom", + "column_type": "string_label", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "no_label_count": 0, + "no_label_proportion": 0.0, + "n_unique": 2, + "frequencies": { + "Date and Time": 1885855, + "Date": 4767319 + } + } + }, + { + "column_name": "warning", + "column_type": "int", + "column_statistics": { + "nan_count": 0, + "nan_proportion": 0.0, + "min": 0, + "max": 1, + "mean": 0.00028, + "median": 0.0, + "std": 0.01674, + "histogram": { + "hist": [ + 6651308, + 1866 + ], + "bin_edges": [ + 0, + 1, + 1 + ] + } + } + } + ], + "partial": true + } + }, "A split (nyu-mll/glue) with a string (text) column": { "summary": "Statistics on a string column. 
The column 'hypothesis' contains more than 30 different strings, so the statistics are a histogram of the string lengths.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=ax&split=test.", diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py index c540d77e48..c86077d583 100644 --- a/libs/libcommon/src/libcommon/utils.py +++ b/libs/libcommon/src/libcommon/utils.py @@ -151,6 +151,11 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]: "%m/%d/%Y", "%B %d, %Y", "%d %B %Y", + "%m-%Y", + "%Y-%m", + "%m/%Y", + "%Y/%m", + "%Y", ] for fmt in common_formats: From b7fee0bdb643a5f8d0e12e6b32c5ba828951afef Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 16 Jan 2025 14:20:39 +0100 Subject: [PATCH 36/40] fallback to string stats if datetime didn't work + test --- services/worker/src/worker/statistics_utils.py | 18 ++++++++++++------ .../tests/fixtures/statistics_dataset.py | 14 ++++++++++++++ services/worker/tests/test_statistics_utils.py | 8 +++++++- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 3453ff825e..590d9286b9 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -505,12 +505,18 @@ def _compute_statistics( nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples) n_unique = data[column_name].n_unique() if cls.is_datetime(data, column_name): - datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics( - data, - column_name=column_name, - n_samples=n_samples, - ) - return datetime_stats + try: + stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics( + data, + column_name=column_name, + n_samples=n_samples, + ) + return stats + except Exception as error: + logging.info( + f"Column {column_name} is datetime, but datetime stats compute failed ({error}), " + f"compute string stats instead. 
" + ) if cls.is_class(n_unique, n_samples): labels2counts: dict[str, int] = value_counts(data, column_name) if nan_count != n_samples else {} diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py index 9033a9c470..416a68f1a2 100644 --- a/services/worker/tests/fixtures/statistics_dataset.py +++ b/services/worker/tests/fixtures/statistics_dataset.py @@ -1755,6 +1755,19 @@ def null_column(n_samples: int) -> list[None]: "2024-01-10 00:00:00+0200", "2024-01-11 00:00:00+0200", ], + "datetime_string_error": [ + "16/01/2023", + "17/01/2023", + "18/01/2023", + "19/01/2023", + "01/2023", + "02/2023", + "20/01/2023", + "21/01/2023", + "03/2023", + "25/01/2023", + "26/01/2023", + ], "datetime": [ datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"), @@ -1802,6 +1815,7 @@ def null_column(n_samples: int) -> list[None]: "datetime_string_z": Value("string"), "datetime_string_t_z": Value("string"), "datetime_string_tz": Value("string"), + "datetime_string_error": Value("string"), "datetime": Value("timestamp[s]"), "datetime_tz": Value("timestamp[s, tz=+02:00]"), "datetime_null": Value("timestamp[s]"), diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py index a4bd281fe7..5b3ea88417 100644 --- a/services/worker/tests/test_statistics_utils.py +++ b/services/worker/tests/test_statistics_utils.py @@ -489,6 +489,10 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name "histogram": None, } + # testcase contains multiple datetime formats, and we compute string lengths distributions instead of error + if column_name == "datetime_string_error": + return count_expected_statistics_for_string_column(column) + # hardcode expected values minv = "2024-01-01 00:00:00" maxv = "2024-01-11 00:00:00" @@ -546,6 +550,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name "datetime_string_z", "datetime_string_t_z", "datetime_string_tz", + "datetime_string_error", "datetime_tz", "datetime_null", "datetime_all_null", @@ -569,8 +574,9 @@ def test_datetime_statistics( column_name=column_name, n_samples=len(data[column_name]), ) + computed_std, expected_std = computed.pop("std"), expected.pop("std") - if computed_std: + if computed_std and column_name != "datetime_string_error": assert computed_std.split(".")[0] == expected_std.split(".")[0] # check with precision up to seconds else: assert computed_std == expected_std From 6a76dd9e35a3155b5153389b4016bf67b5b4e158 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 16 Jan 2025 14:48:07 +0100 Subject: [PATCH 37/40] fix test --- .../tests/job_runners/split/test_descriptive_statistics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py index 4aa1c68900..0837b73b60 100644 --- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py +++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py @@ -341,7 +341,7 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A statistics = count_expected_statistics_for_datetime_column(column=df[column_name], column_name=column_name) expected_statistics[column_name] = { "column_name": column_name, - "column_type": ColumnType.DATETIME, + "column_type": ColumnType.DATETIME if 
column_name != "datetime_string_error" else ColumnType.STRING_TEXT, "column_statistics": statistics, } return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False} From f3eefea1368768bf427b0013e187dc858874411b Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 16 Jan 2025 15:00:01 +0100 Subject: [PATCH 38/40] update docs --- docs/source/statistics.md | 20 +++++++++------ services/worker/README.md | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 8 deletions(-) diff --git a/docs/source/statistics.md b/docs/source/statistics.md index a849a11638..15e820da8a 100644 --- a/docs/source/statistics.md +++ b/docs/source/statistics.md @@ -165,7 +165,7 @@ The response JSON contains three keys: ## Response structure by data type -Currently, statistics are supported for strings, float and integer numbers, lists, audio and image data and the special [`datasets.ClassLabel`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.ClassLabel) feature type of the [`datasets`](https://huggingface.co/docs/datasets/) library. +Currently, statistics are supported for strings, float and integer numbers, lists, datetimes, audio and image data and the special [`datasets.ClassLabel`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.ClassLabel) feature type of the [`datasets`](https://huggingface.co/docs/datasets/) library. `column_type` in response can be one of the following values: @@ -217,7 +217,7 @@ This type represents categorical data encoded as [`ClassLabel`](https://huggingf The following measures are returned for float data types: -* minimum, maximum, mean, and standard deviation values +* minimum, maximum, mean, median, and standard deviation values * number and proportion of `null` and `NaN` values (`NaN` values are treated as `null`) * histogram with 10 bins @@ -274,7 +274,7 @@ The following measures are returned for float data types: The following measures are returned for integer data types: -* minimum, maximum, mean, and standard deviation values +* minimum, maximum, mean, median, and standard deviation values * number and proportion of `null` values * histogram with less than or equal to 10 bins @@ -378,7 +378,7 @@ If the proportion of unique values in a string column within requested split is If string column does not satisfy the conditions to be treated as a `string_label`, it is considered to be a column containing texts and response contains statistics over text lengths which are calculated by character number. The following measures are computed: -* minimum, maximum, mean, and standard deviation of text lengths +* minimum, maximum, mean, median, and standard deviation of text lengths * number and proportion of `null` values * histogram of text lengths with 10 bins @@ -435,7 +435,7 @@ If string column does not satisfy the conditions to be treated as a `string_labe For lists, the distribution of their lengths is computed. The following measures are returned: -* minimum, maximum, mean, and standard deviation of lists lengths +* minimum, maximum, mean, median, and standard deviation of lists lengths * number and proportion of `null` values * histogram of lists lengths with up to 10 bins @@ -481,7 +481,7 @@ Note that dictionaries of lists are not supported. For audio data, the distribution of audio files durations is computed. 
The following measures are returned: -* minimum, maximum, mean, and standard deviation of audio files durations +* minimum, maximum, mean, median, and standard deviation of audio files durations * number and proportion of `null` values * histogram of audio files durations with 10 bins @@ -540,7 +540,7 @@ For audio data, the distribution of audio files durations is computed. The follo For image data, the distribution of images widths is computed. The following measures are returned: -* minimum, maximum, mean, and standard deviation of widths of image files +* minimum, maximum, mean, median, and standard deviation of widths of image files * number and proportion of `null` values * histogram of images widths with 10 bins @@ -595,7 +595,11 @@ For image data, the distribution of images widths is computed. The following mea ### datetime -The distribution of datetime is computed. +The distribution of datetime is computed. The following measures are returned: + +* minimum, maximum, mean, median, and standard deviation of datetimes represented as strings with precision up to seconds +* number and proportion of `null` values +* histogram of datetimes with 10 bins
 
 <details><summary>Example: </summary>
 <p>
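The datetime statistics documented above reduce to ordinary numerical statistics: values are shifted to seconds elapsed since the column minimum (visible as the `{column_name}_timedelta` helper column in the last patch of this series), the usual measures and the 10-bin histogram are computed on that offset, and the results are mapped back to datetime strings; only the standard deviation remains a timedelta. The following is a hedged, self-contained sketch of that round trip, not the worker's actual code — the helper name `datetime_stats`, the use of `numpy.histogram`, and the fixed `%Y-%m-%d %H:%M:%S` output format are assumptions for illustration:

```python
import datetime

import numpy as np
import polars as pl


def datetime_stats(df: pl.DataFrame, column: str, n_bins: int = 10) -> dict:
    # shift to "seconds elapsed since the column minimum" so that plain
    # numerical statistics can be reused, then map results back to strings
    min_date: datetime.datetime = df[column].min()  # type: ignore[assignment]
    seconds = df.select((pl.col(column) - min_date).dt.total_seconds().alias("s"))["s"]

    def back(offset: float) -> str:
        return (min_date + datetime.timedelta(seconds=offset)).strftime("%Y-%m-%d %H:%M:%S")

    hist, edges = np.histogram(seconds.to_numpy(), bins=n_bins)
    return {
        "min": back(float(seconds.min())),
        "max": back(float(seconds.max())),
        "mean": back(float(seconds.mean())),
        "median": back(float(seconds.median())),
        "std": str(datetime.timedelta(seconds=float(seconds.std()))),  # stringified timedelta
        "histogram": {"hist": hist.tolist(), "bin_edges": [back(float(e)) for e in edges]},
    }


df = pl.DataFrame({"date": ["2024-01-01 00:00:00", "2024-01-06 12:00:00", "2024-01-11 00:00:00"]})
df = df.with_columns(pl.col("date").str.to_datetime())
print(datetime_stats(df, "date")["median"])  # 2024-01-06 12:00:00
```

Keeping the offset numeric means the histogram machinery is shared with the plain numerical columns; only the edge labels change type, which is why the datetime histogram's `bin_edges` holds strings while `hist` remains integer counts — exactly the distinction the openapi fix further down restores.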
diff --git a/services/worker/README.md b/services/worker/README.md index b7722ed173..a1ae1a7555 100644 --- a/services/worker/README.md +++ b/services/worker/README.md @@ -116,6 +116,7 @@ The response has three fields: `num_examples`, `statistics`, and `partial`. `par * `list` - for lists of other data types (including lists) * `audio` - for audio data * `image` - for image data +* `datetime` - for datetime data `column_statistics` content depends on the feature type, see examples below. ##### class_label @@ -591,6 +592,59 @@ Shows distribution of image files widths.

+
+##### datetime
+
+Shows distribution of datetimes.
+
+<details><summary>example: </summary>
+<p>
+
+```python
+{
+    "column_name": "date",
+    "column_type": "datetime",
+    "column_statistics": {
+        "nan_count": 0,
+        "nan_proportion": 0.0,
+        "min": "2013-05-18 04:54:11",
+        "max": "2013-06-20 10:01:41",
+        "mean": "2013-05-27 18:03:39",
+        "median": "2013-05-23 11:55:50",
+        "std": "11 days, 4:57:32.322450",
+        "histogram": {
+            "hist": [
+                318776,
+                393036,
+                173904,
+                0,
+                0,
+                0,
+                0,
+                0,
+                0,
+                206284
+            ],
+            "bin_edges": [
+                "2013-05-18 04:54:11",
+                "2013-05-21 12:36:57",
+                "2013-05-24 20:19:43",
+                "2013-05-28 04:02:29",
+                "2013-05-31 11:45:15",
+                "2013-06-03 19:28:01",
+                "2013-06-07 03:10:47",
+                "2013-06-10 10:53:33",
+                "2013-06-13 18:36:19",
+                "2013-06-17 02:19:05",
+                "2013-06-20 10:01:41"
+            ]
+        }
+    }
+}
+```
+
+</p>
+</details>
+ ### Splits worker The `splits` worker does not need any additional configuration. From 1df95ff729b4eb8153eb9d4fe77b6e012c8c6a45 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Thu, 16 Jan 2025 15:14:24 +0100 Subject: [PATCH 39/40] fix openapi specs --- docs/source/openapi.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/openapi.json b/docs/source/openapi.json index c51a928009..d971558f42 100644 --- a/docs/source/openapi.json +++ b/docs/source/openapi.json @@ -1205,13 +1205,13 @@ "hist": { "type": "array", "items": { - "type": "string" + "type": "integer" } }, "bin_edges": { "type": "array", "items": { - "type": "number" + "type": "string" } } } @@ -1339,7 +1339,7 @@ }, { "$ref": "#/components/schemas/BoolStatisticsItem" - }, + } ] }, "StatisticsPerColumnItem": { @@ -6346,7 +6346,7 @@ } ], "partial": true - } + } }, "A split (nyu-mll/glue) with a string (text) column": { "summary": "Statistics on a string column. The column 'hypothesis' contains more than 30 different strings, so the statistics are a histogram of the string lengths.", From f9d7a8a7f8db8c89468609029ecc2604a46c8e9e Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 17 Jan 2025 12:41:33 +0100 Subject: [PATCH 40/40] fix polars timezone switching --- services/worker/src/worker/statistics_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py index 590d9286b9..1c9afe0c24 100644 --- a/services/worker/src/worker/statistics_utils.py +++ b/services/worker/src/worker/statistics_utils.py @@ -801,13 +801,14 @@ def _compute_statistics( ) original_timezone = None if isinstance(data[column_name].dtype, pl.String): - original_timezone = get_timezone(data[column_name][0]) # let polars identify format itself. provide manually in case of error try: + original_timezone = get_timezone(data[column_name][0]) data = data.with_columns(pl.col(column_name).str.to_datetime()) except pl.ComputeError: datetime_format = cls.get_format(data, column_name) data = data.with_columns(pl.col(column_name).str.to_datetime(format=datetime_format)) + original_timezone = None min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly timedelta_column_name = f"{column_name}_timedelta"