From 79790e0ed3cded901a5e22ad7adeaac0ce9e3d67 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 31 Jul 2024 14:37:42 +0200
Subject: [PATCH 01/40] compute stats for datetimes
---
.../worker/src/worker/statistics_utils.py | 109 +++++++++++++++++-
1 file changed, 107 insertions(+), 2 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index f2651bb091..ccb28ace6b 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.
+import datetime
import enum
import io
import logging
@@ -50,11 +51,12 @@ class ColumnType(str, enum.Enum):
STRING_TEXT = "string_text"
AUDIO = "audio"
IMAGE = "image"
+ DATETIME = "datetime"
class Histogram(TypedDict):
hist: list[int]
- bin_edges: list[Union[int, float]]
+ bin_edges: list[Union[int, float, str]]
class NumericalStatisticsItem(TypedDict):
@@ -68,6 +70,17 @@ class NumericalStatisticsItem(TypedDict):
histogram: Optional[Histogram]
+class DatetimeStatisticsItem(TypedDict):
+ nan_count: int
+ nan_proportion: float
+ min: Optional[str] # might be None in very rare cases when the whole column contains only None values
+ max: Optional[str]
+ mean: Optional[str]
+ median: Optional[str]
+ std: Optional[str] # string representation of timedelta
+ histogram: Optional[Histogram]
+
+
class CategoricalStatisticsItem(TypedDict):
nan_count: int
nan_proportion: float
@@ -83,7 +96,9 @@ class BoolStatisticsItem(TypedDict):
frequencies: dict[str, int]
-SupportedStatistics = Union[NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem]
+SupportedStatistics = Union[
+ NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem, DatetimeStatisticsItem
+]
class StatisticsPerColumnItem(TypedDict):
@@ -699,3 +714,93 @@ def get_shape(example: Optional[Union[bytes, dict[str, Any]]]) -> Union[tuple[No
@classmethod
def transform(cls, example: Optional[Union[bytes, dict[str, Any]]]) -> Optional[int]:
return cls.get_width(example)
+
+
+class DatetimeColumn(Column):
+ transform_column = IntColumn
+
+ @classmethod
+ def compute_transformed_data(
+ cls,
+ data: pl.DataFrame,
+ column_name: str,
+ transformed_column_name: str,
+ min_date: datetime.datetime,
+ ) -> pl.DataFrame:
+ return data.select((pl.col(column_name) - min_date).dt.total_seconds().alias(transformed_column_name))
+
+ @staticmethod
+ def shift_and_convert_to_string(min_date, seconds) -> str:
+ return datetime_to_string(min_date + datetime.timedelta(seconds=seconds))
+
+ @classmethod
+ def _compute_statistics(
+ cls,
+ data: pl.DataFrame,
+ column_name: str,
+ n_samples: int,
+ ) -> DatetimeStatisticsItem:
+ nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
+ if nan_count == n_samples: # all values are None
+ return DatetimeStatisticsItem(
+ nan_count=n_samples,
+ nan_proportion=1.0,
+ min=None,
+ max=None,
+ mean=None,
+ median=None,
+ std=None,
+ histogram=None,
+ )
+
+ min_date = data[column_name].min()
+ timedelta_column_name = f"{column_name}_timedelta"
+ # compute the distribution of time elapsed since the min date, in **seconds**
+ timedelta_df = cls.compute_transformed_data(data, column_name, timedelta_column_name, min_date)
+ timedelta_stats: NumericalStatisticsItem = cls.transform_column.compute_statistics(
+ timedelta_df,
+ column_name=timedelta_column_name,
+ n_samples=n_samples,
+ )
+ for stat in ("max", "mean", "median"):
+ timedelta_stats[stat] = cls.shift_and_convert_to_string(min_date, timedelta_stats[stat])
+
+ bin_edges = [
+ cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
+ ]
+
+ return DatetimeStatisticsItem(
+ nan_count=nan_count,
+ nan_proportion=nan_proportion,
+ min=datetime_to_string(min_date),
+ max=timedelta_stats["max"],
+ mean=timedelta_stats["mean"],
+ median=timedelta_stats["median"],
+ std=str(timedelta_stats["std"]),
+ histogram=Histogram(
+ hist=timedelta_stats["histogram"]["hist"],
+ bin_edges=bin_edges,
+ ),
+ )
+
+ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
+ stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
+ return StatisticsPerColumnItem(
+ column_name=self.name,
+ column_type=ColumnType.DATETIME,
+ column_statistics=stats,
+ )
+
+
+def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S") -> str:
+ """
+ Convert a datetime.datetime object to a string.
+
+ Args:
+ dt (datetime): The datetime object to convert.
+ format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S".
+
+ Returns:
+ str: The datetime object as a string.
+ """
+ return dt.strftime(format)
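The core move in this first patch is worth spelling out: instead of implementing datetime statistics from scratch, the column is converted to numeric offsets (seconds elapsed since the column minimum) so the existing IntColumn machinery can be reused, and the resulting numbers are shifted back into datetime strings. A minimal sketch of that round trip, with illustrative names, using only polars and the standard library:

    import datetime

    import polars as pl

    df = pl.DataFrame({"dt": [datetime.datetime(2024, 1, d) for d in (1, 2, 3)]})
    min_date = df["dt"].min()
    # forward transform: datetimes -> seconds elapsed since the column minimum
    seconds = df.select((pl.col("dt") - min_date).dt.total_seconds().alias("dt_timedelta"))
    print(seconds["dt_timedelta"].to_list())  # [0, 86400, 172800]
    # backward transform: a seconds offset -> a datetime string, as in the patch
    print((min_date + datetime.timedelta(seconds=86400)).strftime("%Y-%m-%d %H:%M:%S"))
    # -> 2024-01-02 00:00:00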
From 851ec1b434a586e92e04de90e3ad4967ca674bc2 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 31 Jul 2024 16:42:59 +0200
Subject: [PATCH 02/40] fix typing
---
.../worker/src/worker/statistics_utils.py | 43 +++++++++++--------
1 file changed, 26 insertions(+), 17 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index ccb28ace6b..dfd2599164 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -56,14 +56,19 @@ class ColumnType(str, enum.Enum):
class Histogram(TypedDict):
hist: list[int]
- bin_edges: list[Union[int, float, str]]
+ bin_edges: list[Union[int, float]]
+
+
+class DatetimeHistogram(TypedDict):
+ hist: list[int]
+ bin_edges: list[str] # edges are string representations of dates
class NumericalStatisticsItem(TypedDict):
nan_count: int
nan_proportion: float
- min: Optional[float] # might be None in very rare cases when the whole column is only None values
- max: Optional[float]
+ min: Optional[Union[int, float]] # might be None in very rare cases when the whole column contains only None values
+ max: Optional[Union[int, float]]
mean: Optional[float]
median: Optional[float]
std: Optional[float]
@@ -78,7 +83,7 @@ class DatetimeStatisticsItem(TypedDict):
mean: Optional[str]
median: Optional[str]
std: Optional[str] # string representation of timedelta
- histogram: Optional[Histogram]
+ histogram: Optional[DatetimeHistogram]
class CategoricalStatisticsItem(TypedDict):
@@ -730,8 +735,8 @@ def compute_transformed_data(
return data.select((pl.col(column_name) - min_date).dt.total_seconds().alias(transformed_column_name))
@staticmethod
- def shift_and_convert_to_string(min_date, seconds) -> str:
- return datetime_to_string(min_date + datetime.timedelta(seconds=seconds))
+ def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str:
+ return datetime_to_string(base_date + datetime.timedelta(seconds=seconds))
@classmethod
def _compute_statistics(
@@ -753,7 +758,7 @@ def _compute_statistics(
histogram=None,
)
- min_date = data[column_name].min()
+ min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly
timedelta_column_name = f"{column_name}_timedelta"
# compute the distribution of time elapsed since the min date, in **seconds**
timedelta_df = cls.compute_transformed_data(data, column_name, timedelta_column_name, min_date)
@@ -762,10 +767,14 @@ def _compute_statistics(
column_name=timedelta_column_name,
n_samples=n_samples,
)
- for stat in ("max", "mean", "median"):
- timedelta_stats[stat] = cls.shift_and_convert_to_string(min_date, timedelta_stats[stat])
-
- bin_edges = [
+ # to assure mypy that these values are not None before passing them to conversion functions:
+ assert timedelta_stats["histogram"] is not None
+ assert timedelta_stats["max"] is not None
+ assert timedelta_stats["mean"] is not None
+ assert timedelta_stats["median"] is not None
+ assert timedelta_stats["std"] is not None
+
+ datetime_bin_edges = [
cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
]
@@ -773,13 +782,13 @@ def _compute_statistics(
nan_count=nan_count,
nan_proportion=nan_proportion,
min=datetime_to_string(min_date),
- max=timedelta_stats["max"],
- mean=timedelta_stats["mean"],
- median=timedelta_stats["median"],
- std=str(timedelta_stats["std"]),
- histogram=Histogram(
+ max=cls.shift_and_convert_to_string(min_date, timedelta_stats["max"]),
+ mean=cls.shift_and_convert_to_string(min_date, timedelta_stats["mean"]),
+ median=cls.shift_and_convert_to_string(min_date, timedelta_stats["median"]),
+ std=str(datetime.timedelta(seconds=timedelta_stats["std"])),
+ histogram=DatetimeHistogram(
hist=timedelta_stats["histogram"]["hist"],
- bin_edges=bin_edges,
+ bin_edges=datetime_bin_edges,
),
)
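One subtlety this typing pass makes visible: max, mean and median of the seconds offsets are points in time, so they are shifted back from min_date, while std is a spread and is rendered directly as a timedelta string. A hedged illustration, using the sample std of the eleven daily timestamps from the fixture introduced in the next patch:

    import datetime

    import pandas as pd

    timedeltas = pd.Series(range(0, 11 * 86400, 86400))  # 0, 1, ..., 10 days, in seconds
    std_seconds = timedeltas.std()  # sample std (ddof=1), about 286556.38
    print(str(datetime.timedelta(seconds=std_seconds)))  # 3 days, 7:35:56.38...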
From 3347c134fa2d062a9d4e4844f14118758d428838 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 1 Aug 2024 17:11:12 +0200
Subject: [PATCH 03/40] add testcase
---
services/worker/tests/fixtures/datasets.py | 2 +
.../tests/fixtures/statistics_dataset.py | 25 ++++++++
.../worker/tests/test_statistics_utils.py | 57 ++++++++++++++++++-
3 files changed, 83 insertions(+), 1 deletion(-)
diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py
index 77e41e2ae4..2b471a9861 100644
--- a/services/worker/tests/fixtures/datasets.py
+++ b/services/worker/tests/fixtures/datasets.py
@@ -28,6 +28,7 @@
from .statistics_dataset import (
audio_dataset,
+ datetime_dataset,
image_dataset,
null_column,
statistics_dataset,
@@ -238,4 +239,5 @@ def datasets() -> Mapping[str, Dataset]:
"descriptive_statistics_not_supported": statistics_not_supported_dataset,
"audio_statistics": audio_dataset,
"image_statistics": image_dataset,
+ "datetime_statistics": datetime_dataset,
}
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index f32e404131..7d60fd100c 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.
+from datetime import datetime
from pathlib import Path
from typing import Optional
@@ -1698,3 +1699,27 @@ def null_column(n_samples: int) -> list[None]:
}
),
)
+
+
+datetime_dataset = Dataset.from_dict(
+ {
+ "datetime": [
+ datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ ]
+ },
+ features=Features(
+ {
+ "datetime": Value("timestamp[s]"),
+ }
+ ),
+)
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 80f41f317f..29abdfb3eb 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.
+import datetime
from collections.abc import Mapping
-from typing import Optional, Union
+from typing import Any, Optional, Union
import numpy as np
import pandas as pd
@@ -22,6 +23,7 @@
BoolColumn,
ClassLabelColumn,
ColumnType,
+ DatetimeColumn,
FloatColumn,
ImageColumn,
IntColumn,
@@ -470,3 +472,56 @@ def test_image_statistics(
n_samples=4,
)
assert computed == expected
+
+
+def count_expected_statistics_for_datetime() -> dict[str, Any]:
+ seconds_in_day = 24 * 60 * 60
+ timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+ std = timedeltas.std()
+ std_str = str(datetime.timedelta(seconds=std))
+ std_str = std_str.split(".")[0] # check precision up to seconds
+ return {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": "2024-01-01 00:00:00",
+ "max": "2024-01-11 00:00:00",
+ "mean": "2024-01-06 00:00:00",
+ "median": "2024-01-06 00:00:00",
+ "std": std_str,
+ "histogram": {
+ "hist": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+ "bin_edges": [
+ "2024-01-01 00:00:00",
+ "2024-01-02 00:00:01",
+ "2024-01-03 00:00:02",
+ "2024-01-04 00:00:03",
+ "2024-01-05 00:00:04",
+ "2024-01-06 00:00:05",
+ "2024-01-07 00:00:06",
+ "2024-01-08 00:00:07",
+ "2024-01-09 00:00:08",
+ "2024-01-10 00:00:09",
+ "2024-01-11 00:00:00",
+ ],
+ },
+ }
+
+
+@pytest.mark.parametrize(
+ "column_name",
+ ["datetime_column"],
+)
+def test_datetime_statistics(
+ column_name: str,
+ datasets: Mapping[str, Dataset],
+) -> None:
+ column_name = "datetime"
+ expected = count_expected_statistics_for_datetime()
+ data = datasets["datetime_statistics"].to_pandas()
+ computed = DatetimeColumn.compute_statistics(
+ data=pl.from_pandas(data),
+ column_name=column_name,
+ n_samples=len(data[column_name]),
+ )
+ assert computed.pop("std").split(".")[0] == expected.pop("std")
+ assert computed == expected
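A note on the expected bin edges above, which drift by one extra second per edge ("2024-01-02 00:00:01", "2024-01-03 00:00:02", ...): this is consistent with the integer histogram rounding its bin size up and clamping the last edge to the maximum, which also explains why the first bin holds two samples. A hedged reconstruction of the arithmetic (the rounding rule is an assumption, not taken from this patch):

    import math

    min_s, max_s, n_bins = 0, 10 * 86400, 10  # the 11 daily timestamps span 864000 seconds
    bin_size = math.ceil((max_s - min_s + 1) / n_bins)  # 86401, one second more than a day
    edges = [min(min_s + i * bin_size, max_s) for i in range(n_bins + 1)]
    print(edges[:3], edges[-1])  # [0, 86401, 172802] 864000
    # day 2 sits at 86400 s, just below the first edge at 86401 s, so the
    # first bin contains two samples and hist starts with [2, 1, 1, ...]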
From 0340b54c25bd47d2af51b4eaf553139c8023fe1b Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 5 Aug 2024 15:37:00 +0200
Subject: [PATCH 04/40] moar tests: column with nulls and all nulls column
---
.../worker/src/worker/statistics_utils.py | 4 +-
.../tests/fixtures/statistics_dataset.py | 18 +++-
.../worker/tests/test_statistics_utils.py | 89 +++++++++++++------
3 files changed, 80 insertions(+), 31 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index dfd2599164..9b4a2302b2 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -801,13 +801,13 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
)
-def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S") -> str:
+def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
"""
Convert a datetime.datetime object to a string.
Args:
dt (datetime): The datetime object to convert.
- format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S".
+ format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S%z".
Returns:
str: The datetime object as a string.
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index 7d60fd100c..c00c63afc5 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1715,11 +1715,27 @@ def null_column(n_samples: int) -> list[None]:
datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
- ]
+ ],
+ "datetime_null": [
+ datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ None,
+ datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ None,
+ datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ None,
+ datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ None,
+ datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ None,
+ datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ ],
+ "datetime_all_null": [None] * 11,
},
features=Features(
{
"datetime": Value("timestamp[s]"),
+ "datetime_null": Value("timestamp[s]"),
+ "datetime_all_null": Value("timestamp[s]"),
}
),
)
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 29abdfb3eb..84eee81448 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -2,7 +2,7 @@
# Copyright 2024 The HuggingFace Authors.
import datetime
from collections.abc import Mapping
-from typing import Any, Optional, Union
+from typing import Optional, Union
import numpy as np
import pandas as pd
@@ -474,54 +474,87 @@ def test_image_statistics(
assert computed == expected
-def count_expected_statistics_for_datetime() -> dict[str, Any]:
+def count_expected_statistics_for_datetime(column: pd.Series, column_name: str) -> dict: # type: ignore
+ n_samples = column.shape[0]
+ nan_count = column.isna().sum()
+ if nan_count == n_samples:
+ return {
+ "nan_count": n_samples,
+ "nan_proportion": 1.0,
+ "min": None,
+ "max": None,
+ "mean": None,
+ "median": None,
+ "std": None,
+ "histogram": None,
+ }
+
+ # hardcode expected values
+ minv = "2024-01-01 00:00:00"
+ maxv = "2024-01-11 00:00:00"
+ mean = "2024-01-06 00:00:00"
+ median = "2024-01-06 00:00:00"
+ bin_edges = [
+ "2024-01-01 00:00:00",
+ "2024-01-02 00:00:01",
+ "2024-01-03 00:00:02",
+ "2024-01-04 00:00:03",
+ "2024-01-05 00:00:04",
+ "2024-01-06 00:00:05",
+ "2024-01-07 00:00:06",
+ "2024-01-08 00:00:07",
+ "2024-01-09 00:00:08",
+ "2024-01-10 00:00:09",
+ "2024-01-11 00:00:00",
+ ]
+
+ # compute std
seconds_in_day = 24 * 60 * 60
- timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+ if column_name == "datetime":
+ timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+ hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+ elif column_name == "datetime_null":
+ timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day)) # take every second day
+ hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+ else:
+ raise ValueError("Incorrect column")
+
std = timedeltas.std()
std_str = str(datetime.timedelta(seconds=std))
- std_str = std_str.split(".")[0] # check precision up to seconds
+
return {
- "nan_count": 0,
- "nan_proportion": 0.0,
- "min": "2024-01-01 00:00:00",
- "max": "2024-01-11 00:00:00",
- "mean": "2024-01-06 00:00:00",
- "median": "2024-01-06 00:00:00",
+ "nan_count": nan_count,
+ "nan_proportion": np.round(nan_count / n_samples, DECIMALS).item() if nan_count else 0.0,
+ "min": minv,
+ "max": maxv,
+ "mean": mean,
+ "median": median,
"std": std_str,
"histogram": {
- "hist": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
- "bin_edges": [
- "2024-01-01 00:00:00",
- "2024-01-02 00:00:01",
- "2024-01-03 00:00:02",
- "2024-01-04 00:00:03",
- "2024-01-05 00:00:04",
- "2024-01-06 00:00:05",
- "2024-01-07 00:00:06",
- "2024-01-08 00:00:07",
- "2024-01-09 00:00:08",
- "2024-01-10 00:00:09",
- "2024-01-11 00:00:00",
- ],
+ "hist": hist,
+ "bin_edges": bin_edges,
},
}
@pytest.mark.parametrize(
"column_name",
- ["datetime_column"],
+ ["datetime", "datetime_null", "datetime_all_null"],
)
def test_datetime_statistics(
column_name: str,
datasets: Mapping[str, Dataset],
) -> None:
- column_name = "datetime"
- expected = count_expected_statistics_for_datetime()
data = datasets["datetime_statistics"].to_pandas()
+ expected = count_expected_statistics_for_datetime(data[column_name], column_name)
computed = DatetimeColumn.compute_statistics(
data=pl.from_pandas(data),
column_name=column_name,
n_samples=len(data[column_name]),
)
- assert computed.pop("std").split(".")[0] == expected.pop("std")
+ computed_std, expected_std = computed.pop("std"), expected.pop("std")
+ if computed_std:
+ assert computed_std.split(".")[0] == expected_std.split(".")[0] # check with precision up to seconds
+ else:
+ assert computed_std == expected_std
assert computed == expected
From 434b2d8a0d487425d0e8078f9ff3c9392de69a3c Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 8 Aug 2024 14:07:01 +0200
Subject: [PATCH 05/40] add datetime to worker
---
.../job_runners/split/descriptive_statistics.py | 15 ++++++++++++++-
services/worker/src/worker/statistics_utils.py | 11 ++++++-----
2 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
index ed8bb6aa17..06d485d5bb 100644
--- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py
+++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
@@ -32,6 +32,7 @@
from worker.dtos import CompleteJobResult
from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache
from worker.statistics_utils import (
+ DATETIME_DTYPES,
FLOAT_DTYPES,
INTEGER_DTYPES,
NUMERICAL_DTYPES,
@@ -39,6 +40,7 @@
AudioColumn,
BoolColumn,
ClassLabelColumn,
+ DatetimeColumn,
FloatColumn,
ImageColumn,
IntColumn,
@@ -57,7 +59,15 @@ class SplitDescriptiveStatisticsResponse(TypedDict):
SupportedColumns = Union[
- ClassLabelColumn, IntColumn, FloatColumn, StringColumn, BoolColumn, ListColumn, AudioColumn, ImageColumn
+ ClassLabelColumn,
+ IntColumn,
+ FloatColumn,
+ StringColumn,
+ BoolColumn,
+ ListColumn,
+ AudioColumn,
+ ImageColumn,
+ DatetimeColumn,
]
@@ -238,6 +248,9 @@ def _column_from_feature(
if dataset_feature.get("dtype") == "bool":
return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)
+
+ if dataset_feature.get("dtype") in DATETIME_DTYPES:
+ return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
return None
columns: list[SupportedColumns] = []
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 9b4a2302b2..23e80ab775 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -39,6 +39,7 @@
FLOAT_DTYPES = ["float16", "float32", "float64"]
NUMERICAL_DTYPES = INTEGER_DTYPES + FLOAT_DTYPES
STRING_DTYPES = ["string", "large_string"]
+DATETIME_DTYPES = ["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"]
class ColumnType(str, enum.Enum):
@@ -768,11 +769,11 @@ def _compute_statistics(
n_samples=n_samples,
)
# to assure mypy that these values are not None before passing them to conversion functions:
- assert timedelta_stats["histogram"] is not None
- assert timedelta_stats["max"] is not None
- assert timedelta_stats["mean"] is not None
- assert timedelta_stats["median"] is not None
- assert timedelta_stats["std"] is not None
+ assert timedelta_stats["histogram"] is not None # nosec
+ assert timedelta_stats["max"] is not None # nosec
+ assert timedelta_stats["mean"] is not None # nosec
+ assert timedelta_stats["median"] is not None # nosec
+ assert timedelta_stats["std"] is not None # nosec
datetime_bin_edges = [
cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
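For context on the dispatch added here: a dataset feature reaches `_column_from_feature` as a plain dict, so timestamp columns are recognized by matching the dtype string against DATETIME_DTYPES. A minimal sketch of that check (the feature dict mirrors what Value("timestamp[s]") serializes to; the surrounding function is simplified):

    DATETIME_DTYPES = ["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"]

    feature = {"_type": "Value", "dtype": "timestamp[s]"}  # as produced by Value("timestamp[s]")
    if feature.get("_type") == "Value" and feature.get("dtype") in DATETIME_DTYPES:
        print("dispatch to DatetimeColumn")

Note that this exact-match list cannot see timezone-aware dtypes such as "timestamp[s, tz=+02:00]", which is what patch 07 addresses by switching to dtype.startswith("timestamp").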
From 260458758c3f64c7af1bd17eef7475444a3414ce Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 8 Aug 2024 14:07:12 +0200
Subject: [PATCH 06/40] add test
---
services/worker/tests/fixtures/hub.py | 20 ++++++
.../split/test_descriptive_statistics.py | 67 ++++++++++++++-----
.../worker/tests/test_statistics_utils.py | 4 +-
3 files changed, 71 insertions(+), 20 deletions(-)
diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py
index 62046b66c7..d5b890dbcc 100644
--- a/services/worker/tests/fixtures/hub.py
+++ b/services/worker/tests/fixtures/hub.py
@@ -354,6 +354,13 @@ def hub_public_image_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str
delete_hub_dataset_repo(repo_id=repo_id)
+@pytest.fixture(scope="session")
+def hub_public_datetime_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str]:
+ repo_id = create_hub_dataset_repo(prefix="datetime_statistics", dataset=datasets["datetime_statistics"])
+ yield repo_id
+ delete_hub_dataset_repo(repo_id=repo_id)
+
+
@pytest.fixture(scope="session")
def hub_public_n_configs_with_default(datasets: Mapping[str, Dataset]) -> Iterator[str]:
default_config_name, _ = get_default_config_split()
@@ -1207,6 +1214,19 @@ def hub_responses_image_statistics(
}
+@pytest.fixture
+def hub_responses_datetime_statistics(
+ hub_public_datetime_statistics: str,
+) -> HubDatasetTest:
+ return {
+ "name": hub_public_datetime_statistics,
+ "config_names_response": create_config_names_response(hub_public_datetime_statistics),
+ "splits_response": create_splits_response(hub_public_datetime_statistics),
+ "first_rows_response": None,
+ "parquet_and_info_response": None,
+ }
+
+
@pytest.fixture
def hub_responses_descriptive_statistics_parquet_builder(
hub_public_descriptive_statistics_parquet_builder: str,
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 54f1f53954..a95932d67a 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -3,7 +3,7 @@
from collections.abc import Callable, Mapping
from dataclasses import replace
from http import HTTPStatus
-from typing import Optional
+from typing import Any, Optional
import pandas as pd
import polars as pl
@@ -30,6 +30,7 @@
from ...test_statistics_utils import (
count_expected_statistics_for_bool_column,
count_expected_statistics_for_categorical_column,
+ count_expected_statistics_for_datetime_column,
count_expected_statistics_for_list_column,
count_expected_statistics_for_numerical_column,
count_expected_statistics_for_string_column,
@@ -215,7 +216,7 @@ def _get_job_runner(
@pytest.fixture
-def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict: # type: ignore
+def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
ds = datasets["descriptive_statistics"]
df = ds.to_pandas()
expected_statistics = {}
@@ -253,7 +254,7 @@ def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict: #
@pytest.fixture
-def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset]) -> dict: # type: ignore
+def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
ds = datasets["descriptive_statistics_string_text"]
df = ds.to_pandas()
expected_statistics = {}
@@ -270,7 +271,7 @@ def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset])
@pytest.fixture
-def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, Dataset]) -> dict: # type: ignore
+def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
ds = datasets["descriptive_statistics_string_text"]
df = ds.to_pandas()[:50] # see `fixtures.hub.hub_public_descriptive_statistics_parquet_builder`
expected_statistics = {}
@@ -287,7 +288,7 @@ def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, D
@pytest.fixture
-def audio_statistics_expected() -> dict: # type: ignore
+def audio_statistics_expected() -> dict[str, Any]:
column_names_to_durations = [
("audio", [1.0, 2.0, 3.0, 4.0]), # datasets consists of 4 audio files of 1, 2, 3, 4 seconds lengths
("audio_null", [1.0, None, 3.0, None]), # take first and third audio file for this testcase
@@ -312,7 +313,7 @@ def audio_statistics_expected() -> dict: # type: ignore
@pytest.fixture
-def image_statistics_expected() -> dict: # type: ignore
+def image_statistics_expected() -> dict[str, Any]:
column_names_to_widths = [
("image", [640, 1440, 520, 1240]), # datasets consists of 4 image files
("image_null", [640, None, 520, None]), # take first and third image file for this testcase
@@ -334,6 +335,21 @@ def image_statistics_expected() -> dict: # type: ignore
}
+@pytest.fixture
+def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
+ ds = datasets["datetime_statistics"]
+ df = ds.to_pandas()
+ expected_statistics = {}
+ for column_name in df.columns:
+ statistics = count_expected_statistics_for_datetime_column(column=df[column_name], column_name=column_name)
+ expected_statistics[column_name] = {
+ "column_name": column_name,
+ "column_type": ColumnType.DATETIME,
+ "column_statistics": statistics,
+ }
+ return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False}
+
+
@pytest.fixture
def struct_thread_panic_error_parquet_file(tmp_path_factory: pytest.TempPathFactory) -> str:
repo_id = "__DUMMY_TRANSFORMERS_USER__/test_polars_panic_error"
@@ -369,13 +385,14 @@ def test_polars_struct_thread_panic_error(struct_thread_panic_error_parquet_file
@pytest.mark.parametrize(
"hub_dataset_name,expected_error_code",
[
- ("descriptive_statistics", None),
- ("descriptive_statistics_string_text", None),
- ("descriptive_statistics_string_text_partial", None),
- ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
- ("audio_statistics", None),
- ("image_statistics", None),
- ("gated", None),
+ # ("descriptive_statistics", None),
+ # ("descriptive_statistics_string_text", None),
+ # ("descriptive_statistics_string_text_partial", None),
+ # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+ # ("audio_statistics", None),
+ # ("image_statistics", None),
+ ("datetime_statistics", None),
+ # ("gated", None),
],
)
def test_compute(
@@ -391,13 +408,15 @@ def test_compute(
hub_responses_descriptive_statistics_not_supported: HubDatasetTest,
hub_responses_audio_statistics: HubDatasetTest,
hub_responses_image_statistics: HubDatasetTest,
+ hub_responses_datetime_statistics: HubDatasetTest,
hub_dataset_name: str,
expected_error_code: Optional[str],
- descriptive_statistics_expected: dict, # type: ignore
- descriptive_statistics_string_text_expected: dict, # type: ignore
- descriptive_statistics_string_text_partial_expected: dict, # type: ignore
- audio_statistics_expected: dict, # type: ignore
- image_statistics_expected: dict, # type: ignore
+ descriptive_statistics_expected: dict[str, Any],
+ descriptive_statistics_string_text_expected: dict[str, Any],
+ descriptive_statistics_string_text_partial_expected: dict[str, Any],
+ audio_statistics_expected: dict[str, Any],
+ image_statistics_expected: dict[str, Any],
+ datetime_statistics_expected: dict[str, Any],
) -> None:
hub_datasets = {
"descriptive_statistics": hub_responses_descriptive_statistics,
@@ -407,6 +426,7 @@ def test_compute(
"gated": hub_responses_gated_descriptive_statistics,
"audio_statistics": hub_responses_audio_statistics,
"image_statistics": hub_responses_image_statistics,
+ "datetime_statistics": hub_responses_datetime_statistics,
}
expected = {
"descriptive_statistics": descriptive_statistics_expected,
@@ -416,6 +436,7 @@ def test_compute(
"descriptive_statistics_string_text_partial": descriptive_statistics_string_text_partial_expected,
"audio_statistics": audio_statistics_expected,
"image_statistics": image_statistics_expected,
+ "datetime_statistics": datetime_statistics_expected,
}
dataset = hub_datasets[hub_dataset_name]["name"]
splits_response = hub_datasets[hub_dataset_name]["splits_response"]
@@ -534,5 +555,15 @@ def test_compute(
column_response_stats.pop("nan_proportion")
) == expected_column_response_stats.pop("nan_proportion")
assert column_response_stats == expected_column_response_stats
+ elif column_response["column_type"] is ColumnType.DATETIME:
+ std, expected_std = (
+ column_response_stats.pop("std"),
+ expected_column_response_stats.pop("std"),
+ )
+ if std:
+ assert std.split(".")[0] == expected_std.split(".")[0]
+ else:
+ assert std == expected_std
+ assert column_response_stats == expected_column_response_stats
else:
raise ValueError("Incorrect data type")
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 84eee81448..377cb47c86 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -474,7 +474,7 @@ def test_image_statistics(
assert computed == expected
-def count_expected_statistics_for_datetime(column: pd.Series, column_name: str) -> dict: # type: ignore
+def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict: # type: ignore
n_samples = column.shape[0]
nan_count = column.isna().sum()
if nan_count == n_samples:
@@ -546,7 +546,7 @@ def test_datetime_statistics(
datasets: Mapping[str, Dataset],
) -> None:
data = datasets["datetime_statistics"].to_pandas()
- expected = count_expected_statistics_for_datetime(data[column_name], column_name)
+ expected = count_expected_statistics_for_datetime_column(data[column_name], column_name)
computed = DatetimeColumn.compute_statistics(
data=pl.from_pandas(data),
column_name=column_name,
From 913f812f472e30ca1eca102ac0eaa5eecb7814b3 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 9 Aug 2024 13:23:39 +0200
Subject: [PATCH 07/40] include timezone-aware datetimes
---
.../split/descriptive_statistics.py | 23 ++++++++++---------
.../worker/src/worker/statistics_utils.py | 1 -
.../tests/fixtures/statistics_dataset.py | 14 +++++++++++
.../split/test_descriptive_statistics.py | 14 +++++------
.../worker/tests/test_statistics_utils.py | 7 ++++--
5 files changed, 38 insertions(+), 21 deletions(-)
diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
index 06d485d5bb..3c3886d703 100644
--- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py
+++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
@@ -32,7 +32,6 @@
from worker.dtos import CompleteJobResult
from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache
from worker.statistics_utils import (
- DATETIME_DTYPES,
FLOAT_DTYPES,
INTEGER_DTYPES,
NUMERICAL_DTYPES,
@@ -225,31 +224,33 @@ def _column_from_feature(
return ListColumn(feature_name=dataset_feature_name, n_samples=num_examples)
if isinstance(dataset_feature, dict):
- if dataset_feature.get("_type") == "ClassLabel":
+ _type = dataset_feature.get("_type")
+ if _type == "ClassLabel":
return ClassLabelColumn(
feature_name=dataset_feature_name, n_samples=num_examples, feature_dict=dataset_feature
)
- if dataset_feature.get("_type") == "Audio":
+ if _type == "Audio":
return AudioColumn(feature_name=dataset_feature_name, n_samples=num_examples)
- if dataset_feature.get("_type") == "Image":
+ if _type == "Image":
return ImageColumn(feature_name=dataset_feature_name, n_samples=num_examples)
- if dataset_feature.get("_type") == "Value":
- if dataset_feature.get("dtype") in INTEGER_DTYPES:
+ if _type == "Value":
+ dtype = dataset_feature.get("dtype", "")
+ if dtype in INTEGER_DTYPES:
return IntColumn(feature_name=dataset_feature_name, n_samples=num_examples)
- if dataset_feature.get("dtype") in FLOAT_DTYPES:
+ if dtype in FLOAT_DTYPES:
return FloatColumn(feature_name=dataset_feature_name, n_samples=num_examples)
- if dataset_feature.get("dtype") in STRING_DTYPES:
+ if dtype in STRING_DTYPES:
return StringColumn(feature_name=dataset_feature_name, n_samples=num_examples)
- if dataset_feature.get("dtype") == "bool":
+ if dtype == "bool":
return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)
- if dataset_feature.get("dtype") in DATETIME_DTYPES:
+ if dtype.startswith("timestamp"):
return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
return None
@@ -262,7 +263,7 @@ def _column_from_feature(
if not columns:
raise NoSupportedFeaturesError(
"No columns for statistics computation found. Currently supported feature types are: "
- f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, list/Sequence and bool. "
+ f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, Image, Audio, list/Sequence, datetime and bool. "
)
column_names_str = ", ".join([column.name for column in columns])
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 23e80ab775..d514cec931 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -39,7 +39,6 @@
FLOAT_DTYPES = ["float16", "float32", "float64"]
NUMERICAL_DTYPES = INTEGER_DTYPES + FLOAT_DTYPES
STRING_DTYPES = ["string", "large_string"]
-DATETIME_DTYPES = ["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"]
class ColumnType(str, enum.Enum):
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index c00c63afc5..c233e61639 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1716,6 +1716,19 @@ def null_column(n_samples: int) -> list[None]:
datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
],
+ "datetime_tz": [
+ datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-03 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-04 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-05 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-06 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-07 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-08 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-09 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-10 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ datetime.strptime("2024-01-11 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+ ],
"datetime_null": [
datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
None,
@@ -1734,6 +1747,7 @@ def null_column(n_samples: int) -> list[None]:
features=Features(
{
"datetime": Value("timestamp[s]"),
+ "datetime_tz": Value("timestamp[s, tz=+02:00]"),
"datetime_null": Value("timestamp[s]"),
"datetime_all_null": Value("timestamp[s]"),
}
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index a95932d67a..7cdd785def 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -385,14 +385,14 @@ def test_polars_struct_thread_panic_error(struct_thread_panic_error_parquet_file
@pytest.mark.parametrize(
"hub_dataset_name,expected_error_code",
[
- # ("descriptive_statistics", None),
- # ("descriptive_statistics_string_text", None),
- # ("descriptive_statistics_string_text_partial", None),
- # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
- # ("audio_statistics", None),
- # ("image_statistics", None),
+ ("descriptive_statistics", None),
+ ("descriptive_statistics_string_text", None),
+ ("descriptive_statistics_string_text_partial", None),
+ ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+ ("audio_statistics", None),
+ ("image_statistics", None),
("datetime_statistics", None),
- # ("gated", None),
+ ("gated", None),
],
)
def test_compute(
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 377cb47c86..1a34fadce1 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -507,10 +507,13 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
"2024-01-10 00:00:09",
"2024-01-11 00:00:00",
]
+ if column_name == "datetime_tz":
+ bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges]
+ minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200"
# compute std
seconds_in_day = 24 * 60 * 60
- if column_name == "datetime":
+ if column_name in ["datetime", "datetime_tz"]:
timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
elif column_name == "datetime_null":
@@ -539,7 +542,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
@pytest.mark.parametrize(
"column_name",
- ["datetime", "datetime_null", "datetime_all_null"],
+ ["datetime", "datetime_tz", "datetime_null", "datetime_all_null"],
)
def test_datetime_statistics(
column_name: str,
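Two details of the timezone-aware support are easy to miss: Arrow dtype strings embed the offset, so the exact-match list from patch 05 never fires for them, and the %z directive added to the default output format in patch 04 is what carries the offset through parsing and printing. A small demonstration:

    from datetime import datetime

    # tz-aware Arrow dtype strings embed the offset, so only a prefix match works
    print("timestamp[s, tz=+02:00]" in ["timestamp[s]", "timestamp[ms]"])  # False
    print("timestamp[s, tz=+02:00]".startswith("timestamp"))  # True

    # %z round-trips the UTC offset for tz-aware values (and prints nothing for naive ones)
    dt = datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z")
    print(dt.strftime("%Y-%m-%d %H:%M:%S%z"))  # 2024-01-01 00:00:00+0200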
From d51739356a2834ef2df49fb8f8ae86dd1c9561e6 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 14 Oct 2024 15:10:19 +0200
Subject: [PATCH 08/40] refactor
---
libs/libcommon/src/libcommon/utils.py | 14 +++++++
.../worker/src/worker/statistics_utils.py | 38 +++++++++++--------
.../split/test_descriptive_statistics.py | 1 -
.../worker/tests/test_statistics_utils.py | 2 +-
4 files changed, 37 insertions(+), 18 deletions(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index c85079b697..3a08ebf8d1 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -2,12 +2,14 @@
# Copyright 2022 The HuggingFace Authors.
import base64
+import datetime
import functools
import logging
import mimetypes
import time
from collections.abc import Callable, Sequence
from datetime import datetime, timedelta, timezone
+from dateutil import parser
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, Optional, TypeVar, Union, cast
@@ -93,6 +95,18 @@ def get_datetime(days: Optional[float] = None) -> datetime:
return date
+def is_datetime(string: str):
+ try:
+ parser.parse(string)
+ return True
+ except ValueError:
+ return False
+
+
+def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
+ return dt.strftime(format)
+
+
def get_duration(started_at: datetime) -> float:
"""
Get time in seconds that has passed from `started_at` until now.
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index d514cec931..28d340faa5 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -15,6 +15,7 @@
from libcommon.exceptions import (
StatisticsComputationError,
)
+from libcommon.utils import datetime_to_string, is_datetime
from PIL import Image
from tqdm.contrib.concurrent import thread_map
@@ -476,6 +477,13 @@ def is_class(n_unique: int, n_samples: int) -> bool:
n_unique / n_samples <= MAX_PROPORTION_STRING_LABELS and n_unique <= MAX_NUM_STRING_LABELS
) or n_unique <= NUM_BINS
+ @staticmethod
+ def is_datetime(data: pl.DataFrame, column_name: str) -> bool:
+ """Check if first 1000 non-null samples in a column match datetime format."""
+
+ values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()
+ return all(is_datetime(value) for value in values)
+
@classmethod
def compute_transformed_data(
cls,
@@ -493,7 +501,7 @@ def _compute_statistics(
data: pl.DataFrame,
column_name: str,
n_samples: int,
- ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem]:
+ ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
if cls.is_class(n_unique, n_samples):
@@ -509,6 +517,13 @@ def _compute_statistics(
n_unique=len(labels2counts),
frequencies=labels2counts,
)
+ if cls.is_datetime(data, column_name):
+ datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
+ data.select(pl.col(column_name).cast(pl.Datetime)),
+ column_name=column_name,
+ n_samples=n_samples,
+ )
+ return datetime_stats
lengths_column_name = f"{column_name}_len"
lengths_df = cls.compute_transformed_data(data, column_name, transformed_column_name=lengths_column_name)
@@ -519,7 +534,12 @@ def _compute_statistics(
def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
- string_type = ColumnType.STRING_LABEL if "frequencies" in stats else ColumnType.STRING_TEXT
+ if "frequencies" in stats:
+ string_type = ColumnType.STRING_LABEL
+ elif isinstance(stats["histogram"], DatetimeHistogram): # type: ignore
+ string_type = ColumnType.DATETIME
+ else:
+ string_type = ColumnType.STRING_TEXT
return StatisticsPerColumnItem(
column_name=self.name,
column_type=string_type,
@@ -799,17 +819,3 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
column_type=ColumnType.DATETIME,
column_statistics=stats,
)
-
-
-def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
- """
- Convert a datetime.datetime object to a string.
-
- Args:
- dt (datetime): The datetime object to convert.
- format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S%z".
-
- Returns:
- str: The datetime object as a string.
- """
- return dt.strftime(format)
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 14fb9dbf3a..4aa1c68900 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -347,7 +347,6 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False}
-
@pytest.mark.parametrize(
"hub_dataset_name,expected_error_code",
[
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 1a34fadce1..dc74d9a31c 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -517,7 +517,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
elif column_name == "datetime_null":
- timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day)) # take every second day
+ timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day)) # take every other day
hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
else:
raise ValueError("Incorrect column")
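The string-column detection this refactor adds leans on dateutil: a string column is treated as datetime when every sampled value parses. The helper added to libcommon.utils boils down to the following self-contained restatement:

    from dateutil import parser

    def is_datetime(string: str) -> bool:
        try:
            parser.parse(string)
            return True
        except ValueError:  # dateutil's ParserError subclasses ValueError
            return False

    print(is_datetime("2024-01-01 00:00:00"))  # True
    print(is_datetime("not a date"))  # False

One caveat: dateutil is permissive (a bare "10", for instance, parses as the tenth of the current month), so the check errs on the side of treating loosely date-like text as datetimes.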
From 7046d8b7d67d1926d2e3e41b80420395b1f0f647 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 14 Oct 2024 15:21:10 +0200
Subject: [PATCH 09/40] fix
---
libs/libcommon/src/libcommon/utils.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index 3a08ebf8d1..b81ff70fff 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -2,14 +2,12 @@
# Copyright 2022 The HuggingFace Authors.
import base64
-import datetime
import functools
import logging
import mimetypes
import time
from collections.abc import Callable, Sequence
from datetime import datetime, timedelta, timezone
-from dateutil import parser
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, Optional, TypeVar, Union, cast
@@ -17,6 +15,7 @@
import orjson
import pandas as pd
import pytz
+from dateutil import parser
from huggingface_hub import constants, hf_hub_download
from requests.exceptions import ReadTimeout
@@ -95,7 +94,7 @@ def get_datetime(days: Optional[float] = None) -> datetime:
return date
-def is_datetime(string: str):
+def is_datetime(string: str) -> bool:
try:
parser.parse(string)
return True
From 945dff0378043a9ae4ab79ef56fa82f1b8abab44 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 14 Oct 2024 15:37:04 +0200
Subject: [PATCH 10/40] do not typecheck dateutil
---
libs/libcommon/pyproject.toml | 1 +
1 file changed, 1 insertion(+)
diff --git a/libs/libcommon/pyproject.toml b/libs/libcommon/pyproject.toml
index 48cc7629bb..c6c0b9e679 100644
--- a/libs/libcommon/pyproject.toml
+++ b/libs/libcommon/pyproject.toml
@@ -76,6 +76,7 @@ module = [
"moto.*",
"aiobotocore.*",
"requests.*",
+ "dateutil.*"
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true
From bdec2e475b0bfc4c3a47ba3a6a1b2ca17cfa4d1d Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 23 Dec 2024 13:29:13 +0100
Subject: [PATCH 11/40] fix
---
services/worker/src/worker/statistics_utils.py | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 28d340faa5..d53597fff8 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -504,6 +504,14 @@ def _compute_statistics(
) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
+ if cls.is_datetime(data, column_name):
+ datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
+ data.select(pl.col(column_name).cast(pl.Datetime)),
+ column_name=column_name,
+ n_samples=n_samples,
+ )
+ return datetime_stats
+
if cls.is_class(n_unique, n_samples):
labels2counts: dict[str, int] = value_counts(data, column_name) if nan_count != n_samples else {}
logging.debug(f"{n_unique=} {nan_count=} {nan_proportion=} {labels2counts=}")
@@ -517,13 +525,6 @@ def _compute_statistics(
n_unique=len(labels2counts),
frequencies=labels2counts,
)
- if cls.is_datetime(data, column_name):
- datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
- data.select(pl.col(column_name).cast(pl.Datetime)),
- column_name=column_name,
- n_samples=n_samples,
- )
- return datetime_stats
lengths_column_name = f"{column_name}_len"
lengths_df = cls.compute_transformed_data(data, column_name, transformed_column_name=lengths_column_name)
@@ -536,7 +537,7 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
if "frequencies" in stats:
string_type = ColumnType.STRING_LABEL
- elif isinstance(stats["histogram"], DatetimeHistogram): # type: ignore
+ elif isinstance(stats["histogram"]["bin_edges"][0], str):
string_type = ColumnType.DATETIME
else:
string_type = ColumnType.STRING_TEXT
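The reorder in this patch matters because is_class fires on low-cardinality columns: a string column holding only a handful of distinct dates would satisfy is_class and be returned as STRING_LABEL before the datetime branch was ever reached. A hedged sketch of the predicate (the threshold values are assumptions for illustration, not taken from the source):

    MAX_PROPORTION_STRING_LABELS = 0.2  # assumed value, for illustration only
    MAX_NUM_STRING_LABELS = 1000  # assumed value, for illustration only
    NUM_BINS = 10  # assumed value, for illustration only

    def is_class(n_unique: int, n_samples: int) -> bool:
        return (
            n_unique / n_samples <= MAX_PROPORTION_STRING_LABELS and n_unique <= MAX_NUM_STRING_LABELS
        ) or n_unique <= NUM_BINS

    # ten distinct date strings over many rows: is_class is True, so checking
    # is_datetime first is what routes such a column to datetime statistics
    print(is_class(n_unique=10, n_samples=5000))  # True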
From f9ffe82d9a7f9ac018fa9aa436d783fe05115f1b Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 23 Dec 2024 13:29:22 +0100
Subject: [PATCH 12/40] more tests
---
.../tests/fixtures/statistics_dataset.py | 14 +++++++++++++
.../worker/tests/test_statistics_utils.py | 21 ++++++++++++-------
2 files changed, 28 insertions(+), 7 deletions(-)
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index c233e61639..acbec4858a 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1716,6 +1716,19 @@ def null_column(n_samples: int) -> list[None]:
datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
],
+ "datetime_string": [
+ "2024-01-01 00:00:00",
+ "2024-01-02 00:00:00",
+ "2024-01-03 00:00:00",
+ "2024-01-04 00:00:00",
+ "2024-01-05 00:00:00",
+ "2024-01-06 00:00:00",
+ "2024-01-07 00:00:00",
+ "2024-01-08 00:00:00",
+ "2024-01-09 00:00:00",
+ "2024-01-10 00:00:00",
+ "2024-01-11 00:00:00",
+ ],
"datetime_tz": [
datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
@@ -1747,6 +1760,7 @@ def null_column(n_samples: int) -> list[None]:
features=Features(
{
"datetime": Value("timestamp[s]"),
+ "datetime_string": Value("string"),
"datetime_tz": Value("timestamp[s, tz=+02:00]"),
"datetime_null": Value("timestamp[s]"),
"datetime_all_null": Value("timestamp[s]"),
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index dc74d9a31c..e2ff01f120 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -513,7 +513,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
# compute std
seconds_in_day = 24 * 60 * 60
- if column_name in ["datetime", "datetime_tz"]:
+ if column_name in ["datetime", "datetime_string", "datetime_tz"]:
timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
elif column_name == "datetime_null":
@@ -542,7 +542,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
@pytest.mark.parametrize(
"column_name",
- ["datetime", "datetime_tz", "datetime_null", "datetime_all_null"],
+ ["datetime", "datetime_string", "datetime_tz", "datetime_null", "datetime_all_null"],
)
def test_datetime_statistics(
column_name: str,
@@ -550,11 +550,18 @@ def test_datetime_statistics(
) -> None:
data = datasets["datetime_statistics"].to_pandas()
expected = count_expected_statistics_for_datetime_column(data[column_name], column_name)
- computed = DatetimeColumn.compute_statistics(
- data=pl.from_pandas(data),
- column_name=column_name,
- n_samples=len(data[column_name]),
- )
+ if column_name == "datetime_string":
+ computed = StringColumn.compute_statistics(
+ data=pl.from_pandas(data),
+ column_name=column_name,
+ n_samples=len(data[column_name]),
+ )
+ else:
+ computed = DatetimeColumn.compute_statistics(
+ data=pl.from_pandas(data),
+ column_name=column_name,
+ n_samples=len(data[column_name]),
+ )
computed_std, expected_std = computed.pop("std"), expected.pop("std")
if computed_std:
assert computed_std.split(".")[0] == expected_std.split(".")[0] # check with precision up to seconds
From d2c37c6b71f4b0f457353d1f0a87ad8a5033f35d Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 26 Dec 2024 14:02:56 +0100
Subject: [PATCH 13/40] fix string to datetime conversion: add format inference
---
libs/libcommon/src/libcommon/utils.py | 44 +++++++++++++++++++
.../worker/src/worker/statistics_utils.py | 25 ++++++++---
.../split/test_descriptive_statistics.py | 10 ++---
3 files changed, 68 insertions(+), 11 deletions(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index b81ff70fff..a89f49d980 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -106,6 +106,50 @@ def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str
return dt.strftime(format)
+def identify_datetime_format(datetime_string: str) -> Optional[str]:
+ # Common datetime formats
+ common_formats = [
+ "%Y-%m-%dT%H:%M:%S%z",
+ "%Y-%m-%dT%H:%M:%S",
+ "%Y-%m-%dT%H:%M:%S.%f",
+ "%Y-%m-%d %H:%M:%S%z",
+ "%Y-%m-%d %H:%M:%S",
+ "%Y-%m-%d %H:%M",
+ "%Y-%m-%d",
+ "%d-%m-%Y %H:%M:%S%z",
+ "%d-%m-%Y %H:%M:%S",
+ "%d-%m-%Y %H:%M",
+ "%d-%m-%Y",
+ "%m-%d-%Y %H:%M:%S%z",
+ "%m-%d-%Y %H:%M:%S",
+ "%m-%d-%Y %H:%M",
+ "%m-%d-%Y",
+
+ "%Y/%m/%d %H:%M:%S%z",
+ "%Y/%m/%d %H:%M:%S",
+ "%Y/%m/%d %H:%M",
+ "%Y/%m/%d",
+ "%d/%m/%Y %H:%M:%S%z",
+ "%d/%m/%Y %H:%M:%S",
+ "%d/%m/%Y %H:%M",
+ "%d/%m/%Y",
+ "%m/%d/%Y %H:%M:%S%z",
+ "%m/%d/%Y %H:%M:%S",
+ "%m/%d/%Y %H:%M",
+ "%m/%d/%Y",
+
+ "%B %d, %Y",
+ "%d %B %Y",
+ ]
+
+ for fmt in common_formats:
+ try:
+ datetime.strptime(datetime_string, fmt)
+ return fmt
+ except ValueError:
+ continue
+
+
def get_duration(started_at: datetime) -> float:
"""
Get time in seconds that has passed from `started_at` until now.
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index d53597fff8..cb5e19f8ca 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -15,7 +15,7 @@
from libcommon.exceptions import (
StatisticsComputationError,
)
-from libcommon.utils import datetime_to_string, is_datetime
+from libcommon.utils import datetime_to_string, identify_datetime_format, is_datetime
from PIL import Image
from tqdm.contrib.concurrent import thread_map
@@ -478,11 +478,19 @@ def is_class(n_unique: int, n_samples: int) -> bool:
) or n_unique <= NUM_BINS
@staticmethod
- def is_datetime(data: pl.DataFrame, column_name: str) -> bool:
- """Check if first 1000 non-null samples in a column match datetime format."""
+ def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[str]]:
+ """Check if first 1000 non-null samples in a column match datetime format. If true, also return datetime format"""
values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()
- return all(is_datetime(value) for value in values)
+ _is_datetime = all(is_datetime(value) for value in values)
+
+ if _is_datetime:
+ formats = [identify_datetime_format(value) for value in values]
+ if len(set(formats)) == 1:
+ return True, formats[0]
+ raise StatisticsComputationError("Multiple datetime formats detected.")
+
+ return False, None
@classmethod
def compute_transformed_data(
@@ -504,11 +512,13 @@ def _compute_statistics(
) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
- if cls.is_datetime(data, column_name):
+ _is_datetime, datetime_format = cls.is_datetime(data, column_name)
+ if _is_datetime:
datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
- data.select(pl.col(column_name).cast(pl.Datetime)),
+ data,
column_name=column_name,
n_samples=n_samples,
+ format=datetime_format,
)
return datetime_stats
@@ -765,6 +775,7 @@ def _compute_statistics(
data: pl.DataFrame,
column_name: str,
n_samples: int,
+ format: Optional[str] = None,
) -> DatetimeStatisticsItem:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
if nan_count == n_samples: # all values are None
@@ -778,6 +789,8 @@ def _compute_statistics(
std=None,
histogram=None,
)
+ if isinstance(data[column_name].dtype, pl.String):
+ data = data.with_columns(pl.col(column_name).str.to_datetime(format=format))
min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly
timedelta_column_name = f"{column_name}_timedelta"
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 4aa1c68900..20ca8b369f 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -353,11 +353,11 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
("descriptive_statistics", None),
("descriptive_statistics_string_text", None),
("descriptive_statistics_string_text_partial", None),
- ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
- ("audio_statistics", None),
- ("image_statistics", None),
- ("datetime_statistics", None),
- ("gated", None),
+ # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+ # ("audio_statistics", None),
+ # ("image_statistics", None),
+ # ("datetime_statistics", None),
+ # ("gated", None),
],
)
def test_compute(
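The trial-parsing approach above, in isolation (the helper name and the shortened candidate list below are illustrative, not the library's API): try each candidate format until `datetime.strptime` succeeds, and report no match otherwise. Candidate order matters for ambiguous strings such as `01/02/2024`, which satisfies both day-first and month-first formats.

```python
from datetime import datetime
from typing import Optional

CANDIDATES = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y"]

def first_matching_format(value: str) -> Optional[str]:
    for fmt in CANDIDATES:
        try:
            datetime.strptime(value, fmt)  # raises ValueError if the format does not match
            return fmt
        except ValueError:
            continue
    return None  # no known format matched

assert first_matching_format("2024-01-11 00:00:00") == "%Y-%m-%d %H:%M:%S"
assert first_matching_format("01/02/2024") == "%d/%m/%Y"  # day-first wins by list order
assert first_matching_format("not a date") is None
```

Note that the function as added in this patch falls through without an explicit `return`, which Python treats as returning `None`; the next patch makes that explicit.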
From 658719e1fc585f55dbefd0d64e840144f5708fb7 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 26 Dec 2024 14:45:02 +0100
Subject: [PATCH 14/40] fix style
---
libs/libcommon/src/libcommon/utils.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index a89f49d980..90c181ae21 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -124,7 +124,6 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
"%m-%d-%Y %H:%M:%S",
"%m-%d-%Y %H:%M",
"%m-%d-%Y",
-
"%Y/%m/%d %H:%M:%S%z",
"%Y/%m/%d %H:%M:%S",
"%Y/%m/%d %H:%M",
@@ -137,7 +136,6 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
"%m/%d/%Y %H:%M:%S",
"%m/%d/%Y %H:%M",
"%m/%d/%Y",
-
"%B %d, %Y",
"%d %B %Y",
]
@@ -148,6 +146,7 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
return fmt
except ValueError:
continue
+ return None
def get_duration(started_at: datetime) -> float:
From 5c2d94a546d07355938ee4c45b53c3fca96fb899 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 27 Dec 2024 12:45:23 +0100
Subject: [PATCH 15/40] fix check for datetime
---
services/worker/src/worker/statistics_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index cb5e19f8ca..e35e04a266 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -482,7 +482,7 @@ def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[st
"""Check if first 1000 non-null samples in a column match datetime format. If true, also return datetime format"""
values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()
- _is_datetime = all(is_datetime(value) for value in values)
+ _is_datetime = all(is_datetime(value) for value in values) if len(values) > 0 else False
if _is_datetime:
formats = [identify_datetime_format(value) for value in values]
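The added length guard matters because `all()` over an empty iterable is vacuously `True`, so a column containing only nulls would otherwise be classified as datetime. A minimal illustration:

```python
values: list[str] = []
print(all(v.startswith("2024") for v in values))  # True: vacuously true, with no evidence
is_dt = all(v.startswith("2024") for v in values) if len(values) > 0 else False
print(is_dt)  # False
```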
From 359a30bf1b625e93fb64f786e75844882d27d28e Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 27 Dec 2024 13:41:06 +0100
Subject: [PATCH 16/40] minor
---
.../split/test_descriptive_statistics.py | 10 +++++-----
services/worker/tests/test_statistics_utils.py | 14 +++++++-------
2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 20ca8b369f..4aa1c68900 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -353,11 +353,11 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
("descriptive_statistics", None),
("descriptive_statistics_string_text", None),
("descriptive_statistics_string_text_partial", None),
- # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
- # ("audio_statistics", None),
- # ("image_statistics", None),
- # ("datetime_statistics", None),
- # ("gated", None),
+ ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+ ("audio_statistics", None),
+ ("image_statistics", None),
+ ("datetime_statistics", None),
+ ("gated", None),
],
)
def test_compute(
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index e2ff01f120..d693acf501 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -2,7 +2,7 @@
# Copyright 2024 The HuggingFace Authors.
import datetime
from collections.abc import Mapping
-from typing import Optional, Union
+from typing import Any, Optional, Union
import numpy as np
import pandas as pd
@@ -68,7 +68,7 @@ def test_generate_bins(
def count_expected_statistics_for_numerical_column(
column: pd.Series, # type: ignore
dtype: ColumnType,
-) -> dict: # type: ignore
+) -> dict[str, Any]:
minimum, maximum, mean, median, std = (
column.min(),
column.max(),
@@ -125,7 +125,7 @@ def count_expected_statistics_for_numerical_column(
}
-def count_expected_statistics_for_list_column(column: pd.Series) -> dict: # type: ignore
+def count_expected_statistics_for_list_column(column: pd.Series) -> dict[str, Any]:
if column.isnull().all():
lengths_column = pd.Series([None] * column.shape[0])
return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT)
@@ -141,7 +141,7 @@ def count_expected_statistics_for_list_column(column: pd.Series) -> dict: # typ
def count_expected_statistics_for_categorical_column(
column: pd.Series, # type: ignore
class_label_feature: ClassLabel,
-) -> dict: # type: ignore
+) -> dict[str, Any]:
n_samples = column.shape[0]
nan_count = column.isna().sum()
value_counts = column.value_counts(dropna=True).to_dict()
@@ -160,7 +160,7 @@ def count_expected_statistics_for_categorical_column(
}
-def count_expected_statistics_for_string_column(column: pd.Series) -> dict: # type: ignore
+def count_expected_statistics_for_string_column(column: pd.Series) -> dict[str, Any]:
n_samples = column.shape[0]
nan_count = column.isna().sum()
value_counts = column.value_counts(dropna=True).to_dict()
@@ -183,7 +183,7 @@ def count_expected_statistics_for_string_column(column: pd.Series) -> dict: # t
return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT)
-def count_expected_statistics_for_bool_column(column: pd.Series) -> dict: # type: ignore
+def count_expected_statistics_for_bool_column(column: pd.Series) -> dict[str, Any]:
n_samples = column.shape[0]
nan_count = column.isna().sum()
value_counts = column.value_counts(dropna=True).to_dict()
@@ -474,7 +474,7 @@ def test_image_statistics(
assert computed == expected
-def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict: # type: ignore
+def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict[str, Any]:
n_samples = column.shape[0]
nan_count = column.isna().sum()
if nan_count == n_samples:
From 0744e075b4ec8d41a0d84004fd96b7cfb83f1e57 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 27 Dec 2024 14:01:50 +0100
Subject: [PATCH 17/40] mypy
---
services/worker/tests/test_statistics_utils.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index d693acf501..648cf6791b 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -125,7 +125,7 @@ def count_expected_statistics_for_numerical_column(
}
-def count_expected_statistics_for_list_column(column: pd.Series) -> dict[str, Any]:
+def count_expected_statistics_for_list_column(column: pd.Series) -> dict[str, Any]: # type: ignore
if column.isnull().all():
lengths_column = pd.Series([None] * column.shape[0])
return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT)
@@ -160,7 +160,7 @@ def count_expected_statistics_for_categorical_column(
}
-def count_expected_statistics_for_string_column(column: pd.Series) -> dict[str, Any]:
+def count_expected_statistics_for_string_column(column: pd.Series) -> dict[str, Any]: # type: ignore
n_samples = column.shape[0]
nan_count = column.isna().sum()
value_counts = column.value_counts(dropna=True).to_dict()
@@ -183,7 +183,7 @@ def count_expected_statistics_for_string_column(column: pd.Series) -> dict[str,
return count_expected_statistics_for_numerical_column(lengths_column, dtype=ColumnType.INT)
-def count_expected_statistics_for_bool_column(column: pd.Series) -> dict[str, Any]:
+def count_expected_statistics_for_bool_column(column: pd.Series) -> dict[str, Any]: # type: ignore
n_samples = column.shape[0]
nan_count = column.isna().sum()
value_counts = column.value_counts(dropna=True).to_dict()
@@ -474,7 +474,7 @@ def test_image_statistics(
assert computed == expected
-def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict[str, Any]:
+def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict[str, Any]: # type: ignore
n_samples = column.shape[0]
nan_count = column.isna().sum()
if nan_count == n_samples:
From 53e210085722afbda4e50b762edbabb13f5cd941 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 6 Jan 2025 17:50:36 +0100
Subject: [PATCH 18/40] add testcase
currently not passing, bug on polars side?
---
services/worker/src/worker/statistics_utils.py | 6 ++++--
.../worker/tests/fixtures/statistics_dataset.py | 14 ++++++++++++++
services/worker/tests/test_statistics_utils.py | 6 +++---
3 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index e35e04a266..b1322ce66a 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -826,8 +826,10 @@ def _compute_statistics(
),
)
- def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
- stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
+ def compute_and_prepare_response(
+ self, data: pl.DataFrame, format: Optional[str] = None
+ ) -> StatisticsPerColumnItem:
+ stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples, format=format)
return StatisticsPerColumnItem(
column_name=self.name,
column_type=ColumnType.DATETIME,
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index acbec4858a..8768f854f0 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1729,6 +1729,19 @@ def null_column(n_samples: int) -> list[None]:
"2024-01-10 00:00:00",
"2024-01-11 00:00:00",
],
+ "datetime_string_z": [
+ "2024-01-01 00:00:00Z",
+ "2024-01-02 00:00:00Z",
+ "2024-01-03 00:00:00Z",
+ "2024-01-04 00:00:00Z",
+ "2024-01-05 00:00:00Z",
+ "2024-01-06 00:00:00Z",
+ "2024-01-07 00:00:00Z",
+ "2024-01-08 00:00:00Z",
+ "2024-01-09 00:00:00Z",
+ "2024-01-10 00:00:00Z",
+ "2024-01-11 00:00:00Z",
+ ],
"datetime_tz": [
datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
@@ -1761,6 +1774,7 @@ def null_column(n_samples: int) -> list[None]:
{
"datetime": Value("timestamp[s]"),
"datetime_string": Value("string"),
+ "datetime_string_z": Value("string"),
"datetime_tz": Value("timestamp[s, tz=+02:00]"),
"datetime_null": Value("timestamp[s]"),
"datetime_all_null": Value("timestamp[s]"),
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 648cf6791b..5b85ddc340 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -513,7 +513,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
# compute std
seconds_in_day = 24 * 60 * 60
- if column_name in ["datetime", "datetime_string", "datetime_tz"]:
+ if column_name in ["datetime", "datetime_string", "datetime_string_z", "datetime_tz"]:
timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
elif column_name == "datetime_null":
@@ -542,7 +542,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
@pytest.mark.parametrize(
"column_name",
- ["datetime", "datetime_string", "datetime_tz", "datetime_null", "datetime_all_null"],
+ ["datetime", "datetime_string", "datetime_string_z", "datetime_tz", "datetime_null", "datetime_all_null"],
)
def test_datetime_statistics(
column_name: str,
@@ -550,7 +550,7 @@ def test_datetime_statistics(
) -> None:
data = datasets["datetime_statistics"].to_pandas()
expected = count_expected_statistics_for_datetime_column(data[column_name], column_name)
- if column_name == "datetime_string":
+ if "_string" in column_name:
computed = StringColumn.compute_statistics(
data=pl.from_pandas(data),
column_name=column_name,
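A reproduction sketch for the new `datetime_string_z` case (assuming `polars` is installed; the commit note suspects a polars-side issue, and handling of the trailing `Z` marker has differed between inferred and explicit formats across polars versions):

```python
import polars as pl

s = pl.Series("dt", ["2024-01-01 00:00:00Z", "2024-01-02 00:00:00Z"])
for fmt in (None, "%Y-%m-%d %H:%M:%S%z"):  # inferred format first, then an explicit one
    try:
        print(fmt, s.str.to_datetime(format=fmt).dtype)
    except Exception as err:
        print(fmt, "failed:", err)
```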
From 3df62647a417addfd79856473836d8b08b97a2e7 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 8 Jan 2025 16:22:35 +0100
Subject: [PATCH 19/40] fix?
---
libs/libcommon/src/libcommon/utils.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index 90c181ae21..837d595d7c 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -142,7 +142,9 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
for fmt in common_formats:
try:
- datetime.strptime(datetime_string, fmt)
+ _ = datetime.strptime(datetime_string, fmt)
+ if fmt.endswith("%z") and any(datetime_string.endswith(timezone) for timezone in ["Z", "UTC", "ACST"]):
+ fmt = f"{fmt.rstrip('%z')}%Z"
return fmt
except ValueError:
continue
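Context for this workaround: since Python 3.7, `strptime`'s `%z` accepts the literal `Z` (meaning UTC) in addition to numeric offsets, so `Z`-suffixed strings match the `%z` candidates first, and the branch rewrites the reported format to `%Z` for named-zone spellings. A quick check of the `%z` behavior:

```python
from datetime import datetime

for value in ["2024-01-01 00:00:00Z", "2024-01-01 00:00:00+0200"]:
    dt = datetime.strptime(value, "%Y-%m-%d %H:%M:%S%z")
    print(dt.tzinfo)  # UTC, then UTC+02:00
```

As an aside, `fmt.rstrip('%z')` strips the character set `{'%', 'z'}` from the right rather than the two-character suffix; it happens to be harmless for these candidates but is fragile in general.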
From 812bf36d5011fc1eb632ab1ea00b47eac6e6ec77 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 8 Jan 2025 17:53:41 +0100
Subject: [PATCH 20/40] add example to docs
---
docs/source/statistics.md | 55 +++++++++++++++++++++++++++++++++++++++
1 file changed, 55 insertions(+)
diff --git a/docs/source/statistics.md b/docs/source/statistics.md
index 3b061c010c..a849a11638 100644
--- a/docs/source/statistics.md
+++ b/docs/source/statistics.md
@@ -178,6 +178,7 @@ Currently, statistics are supported for strings, float and integer numbers, list
* `list` - for lists of any other data types (including lists)
* `audio` - for audio data
* `image` - for image data
+* `datetime` - for datetime data
### `class_label`
@@ -591,3 +592,57 @@ For image data, the distribution of images widths is computed. The following mea
+
+### datetime
+
+The distribution of datetime values is computed.
+
+Example
+
+
+```json
+{
+ "column_name": "date",
+ "column_type": "datetime",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": "2013-05-18 04:54:11",
+ "max": "2013-06-20 10:01:41",
+ "mean": "2013-05-27 18:03:39",
+ "median": "2013-05-23 11:55:50",
+ "std": "11 days, 4:57:32.322450",
+ "histogram": {
+ "hist": [
+ 318776,
+ 393036,
+ 173904,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 206284
+ ],
+ "bin_edges": [
+ "2013-05-18 04:54:11",
+ "2013-05-21 12:36:57",
+ "2013-05-24 20:19:43",
+ "2013-05-28 04:02:29",
+ "2013-05-31 11:45:15",
+ "2013-06-03 19:28:01",
+ "2013-06-07 03:10:47",
+ "2013-06-10 10:53:33",
+ "2013-06-13 18:36:19",
+ "2013-06-17 02:19:05",
+ "2013-06-20 10:01:41"
+ ]
+ }
+ }
+}
+```
+
+
+
+
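For readers of this docs page, a hypothetical client-side sketch of consuming the payload above (the dataset, config, and split values are placeholders; the endpoint and response shape are those shown in the docs):

```python
import requests

response = requests.get(
    "https://datasets-server.huggingface.co/statistics",
    params={"dataset": "some/dataset", "config": "default", "split": "train"},
)
for item in response.json()["statistics"]:
    if item["column_type"] == "datetime":
        stats = item["column_statistics"]
        print(item["column_name"], stats["min"], stats["max"], stats["std"])
```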
From c68efb7f744f8428b039281e7de847e03b999700 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 9 Jan 2025 13:21:49 +0100
Subject: [PATCH 21/40] fix + add tz string (%Z) to formats
---
libs/libcommon/src/libcommon/utils.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index 837d595d7c..b2052f0265 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -109,29 +109,36 @@ def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str
def identify_datetime_format(datetime_string: str) -> Optional[str]:
# Common datetime formats
common_formats = [
+ "%Y-%m-%dT%H:%M:%S%Z",
"%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%S",
"%Y-%m-%dT%H:%M:%S.%f",
+ "%Y-%m-%d %H:%M:%S%Z",
"%Y-%m-%d %H:%M:%S%z",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%d %H:%M",
"%Y-%m-%d",
+ "%d-%m-%Y %H:%M:%S%Z",
"%d-%m-%Y %H:%M:%S%z",
"%d-%m-%Y %H:%M:%S",
"%d-%m-%Y %H:%M",
"%d-%m-%Y",
+ "%m-%d-%Y %H:%M:%S%Z",
"%m-%d-%Y %H:%M:%S%z",
"%m-%d-%Y %H:%M:%S",
"%m-%d-%Y %H:%M",
"%m-%d-%Y",
+ "%Y/%m/%d %H:%M:%S%Z",
"%Y/%m/%d %H:%M:%S%z",
"%Y/%m/%d %H:%M:%S",
"%Y/%m/%d %H:%M",
"%Y/%m/%d",
+ "%d/%m/%Y %H:%M:%S%Z",
"%d/%m/%Y %H:%M:%S%z",
"%d/%m/%Y %H:%M:%S",
"%d/%m/%Y %H:%M",
"%d/%m/%Y",
+ "%m/%d/%Y %H:%M:%S%Z",
"%m/%d/%Y %H:%M:%S%z",
"%m/%d/%Y %H:%M:%S",
"%m/%d/%Y %H:%M",
@@ -143,7 +150,7 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
for fmt in common_formats:
try:
_ = datetime.strptime(datetime_string, fmt)
- if fmt.endswith("%z") and any(datetime_string.endswith(timezone) for timezone in ["Z", "UTC", "ACST"]):
+ if fmt.endswith("%z") and any(datetime_string.endswith(tz) for tz in ["Z", "ACST"]):
fmt = f"{fmt.rstrip('%z')}%Z"
return fmt
except ValueError:
From 351ef5cfeb0b47d3e504b1632db524a56e4c8a30 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 9 Jan 2025 14:22:29 +0100
Subject: [PATCH 22/40] test for string timezone
not sure it works as expected
---
.../tests/fixtures/statistics_dataset.py | 56 ++++++++++++++-----
.../worker/tests/test_statistics_utils.py | 38 +++++++++++--
2 files changed, 74 insertions(+), 20 deletions(-)
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index 8768f854f0..7215472ca2 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1703,19 +1703,6 @@ def null_column(n_samples: int) -> list[None]:
datetime_dataset = Dataset.from_dict(
{
- "datetime": [
- datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
- datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
- ],
"datetime_string": [
"2024-01-01 00:00:00",
"2024-01-02 00:00:00",
@@ -1742,6 +1729,45 @@ def null_column(n_samples: int) -> list[None]:
"2024-01-10 00:00:00Z",
"2024-01-11 00:00:00Z",
],
+ "datetime_string_cet": [
+ "2024-01-01 00:00:00CET",
+ "2024-01-02 00:00:00CET",
+ "2024-01-03 00:00:00CET",
+ "2024-01-04 00:00:00CET",
+ "2024-01-05 00:00:00CET",
+ "2024-01-06 00:00:00CET",
+ "2024-01-07 00:00:00CET",
+ "2024-01-08 00:00:00CET",
+ "2024-01-09 00:00:00CET",
+ "2024-01-10 00:00:00CET",
+ "2024-01-11 00:00:00CET",
+ ],
+ "datetime_string_tz": [
+ "2024-01-01 00:00:00+0200",
+ "2024-01-02 00:00:00+0200",
+ "2024-01-03 00:00:00+0200",
+ "2024-01-04 00:00:00+0200",
+ "2024-01-05 00:00:00+0200",
+ "2024-01-06 00:00:00+0200",
+ "2024-01-07 00:00:00+0200",
+ "2024-01-08 00:00:00+0200",
+ "2024-01-09 00:00:00+0200",
+ "2024-01-10 00:00:00+0200",
+ "2024-01-11 00:00:00+0200",
+ ],
+ "datetime": [
+ datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
+ ],
"datetime_tz": [
datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
@@ -1772,9 +1798,11 @@ def null_column(n_samples: int) -> list[None]:
},
features=Features(
{
- "datetime": Value("timestamp[s]"),
"datetime_string": Value("string"),
"datetime_string_z": Value("string"),
+ "datetime_string_cet": Value("string"),
+ "datetime_string_tz": Value("string"),
+ "datetime": Value("timestamp[s]"),
"datetime_tz": Value("timestamp[s, tz=+02:00]"),
"datetime_null": Value("timestamp[s]"),
"datetime_all_null": Value("timestamp[s]"),
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 5b85ddc340..86ec8167f8 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -510,17 +510,34 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
if column_name == "datetime_tz":
bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges]
minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200"
+ elif column_name == "datetime_string_tz":
+ # switch everything to two hours earlier in UTC timezone
+ minv = "2023-12-31 22:00:00+0000"
+ maxv = "2024-01-10 22:00:00+0000"
+ mean = "2024-01-05 22:00:00+0000"
+ median = "2024-01-05 22:00:00+0000"
+ bin_edges = [
+ "2023-12-31 22:00:00+0000",
+ "2024-01-01 22:00:01+0000",
+ "2024-01-02 22:00:02+0000",
+ "2024-01-03 22:00:03+0000",
+ "2024-01-04 22:00:04+0000",
+ "2024-01-05 22:00:05+0000",
+ "2024-01-06 22:00:06+0000",
+ "2024-01-07 22:00:07+0000",
+ "2024-01-08 22:00:08+0000",
+ "2024-01-09 22:00:09+0000",
+ "2024-01-10 22:00:00+0000",
+ ]
# compute std
seconds_in_day = 24 * 60 * 60
- if column_name in ["datetime", "datetime_string", "datetime_string_z", "datetime_tz"]:
- timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
- hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
- elif column_name == "datetime_null":
+ if column_name == "datetime_null":
timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day)) # take every other day
hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
else:
- raise ValueError("Incorrect column")
+ timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+ hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
std = timedeltas.std()
std_str = str(datetime.timedelta(seconds=std))
@@ -542,7 +559,16 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
@pytest.mark.parametrize(
"column_name",
- ["datetime", "datetime_string", "datetime_string_z", "datetime_tz", "datetime_null", "datetime_all_null"],
+ [
+ "datetime",
+ "datetime_string",
+ "datetime_string_z",
+ "datetime_string_cet",
+ "datetime_string_tz",
+ "datetime_tz",
+ "datetime_null",
+ "datetime_all_null",
+ ],
)
def test_datetime_statistics(
column_name: str,
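The hardcoded `+0000` expectations above are simply the `+0200` fixture values rendered in UTC, two hours earlier, reflecting how polars converts to UTC when casting timezone-aware strings (behavior revisited in a later patch). For instance:

```python
import datetime

dt = datetime.datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z")
print(dt.astimezone(datetime.timezone.utc))  # 2023-12-31 22:00:00+00:00
```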
From 787ad3bbdb36735d4a81e8f204fff8e05860c756 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 10 Jan 2025 13:23:41 +0100
Subject: [PATCH 23/40] try to debug
the test failure is not reproduced locally
---
services/worker/src/worker/statistics_utils.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index b1322ce66a..6711c61a9c 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -487,6 +487,9 @@ def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[st
if _is_datetime:
formats = [identify_datetime_format(value) for value in values]
if len(set(formats)) == 1:
+ datetime_format = formats[0]
+ if not datetime_format:
+ raise ValueError("Values are datetime but format is not identified")
return True, formats[0]
raise StatisticsComputationError("Multiple datetime formats detected. ")
From 5163500b092b0950e29b82ab76634d386df2553a Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 10 Jan 2025 16:24:10 +0100
Subject: [PATCH 24/40] test identify_datetime_format
to debug why the test fails in CI but passes locally
---
libs/libcommon/tests/test_utils.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py
index d929e8d7de..fef9784e56 100644
--- a/libs/libcommon/tests/test_utils.py
+++ b/libs/libcommon/tests/test_utils.py
@@ -16,6 +16,7 @@
get_datetime,
get_duration,
get_expires,
+ identify_datetime_format,
inputs_to_string,
is_image_url,
orjson_dumps,
@@ -153,3 +154,13 @@ def test_serialize_and_truncate_raises(obj: Any, max_bytes: int) -> None:
def test_get_duration() -> None:
assert get_duration(get_datetime() - timedelta(seconds=10)) == pytest.approx(10, rel=0.01)
+
+
+@pytest.mark.parametrize(
+ "datetime_string,expected_format",
+ [
+ ("2024-01-01 00:00:00CET", "%Y-%m-%d %H:%M:%S%Z"),
+ ],
+)
+def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None:
+ assert identify_datetime_format(datetime_string) == expected_format
From 033e29e5b9672599ed7562a11fa23e4fdaebf4ce Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 13 Jan 2025 12:19:52 +0100
Subject: [PATCH 25/40] test datetime.strptime
---
libs/libcommon/tests/test_utils.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py
index fef9784e56..67bf33cee0 100644
--- a/libs/libcommon/tests/test_utils.py
+++ b/libs/libcommon/tests/test_utils.py
@@ -163,4 +163,5 @@ def test_get_duration() -> None:
],
)
def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None:
+ assert datetime.strptime(datetime_string, expected_format), "datetime error"
assert identify_datetime_format(datetime_string) == expected_format
From 349b65185010eb98bc927c00dc842f0c4b16cf04 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Mon, 13 Jan 2025 13:14:31 +0100
Subject: [PATCH 26/40] test
---
libs/libcommon/tests/test_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py
index 67bf33cee0..05b294b064 100644
--- a/libs/libcommon/tests/test_utils.py
+++ b/libs/libcommon/tests/test_utils.py
@@ -159,7 +159,7 @@ def test_get_duration() -> None:
@pytest.mark.parametrize(
"datetime_string,expected_format",
[
- ("2024-01-01 00:00:00CET", "%Y-%m-%d %H:%M:%S%Z"),
+ ("2024-01-01 00:00:00 CET", "%Y-%m-%d %H:%M:%S %Z"),
],
)
def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None:
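A plausible explanation for the "passes locally, fails in CI" mystery these debugging patches are chasing: CPython's `%Z` directive only matches a small, platform-dependent set of names, namely `UTC`, `GMT`, and the local zone names in `time.tzname`. On a machine in a CET locale the fixture parses; on a CI runner pinned to UTC it raises:

```python
import time
from datetime import datetime

print(time.tzname)  # e.g. ('CET', 'CEST') locally, ('UTC', 'UTC') on a typical CI runner
try:
    datetime.strptime("2024-01-01 00:00:00 CET", "%Y-%m-%d %H:%M:%S %Z")
    print("parsed")
except ValueError:
    print("unparseable on this platform")
```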
From 6c60c273499468dcb306d6ae69f796188746effc Mon Sep 17 00:00:00 2001
From: Polina Kazakova
Date: Wed, 15 Jan 2025 12:17:31 +0100
Subject: [PATCH 27/40] Update services/worker/src/worker/statistics_utils.py
Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
services/worker/src/worker/statistics_utils.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 6711c61a9c..3514771166 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -479,9 +479,9 @@ def is_class(n_unique: int, n_samples: int) -> bool:
@staticmethod
def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[str]]:
- """Check if first 1000 non-null samples in a column match datetime format. If true, also return datetime format"""
+ """Check if first 100 non-null samples in a column match datetime format. If true, also return datetime format"""
- values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()
+ values = data.filter(pl.col(column_name).is_not_null()).head(100)[column_name].to_list()
_is_datetime = all(is_datetime(value) for value in values) if len(values) > 0 else False
if _is_datetime:
From db10500ad471e39b61c98c5b72b945b62d276c7a Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 15 Jan 2025 14:37:40 +0100
Subject: [PATCH 28/40] keep original timezone for string dates
polars switches dates to UTC when casting from string, while we want to preserve the original timezone
---
libs/libcommon/src/libcommon/utils.py | 6 ++++-
.../worker/src/worker/statistics_utils.py | 9 +++++--
.../tests/fixtures/statistics_dataset.py | 26 +++++++++----------
.../worker/tests/test_statistics_utils.py | 23 ++--------------
4 files changed, 27 insertions(+), 37 deletions(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index b2052f0265..3e8134da64 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -7,7 +7,7 @@
import mimetypes
import time
from collections.abc import Callable, Sequence
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timedelta, timezone, tzinfo
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, Optional, TypeVar, Union, cast
@@ -102,6 +102,10 @@ def is_datetime(string: str) -> bool:
return False
+def get_timezone(string: str) -> Optional[tzinfo]:
+ return parser.parse(string).tzinfo
+
+
def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
return dt.strftime(format)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 3514771166..9bc5a2a343 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -15,7 +15,7 @@
from libcommon.exceptions import (
StatisticsComputationError,
)
-from libcommon.utils import datetime_to_string, identify_datetime_format, is_datetime
+from libcommon.utils import datetime_to_string, get_timezone, identify_datetime_format, is_datetime
from PIL import Image
from tqdm.contrib.concurrent import thread_map
@@ -490,7 +490,7 @@ def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[st
datetime_format = formats[0]
if not datetime_format:
raise ValueError("Values are datetime but format is not identified")
- return True, formats[0]
+ return True, datetime_format
raise StatisticsComputationError("Multiple datetime formats detected. ")
return False, None
@@ -792,7 +792,9 @@ def _compute_statistics(
std=None,
histogram=None,
)
+ original_timezone = None
if isinstance(data[column_name].dtype, pl.String):
+ original_timezone = get_timezone(data[column_name][0])
data = data.with_columns(pl.col(column_name).str.to_datetime(format=format))
min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly
@@ -811,6 +813,9 @@ def _compute_statistics(
assert timedelta_stats["median"] is not None # nosec
assert timedelta_stats["std"] is not None # nosec
+ if original_timezone:
+ min_date = min_date.astimezone(original_timezone)
+
datetime_bin_edges = [
cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
]
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index 7215472ca2..9033a9c470 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1729,18 +1729,18 @@ def null_column(n_samples: int) -> list[None]:
"2024-01-10 00:00:00Z",
"2024-01-11 00:00:00Z",
],
- "datetime_string_cet": [
- "2024-01-01 00:00:00CET",
- "2024-01-02 00:00:00CET",
- "2024-01-03 00:00:00CET",
- "2024-01-04 00:00:00CET",
- "2024-01-05 00:00:00CET",
- "2024-01-06 00:00:00CET",
- "2024-01-07 00:00:00CET",
- "2024-01-08 00:00:00CET",
- "2024-01-09 00:00:00CET",
- "2024-01-10 00:00:00CET",
- "2024-01-11 00:00:00CET",
+ "datetime_string_t_z": [
+ "2024-01-01T00:00:00Z",
+ "2024-01-02T00:00:00Z",
+ "2024-01-03T00:00:00Z",
+ "2024-01-04T00:00:00Z",
+ "2024-01-05T00:00:00Z",
+ "2024-01-06T00:00:00Z",
+ "2024-01-07T00:00:00Z",
+ "2024-01-08T00:00:00Z",
+ "2024-01-09T00:00:00Z",
+ "2024-01-10T00:00:00Z",
+ "2024-01-11T00:00:00Z",
],
"datetime_string_tz": [
"2024-01-01 00:00:00+0200",
@@ -1800,7 +1800,7 @@ def null_column(n_samples: int) -> list[None]:
{
"datetime_string": Value("string"),
"datetime_string_z": Value("string"),
- "datetime_string_cet": Value("string"),
+ "datetime_string_t_z": Value("string"),
"datetime_string_tz": Value("string"),
"datetime": Value("timestamp[s]"),
"datetime_tz": Value("timestamp[s, tz=+02:00]"),
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 86ec8167f8..a4bd281fe7 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -507,28 +507,9 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
"2024-01-10 00:00:09",
"2024-01-11 00:00:00",
]
- if column_name == "datetime_tz":
+ if column_name in ["datetime_tz", "datetime_string_tz"]:
bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges]
minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200"
- elif column_name == "datetime_string_tz":
- # switch everything to two hours earlier in UTC timezone
- minv = "2023-12-31 22:00:00+0000"
- maxv = "2024-01-10 22:00:00+0000"
- mean = "2024-01-05 22:00:00+0000"
- median = "2024-01-05 22:00:00+0000"
- bin_edges = [
- "2023-12-31 22:00:00+0000",
- "2024-01-01 22:00:01+0000",
- "2024-01-02 22:00:02+0000",
- "2024-01-03 22:00:03+0000",
- "2024-01-04 22:00:04+0000",
- "2024-01-05 22:00:05+0000",
- "2024-01-06 22:00:06+0000",
- "2024-01-07 22:00:07+0000",
- "2024-01-08 22:00:08+0000",
- "2024-01-09 22:00:09+0000",
- "2024-01-10 22:00:00+0000",
- ]
# compute std
seconds_in_day = 24 * 60 * 60
@@ -563,7 +544,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
"datetime",
"datetime_string",
"datetime_string_z",
- "datetime_string_cet",
+ "datetime_string_t_z",
"datetime_string_tz",
"datetime_tz",
"datetime_null",
From 8794b7ae3b17329f6e73b0e35083f2d26e5a54b9 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 15 Jan 2025 15:24:22 +0100
Subject: [PATCH 29/40] let polars identify datetime format by itself
provide it manually only in case of failure
---
services/worker/src/worker/statistics_utils.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 9bc5a2a343..7afa2e996c 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -795,7 +795,11 @@ def _compute_statistics(
original_timezone = None
if isinstance(data[column_name].dtype, pl.String):
original_timezone = get_timezone(data[column_name][0])
- data = data.with_columns(pl.col(column_name).str.to_datetime(format=format))
+ # let polars identify the format itself; provide it manually in case of error
+ try:
+ data = data.with_columns(pl.col(column_name).str.to_datetime())
+ except pl.ComputeError:
+ data = data.with_columns(pl.col(column_name).str.to_datetime(format=format))
min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly
timedelta_column_name = f"{column_name}_timedelta"
@@ -806,7 +810,7 @@ def _compute_statistics(
column_name=timedelta_column_name,
n_samples=n_samples,
)
- # to assure mypy that there values are not None to pass to conversion functions:
+ # to assure mypy that these values are not None to pass to conversion functions:
assert timedelta_stats["histogram"] is not None # nosec
assert timedelta_stats["max"] is not None # nosec
assert timedelta_stats["mean"] is not None # nosec
From e0e7c91989c9018615683265e4a35647bb700b16 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 15 Jan 2025 15:24:49 +0100
Subject: [PATCH 30/40] do not display +0000 in timestamps (if timezone is UTC)
---
libs/libcommon/src/libcommon/utils.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index 3e8134da64..c540d77e48 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -7,7 +7,7 @@
import mimetypes
import time
from collections.abc import Callable, Sequence
-from datetime import datetime, timedelta, timezone, tzinfo
+from datetime import datetime, timedelta, timezone
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, Optional, TypeVar, Union, cast
@@ -102,11 +102,13 @@ def is_datetime(string: str) -> bool:
return False
-def get_timezone(string: str) -> Optional[tzinfo]:
+def get_timezone(string: str) -> Any:
return parser.parse(string).tzinfo
def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
+ if dt.utcoffset() == timedelta(0):
+ format = "%Y-%m-%d %H:%M:%S" # do not display +0000
return dt.strftime(format)
@@ -154,8 +156,6 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
for fmt in common_formats:
try:
_ = datetime.strptime(datetime_string, fmt)
- if fmt.endswith("%z") and any(datetime_string.endswith(tz) for tz in ["Z", "ACST"]):
- fmt = f"{fmt.rstrip('%z')}%Z"
return fmt
except ValueError:
continue
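What the `datetime_to_string` tweak does, in isolation: UTC values render without the redundant `+0000` suffix, while non-zero offsets stay visible:

```python
import datetime

fmt = "%Y-%m-%d %H:%M:%S%z"
utc = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
cet = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone(datetime.timedelta(hours=2)))
for dt in (utc, cet):
    f = "%Y-%m-%d %H:%M:%S" if dt.utcoffset() == datetime.timedelta(0) else fmt
    print(dt.strftime(f))  # "2024-01-01 00:00:00", then "2024-01-01 00:00:00+0200"
```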
From 8afade1c9bbaf1502e70b5567d7f1c50d20c78a6 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 15 Jan 2025 15:59:23 +0100
Subject: [PATCH 31/40] remove utils test
---
libs/libcommon/tests/test_utils.py | 11 -----------
1 file changed, 11 deletions(-)
diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py
index 05b294b064..67e909f1c6 100644
--- a/libs/libcommon/tests/test_utils.py
+++ b/libs/libcommon/tests/test_utils.py
@@ -154,14 +154,3 @@ def test_serialize_and_truncate_raises(obj: Any, max_bytes: int) -> None:
def test_get_duration() -> None:
assert get_duration(get_datetime() - timedelta(seconds=10)) == pytest.approx(10, rel=0.01)
-
-
-@pytest.mark.parametrize(
- "datetime_string,expected_format",
- [
- ("2024-01-01 00:00:00 CET", "%Y-%m-%d %H:%M:%S %Z"),
- ],
-)
-def test_identify_datetime_format(datetime_string: str, expected_format: str) -> None:
- assert datetime.strptime(datetime_string, expected_format), "datetime error"
- assert identify_datetime_format(datetime_string) == expected_format
From 341676c5c0001d5afda908192689000e5aba653d Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Wed, 15 Jan 2025 15:59:29 +0100
Subject: [PATCH 32/40] refactor: identify datetime format manually only when
polars fails
---
.../worker/src/worker/statistics_utils.py | 46 +++++++++----------
1 file changed, 23 insertions(+), 23 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 7afa2e996c..653eb79aac 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -478,22 +478,11 @@ def is_class(n_unique: int, n_samples: int) -> bool:
) or n_unique <= NUM_BINS
@staticmethod
- def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[str]]:
- """Check if first 100 non-null samples in a column match datetime format. If true, also return datetime format"""
+ def is_datetime(data: pl.DataFrame, column_name: str) -> bool:
+ """Check if first 100 non-null samples in a column match datetime format."""
values = data.filter(pl.col(column_name).is_not_null()).head(100)[column_name].to_list()
- _is_datetime = all(is_datetime(value) for value in values) if len(values) > 0 else False
-
- if _is_datetime:
- formats = [identify_datetime_format(value) for value in values]
- if len(set(formats)) == 1:
- datetime_format = formats[0]
- if not datetime_format:
- raise ValueError("Values are datetime but format is not identified")
- return True, datetime_format
- raise StatisticsComputationError("Multiple datetime formats detected. ")
-
- return False, None
+ return all(is_datetime(value) for value in values) if len(values) > 0 else False
@classmethod
def compute_transformed_data(
@@ -515,13 +504,11 @@ def _compute_statistics(
) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
- _is_datetime, datetime_format = cls.is_datetime(data, column_name)
- if _is_datetime:
+ if cls.is_datetime(data, column_name):
datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
data,
column_name=column_name,
n_samples=n_samples,
- format=datetime_format,
)
return datetime_stats
@@ -772,13 +759,27 @@ def compute_transformed_data(
def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str:
return datetime_to_string(base_date + datetime.timedelta(seconds=seconds))
+ @staticmethod
+ def get_format(data: pl.DataFrame, column_name: str) -> str:
+ values = data.filter(pl.col(column_name).is_not_null()).head(100)[column_name].to_list()
+ formats = [identify_datetime_format(value) for value in values]
+ if len(set(formats)) == 1:
+ datetime_format = formats[0]
+ if not datetime_format:
+ raise StatisticsComputationError(
+ f"Values are datetime but format is not identified. Example: {values[0]}. "
+ )
+ else:
+ raise StatisticsComputationError("Multiple datetime formats detected. ")
+
+ return datetime_format
+
@classmethod
def _compute_statistics(
cls,
data: pl.DataFrame,
column_name: str,
n_samples: int,
- format: Optional[str] = None,
) -> DatetimeStatisticsItem:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
if nan_count == n_samples: # all values are None
@@ -799,7 +800,8 @@ def _compute_statistics(
try:
data = data.with_columns(pl.col(column_name).str.to_datetime())
except pl.ComputeError:
- data = data.with_columns(pl.col(column_name).str.to_datetime(format=format))
+ datetime_format = cls.get_format(data, column_name)
+ data = data.with_columns(pl.col(column_name).str.to_datetime(format=datetime_format))
min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly
timedelta_column_name = f"{column_name}_timedelta"
@@ -838,10 +840,8 @@ def _compute_statistics(
),
)
- def compute_and_prepare_response(
- self, data: pl.DataFrame, format: Optional[str] = None
- ) -> StatisticsPerColumnItem:
- stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples, format=format)
+ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
+ stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
return StatisticsPerColumnItem(
column_name=self.name,
column_type=ColumnType.DATETIME,
From 3b5d9506df1a439f1c23bca26a06f2164b8f4481 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 16 Jan 2025 12:34:50 +0100
Subject: [PATCH 33/40] style
---
libs/libcommon/tests/test_utils.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/libs/libcommon/tests/test_utils.py b/libs/libcommon/tests/test_utils.py
index 67e909f1c6..d929e8d7de 100644
--- a/libs/libcommon/tests/test_utils.py
+++ b/libs/libcommon/tests/test_utils.py
@@ -16,7 +16,6 @@
get_datetime,
get_duration,
get_expires,
- identify_datetime_format,
inputs_to_string,
is_image_url,
orjson_dumps,
From 21977db52bb36c97f4a4983ec0ebefe5df2f18f5 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 16 Jan 2025 12:35:01 +0100
Subject: [PATCH 34/40] log formats in error message
---
services/worker/src/worker/statistics_utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 653eb79aac..3453ff825e 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -770,7 +770,7 @@ def get_format(data: pl.DataFrame, column_name: str) -> str:
f"Values are datetime but format is not identified. Example: {values[0]}. "
)
else:
- raise StatisticsComputationError("Multiple datetime formats detected. ")
+ raise StatisticsComputationError(f"Multiple datetime formats detected: {set(formats)}. ")
return datetime_format
From 0ee76bfe623c36cca66d466e951b7b7c5a532992 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 16 Jan 2025 14:19:40 +0100
Subject: [PATCH 35/40] update openapi specs
---
docs/source/openapi.json | 427 +++++++++++++++++++++++++-
libs/libcommon/src/libcommon/utils.py | 5 +
2 files changed, 430 insertions(+), 2 deletions(-)
diff --git a/docs/source/openapi.json b/docs/source/openapi.json
index 5844fda06a..c51a928009 100644
--- a/docs/source/openapi.json
+++ b/docs/source/openapi.json
@@ -1176,7 +1176,8 @@
"bool",
"list",
"audio",
- "image"
+ "image",
+ "datetime"
]
},
"Histogram": {
@@ -1197,6 +1198,24 @@
}
}
},
+ "DatetimeHistogram": {
+ "type": "object",
+ "required": ["hist", "bin_edges"],
+ "properties": {
+ "hist": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "bin_edges": {
+ "type": "array",
+ "items": {
+ "type": "number"
+ }
+ }
+ }
+ },
"NumericalStatisticsItem": {
"type": "object",
"required": [
@@ -1229,6 +1248,38 @@
}
}
},
+ "DatetimeStatisticsItem": {
+ "type": "object",
+ "required": [
+ "nan_count",
+ "nan_proportion",
+ "min",
+ "max",
+ "mean",
+ "median",
+ "std",
+ "histogram"
+ ],
+ "properties": {
+ "nan_count": {
+ "type": "integer"
+ },
+ "nan_proportion": {
+ "type": "number"
+ },
+ "min": { "oneOf": [{ "type": "string" }, { "type": "null" }] },
+ "max": { "oneOf": [{ "type": "string" }, { "type": "null" }] },
+ "mean": { "oneOf": [{ "type": "string" }, { "type": "null" }] },
+ "median": { "oneOf": [{ "type": "string" }, { "type": "null" }] },
+ "std": { "oneOf": [{ "type": "string" }, { "type": "null" }] },
+ "histogram": {
+ "oneOf": [
+ { "$ref": "#/components/schemas/DatetimeHistogram" },
+ { "type": "null" }
+ ]
+ }
+ }
+ },
"CategoricalStatisticsItem": {
"type": "object",
"description": "note that fields 'no_label_count' and 'no_label_proportion' are not required, because some old entries still miss them, and we don't want to recompute all of them. See https://github.com/huggingface/dataset-viewer/issues/2573.",
@@ -1280,12 +1331,15 @@
{
"$ref": "#/components/schemas/NumericalStatisticsItem"
},
+ {
+ "$ref": "#/components/schemas/DatetimeStatisticsItem"
+ },
{
"$ref": "#/components/schemas/CategoricalStatisticsItem"
},
{
"$ref": "#/components/schemas/BoolStatisticsItem"
}
]
},
"StatisticsPerColumnItem": {
@@ -5925,6 +5979,375 @@
"partial": false
}
},
+ "A split (CL-ETM/datetimeevents) with a datetime column": {
+ "summary": "Statistics on a split with datetime columns 'charttime', 'storetime' and 'value'. ",
+ "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=CL-ETM/datetimeevents&config=mnist&split=train.",
+ "value": {
+ "num_examples": 6653174,
+ "statistics": [
+ {
+ "column_name": "caregiver_id",
+ "column_type": "int",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": 45,
+ "max": 99872,
+ "mean": 49146.20367,
+ "median": 46354.0,
+ "std": 28893.09204,
+ "histogram": {
+ "hist": [
+ 586864,
+ 696061,
+ 882127,
+ 627295,
+ 759981,
+ 594546,
+ 544977,
+ 653948,
+ 507192,
+ 800183
+ ],
+ "bin_edges": [
+ 45,
+ 10028,
+ 20011,
+ 29994,
+ 39977,
+ 49960,
+ 59943,
+ 69926,
+ 79909,
+ 89892,
+ 99872
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "charttime",
+ "column_type": "datetime",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": "2110-01-13 09:39:00",
+ "max": "2214-07-26 08:00:00",
+ "mean": "2153-03-20 23:15:24",
+ "median": "2153-01-19 04:19:30",
+ "std": "8691 days, 20:22:21.464930",
+ "histogram": {
+ "hist": [
+ 644662,
+ 824869,
+ 883173,
+ 884980,
+ 861445,
+ 863916,
+ 838647,
+ 664347,
+ 156213,
+ 30922
+ ],
+ "bin_edges": [
+ "2110-01-13 09:39:00",
+ "2120-06-27 07:05:07",
+ "2130-12-10 04:31:14",
+ "2141-05-24 01:57:21",
+ "2151-11-05 23:23:28",
+ "2162-04-19 20:49:35",
+ "2172-10-01 18:15:42",
+ "2183-03-16 15:41:49",
+ "2193-08-28 13:07:56",
+ "2204-02-11 10:34:03",
+ "2214-07-26 08:00:00"
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "hadm_id",
+ "column_type": "int",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": 20000094,
+ "max": 29999828,
+ "mean": 25027899.88926,
+ "median": 25052613.0,
+ "std": 2869146.55704,
+ "histogram": {
+ "hist": [
+ 638196,
+ 656157,
+ 656168,
+ 661133,
+ 678335,
+ 693220,
+ 676587,
+ 653053,
+ 674626,
+ 665699
+ ],
+ "bin_edges": [
+ 20000094,
+ 21000068,
+ 22000042,
+ 23000016,
+ 23999990,
+ 24999964,
+ 25999938,
+ 26999912,
+ 27999886,
+ 28999860,
+ 29999828
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "itemid",
+ "column_type": "int",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": 224183,
+ "max": 230120,
+ "mean": 225487.4805,
+ "median": 224290.0,
+ "std": 1820.04267,
+ "histogram": {
+ "hist": [
+ 3742726,
+ 568047,
+ 1012645,
+ 75427,
+ 21011,
+ 41780,
+ 311155,
+ 100074,
+ 249544,
+ 530765
+ ],
+ "bin_edges": [
+ 224183,
+ 224777,
+ 225371,
+ 225965,
+ 226559,
+ 227153,
+ 227747,
+ 228341,
+ 228935,
+ 229529,
+ 230120
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "stay_id",
+ "column_type": "int",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": 30000153,
+ "max": 39999858,
+ "mean": 34988877.57506,
+ "median": 34997302.0,
+ "std": 2873138.27766,
+ "histogram": {
+ "hist": [
+ 669019,
+ 638622,
+ 695479,
+ 665010,
+ 659205,
+ 659496,
+ 696313,
+ 662500,
+ 671230,
+ 636300
+ ],
+ "bin_edges": [
+ 30000153,
+ 31000124,
+ 32000095,
+ 33000066,
+ 34000037,
+ 35000008,
+ 35999979,
+ 36999950,
+ 37999921,
+ 38999892,
+ 39999858
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "storetime",
+ "column_type": "datetime",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": "2110-01-13 13:13:00",
+ "max": "2214-07-26 09:20:00",
+ "mean": "2153-03-20 23:57:17",
+ "median": "2153-01-19 03:42:00",
+ "std": "8691 days, 20:22:32.902370",
+ "histogram": {
+ "hist": [
+ 644728,
+ 824803,
+ 883215,
+ 884951,
+ 861438,
+ 863915,
+ 838652,
+ 664336,
+ 156214,
+ 30922
+ ],
+ "bin_edges": [
+ "2110-01-13 13:13:00",
+ "2120-06-27 10:25:43",
+ "2130-12-10 07:38:26",
+ "2141-05-24 04:51:09",
+ "2151-11-06 02:03:52",
+ "2162-04-19 23:16:35",
+ "2172-10-01 20:29:18",
+ "2183-03-16 17:42:01",
+ "2193-08-28 14:54:44",
+ "2204-02-11 12:07:27",
+ "2214-07-26 09:20:00"
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "subject_id",
+ "column_type": "int",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": 10000032,
+ "max": 16657691,
+ "mean": 13340551.62433,
+ "median": 13334004.0,
+ "std": 1927957.39956,
+ "histogram": {
+ "hist": [
+ 638347,
+ 684908,
+ 691450,
+ 631212,
+ 672810,
+ 659625,
+ 641987,
+ 654011,
+ 702989,
+ 675835
+ ],
+ "bin_edges": [
+ 10000032,
+ 10665798,
+ 11331564,
+ 11997330,
+ 12663096,
+ 13328862,
+ 13994628,
+ 14660394,
+ 15326160,
+ 15991926,
+ 16657691
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "value",
+ "column_type": "datetime",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": "2109-08-02 00:00:00",
+ "max": "2214-07-24 09:57:00",
+ "mean": "2153-03-17 00:32:04",
+ "median": "2153-01-15 00:00:00",
+ "std": "8691 days, 20:07:56.642090",
+ "histogram": {
+ "hist": [
+ 611811,
+ 820557,
+ 897262,
+ 880309,
+ 876200,
+ 860348,
+ 845238,
+ 673106,
+ 157352,
+ 30991
+ ],
+ "bin_edges": [
+ "2109-08-02 00:00:00",
+ "2120-01-31 03:23:43",
+ "2130-07-31 06:47:26",
+ "2141-01-28 10:11:09",
+ "2151-07-29 13:34:52",
+ "2162-01-26 16:58:35",
+ "2172-07-26 20:22:18",
+ "2183-01-24 23:46:01",
+ "2193-07-25 03:09:44",
+ "2204-01-24 06:33:27",
+ "2214-07-24 09:57:00"
+ ]
+ }
+ }
+ },
+ {
+ "column_name": "valueuom",
+ "column_type": "string_label",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "no_label_count": 0,
+ "no_label_proportion": 0.0,
+ "n_unique": 2,
+ "frequencies": {
+ "Date and Time": 1885855,
+ "Date": 4767319
+ }
+ }
+ },
+ {
+ "column_name": "warning",
+ "column_type": "int",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": 0,
+ "max": 1,
+ "mean": 0.00028,
+ "median": 0.0,
+ "std": 0.01674,
+ "histogram": {
+ "hist": [
+ 6651308,
+ 1866
+ ],
+ "bin_edges": [
+ 0,
+ 1,
+ 1
+ ]
+ }
+ }
+ }
+ ],
+ "partial": true
+ }
+ },
"A split (nyu-mll/glue) with a string (text) column": {
"summary": "Statistics on a string column. The column 'hypothesis' contains more than 30 different strings, so the statistics are a histogram of the string lengths.",
"description": "Try with https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=ax&split=test.",
diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index c540d77e48..c86077d583 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -151,6 +151,11 @@ def identify_datetime_format(datetime_string: str) -> Optional[str]:
"%m/%d/%Y",
"%B %d, %Y",
"%d %B %Y",
+ "%m-%Y",
+ "%Y-%m",
+ "%m/%Y",
+ "%Y/%m",
+ "%Y",
]
for fmt in common_formats:
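A note on the new coarse candidates (`%m-%Y`, `%Y-%m`, `%m/%Y`, `%Y/%m`, `%Y`): they sit at the end of the list, so fully specified strings still match the more precise formats first, and `strptime` fills the missing fields with defaults:

```python
from datetime import datetime

print(datetime.strptime("2023", "%Y"))        # 2023-01-01 00:00:00 (month and day default to 1)
print(datetime.strptime("01/2023", "%m/%Y"))  # 2023-01-01 00:00:00
```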
From b7fee0bdb643a5f8d0e12e6b32c5ba828951afef Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 16 Jan 2025 14:20:39 +0100
Subject: [PATCH 36/40] fallback to string stats if datetime didn't work
+ test
---
services/worker/src/worker/statistics_utils.py | 18 ++++++++++++------
.../tests/fixtures/statistics_dataset.py | 14 ++++++++++++++
services/worker/tests/test_statistics_utils.py | 8 +++++++-
3 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 3453ff825e..590d9286b9 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -505,12 +505,18 @@ def _compute_statistics(
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
if cls.is_datetime(data, column_name):
- datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
- data,
- column_name=column_name,
- n_samples=n_samples,
- )
- return datetime_stats
+ try:
+ stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
+ data,
+ column_name=column_name,
+ n_samples=n_samples,
+ )
+ return stats
+ except Exception as error:
+ logging.info(
+ f"Column {column_name} is datetime, but datetime stats compute failed ({error}), "
+ f"compute string stats instead. "
+ )
if cls.is_class(n_unique, n_samples):
labels2counts: dict[str, int] = value_counts(data, column_name) if nan_count != n_samples else {}
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index 9033a9c470..416a68f1a2 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1755,6 +1755,19 @@ def null_column(n_samples: int) -> list[None]:
"2024-01-10 00:00:00+0200",
"2024-01-11 00:00:00+0200",
],
+ "datetime_string_error": [
+ "16/01/2023",
+ "17/01/2023",
+ "18/01/2023",
+ "19/01/2023",
+ "01/2023",
+ "02/2023",
+ "20/01/2023",
+ "21/01/2023",
+ "03/2023",
+ "25/01/2023",
+ "26/01/2023",
+ ],
"datetime": [
datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
@@ -1802,6 +1815,7 @@ def null_column(n_samples: int) -> list[None]:
"datetime_string_z": Value("string"),
"datetime_string_t_z": Value("string"),
"datetime_string_tz": Value("string"),
+ "datetime_string_error": Value("string"),
"datetime": Value("timestamp[s]"),
"datetime_tz": Value("timestamp[s, tz=+02:00]"),
"datetime_null": Value("timestamp[s]"),
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index a4bd281fe7..5b3ea88417 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -489,6 +489,10 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
"histogram": None,
}
+    # this testcase mixes multiple datetime formats, so string length distributions are computed instead of raising an error
+ if column_name == "datetime_string_error":
+ return count_expected_statistics_for_string_column(column)
+
# hardcode expected values
minv = "2024-01-01 00:00:00"
maxv = "2024-01-11 00:00:00"
@@ -546,6 +550,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
"datetime_string_z",
"datetime_string_t_z",
"datetime_string_tz",
+ "datetime_string_error",
"datetime_tz",
"datetime_null",
"datetime_all_null",
@@ -569,8 +574,9 @@ def test_datetime_statistics(
column_name=column_name,
n_samples=len(data[column_name]),
)
+
computed_std, expected_std = computed.pop("std"), expected.pop("std")
- if computed_std:
+ if computed_std and column_name != "datetime_string_error":
assert computed_std.split(".")[0] == expected_std.split(".")[0] # check with precision up to seconds
else:
assert computed_std == expected_std
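Since timedelta standard deviations are serialized as strings, the test above compares them only up to whole seconds by splitting off the fractional part; a tiny illustration with made-up values:

```python
import datetime

computed_std = str(datetime.timedelta(days=11, seconds=17852, microseconds=322450))
expected_std = "11 days, 4:57:32.318912"  # same value up to sub-second noise
# "11 days, 4:57:32.322450".split(".")[0] == "11 days, 4:57:32"
assert computed_std.split(".")[0] == expected_std.split(".")[0]
```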
From 6a76dd9e35a3155b5153389b4016bf67b5b4e158 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 16 Jan 2025 14:48:07 +0100
Subject: [PATCH 37/40] fix test
---
.../tests/job_runners/split/test_descriptive_statistics.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 4aa1c68900..0837b73b60 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -341,7 +341,7 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
statistics = count_expected_statistics_for_datetime_column(column=df[column_name], column_name=column_name)
expected_statistics[column_name] = {
"column_name": column_name,
- "column_type": ColumnType.DATETIME,
+ "column_type": ColumnType.DATETIME if column_name != "datetime_string_error" else ColumnType.STRING_TEXT,
"column_statistics": statistics,
}
return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False}
From f3eefea1368768bf427b0013e187dc858874411b Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 16 Jan 2025 15:00:01 +0100
Subject: [PATCH 38/40] update docs
---
docs/source/statistics.md | 20 +++++++++------
services/worker/README.md | 54 +++++++++++++++++++++++++++++++++++++++
2 files changed, 66 insertions(+), 8 deletions(-)
diff --git a/docs/source/statistics.md b/docs/source/statistics.md
index a849a11638..15e820da8a 100644
--- a/docs/source/statistics.md
+++ b/docs/source/statistics.md
@@ -165,7 +165,7 @@ The response JSON contains three keys:
## Response structure by data type
-Currently, statistics are supported for strings, float and integer numbers, lists, audio and image data and the special [`datasets.ClassLabel`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.ClassLabel) feature type of the [`datasets`](https://huggingface.co/docs/datasets/) library.
+Currently, statistics are supported for strings, float and integer numbers, lists, datetimes, audio and image data and the special [`datasets.ClassLabel`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.ClassLabel) feature type of the [`datasets`](https://huggingface.co/docs/datasets/) library.
`column_type` in response can be one of the following values:
@@ -217,7 +217,7 @@ This type represents categorical data encoded as [`ClassLabel`](https://huggingf
The following measures are returned for float data types:
-* minimum, maximum, mean, and standard deviation values
+* minimum, maximum, mean, median, and standard deviation values
* number and proportion of `null` and `NaN` values (`NaN` values are treated as `null`)
* histogram with 10 bins
@@ -274,7 +274,7 @@ The following measures are returned for float data types:
The following measures are returned for integer data types:
-* minimum, maximum, mean, and standard deviation values
+* minimum, maximum, mean, median, and standard deviation values
* number and proportion of `null` values
* histogram with less than or equal to 10 bins
@@ -378,7 +378,7 @@ If the proportion of unique values in a string column within requested split is
If string column does not satisfy the conditions to be treated as a `string_label`, it is considered to be a column containing texts and response contains statistics over text lengths which are calculated by character number. The following measures are computed:
-* minimum, maximum, mean, and standard deviation of text lengths
+* minimum, maximum, mean, median, and standard deviation of text lengths
* number and proportion of `null` values
* histogram of text lengths with 10 bins
@@ -435,7 +435,7 @@ If string column does not satisfy the conditions to be treated as a `string_labe
For lists, the distribution of their lengths is computed. The following measures are returned:
-* minimum, maximum, mean, and standard deviation of lists lengths
+* minimum, maximum, mean, median, and standard deviation of lists lengths
* number and proportion of `null` values
* histogram of lists lengths with up to 10 bins
@@ -481,7 +481,7 @@ Note that dictionaries of lists are not supported.
For audio data, the distribution of audio files durations is computed. The following measures are returned:
-* minimum, maximum, mean, and standard deviation of audio files durations
+* minimum, maximum, mean, median, and standard deviation of audio files durations
* number and proportion of `null` values
* histogram of audio files durations with 10 bins
@@ -540,7 +540,7 @@ For audio data, the distribution of audio files durations is computed. The follo
For image data, the distribution of images widths is computed. The following measures are returned:
-* minimum, maximum, mean, and standard deviation of widths of image files
+* minimum, maximum, mean, median, and standard deviation of widths of image files
* number and proportion of `null` values
* histogram of images widths with 10 bins
@@ -595,7 +595,11 @@ For image data, the distribution of images widths is computed. The following mea
### datetime
-The distribution of datetime is computed.
+The distribution of datetime values is computed. The following measures are returned:
+
+* minimum, maximum, mean, median, and standard deviation of datetime values, represented as strings with precision up to seconds
+* number and proportion of `null` values
+* histogram of datetimes with 10 bins
Example
diff --git a/services/worker/README.md b/services/worker/README.md
index b7722ed173..a1ae1a7555 100644
--- a/services/worker/README.md
+++ b/services/worker/README.md
@@ -116,6 +116,7 @@ The response has three fields: `num_examples`, `statistics`, and `partial`. `par
* `list` - for lists of other data types (including lists)
* `audio` - for audio data
* `image` - for image data
+* `datetime` - for datetime data
`column_statistics` content depends on the feature type, see examples below.
##### class_label
@@ -591,6 +592,59 @@ Shows distribution of image files widths.
+
+##### datetime
+
+Shows the distribution of datetime values.
+
+example:
+
+
+```python
+{
+ "column_name": "date",
+ "column_type": "datetime",
+ "column_statistics": {
+ "nan_count": 0,
+ "nan_proportion": 0.0,
+ "min": "2013-05-18 04:54:11",
+ "max": "2013-06-20 10:01:41",
+ "mean": "2013-05-27 18:03:39",
+ "median": "2013-05-23 11:55:50",
+ "std": "11 days, 4:57:32.322450",
+ "histogram": {
+ "hist": [
+ 318776,
+ 393036,
+ 173904,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 206284
+ ],
+ "bin_edges": [
+ "2013-05-18 04:54:11",
+ "2013-05-21 12:36:57",
+ "2013-05-24 20:19:43",
+ "2013-05-28 04:02:29",
+ "2013-05-31 11:45:15",
+ "2013-06-03 19:28:01",
+ "2013-06-07 03:10:47",
+ "2013-06-10 10:53:33",
+ "2013-06-13 18:36:19",
+ "2013-06-17 02:19:05",
+ "2013-06-20 10:01:41"
+ ]
+ }
+ }
+}
+```
+
+
+
### Splits worker
The `splits` worker does not need any additional configuration.
From 1df95ff729b4eb8153eb9d4fe77b6e012c8c6a45 Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Thu, 16 Jan 2025 15:14:24 +0100
Subject: [PATCH 39/40] fix openapi specs
---
docs/source/openapi.json | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/docs/source/openapi.json b/docs/source/openapi.json
index c51a928009..d971558f42 100644
--- a/docs/source/openapi.json
+++ b/docs/source/openapi.json
@@ -1205,13 +1205,13 @@
"hist": {
"type": "array",
"items": {
- "type": "string"
+ "type": "integer"
}
},
"bin_edges": {
"type": "array",
"items": {
- "type": "number"
+ "type": "string"
}
}
}
@@ -1339,7 +1339,7 @@
},
{
"$ref": "#/components/schemas/BoolStatisticsItem"
- },
+ }
]
},
"StatisticsPerColumnItem": {
@@ -6346,7 +6346,7 @@
}
],
"partial": true
- }
+ }
},
"A split (nyu-mll/glue) with a string (text) column": {
"summary": "Statistics on a string column. The column 'hypothesis' contains more than 30 different strings, so the statistics are a histogram of the string lengths.",
From f9d7a8a7f8db8c89468609029ecc2604a46c8e9e Mon Sep 17 00:00:00 2001
From: polinaeterna
Date: Fri, 17 Jan 2025 12:41:33 +0100
Subject: [PATCH 40/40] fix polars timezone switching
---
services/worker/src/worker/statistics_utils.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 590d9286b9..1c9afe0c24 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -801,13 +801,14 @@ def _compute_statistics(
)
original_timezone = None
if isinstance(data[column_name].dtype, pl.String):
- original_timezone = get_timezone(data[column_name][0])
# let polars identify format itself. provide manually in case of error
try:
+ original_timezone = get_timezone(data[column_name][0])
data = data.with_columns(pl.col(column_name).str.to_datetime())
except pl.ComputeError:
datetime_format = cls.get_format(data, column_name)
data = data.with_columns(pl.col(column_name).str.to_datetime(format=datetime_format))
+ original_timezone = None
min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly
timedelta_column_name = f"{column_name}_timedelta"
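The fix above narrows when the detected timezone is kept: it is only meaningful on the path where polars inferred the datetime format itself, and when parsing falls back to an explicit format string the offset may be consumed differently, so `original_timezone` is reset to `None`. A self-contained sketch of that control flow, where `get_timezone_of_sample` is a hypothetical stand-in for the codebase's `get_timezone` helper and `fallback_format` stands in for `cls.get_format(...)`:

```python
import datetime
import polars as pl


def get_timezone_of_sample(value: str):
    # hypothetical stand-in for get_timezone: parse one sample value and
    # return its tzinfo (None when there is no offset or parsing fails)
    try:
        return datetime.datetime.fromisoformat(value).tzinfo
    except ValueError:
        return None


def parse_string_datetimes(data: pl.DataFrame, column_name: str, fallback_format: str):
    # the detected timezone is only kept when polars infers the format itself;
    # on the explicit-format path it is reset to None, as in the patch above
    original_timezone = None
    try:
        original_timezone = get_timezone_of_sample(data[column_name][0])
        data = data.with_columns(pl.col(column_name).str.to_datetime())
    except pl.ComputeError:
        data = data.with_columns(
            pl.col(column_name).str.to_datetime(format=fallback_format)
        )
        original_timezone = None
    return data, original_timezone
```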