Skip to content

Commit

Permalink
fix string to datetime conversion: add format inferring
Browse files Browse the repository at this point in the history
  • Loading branch information
polinaeterna committed Dec 26, 2024
1 parent f9ffe82 commit d2c37c6
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 11 deletions.
44 changes: 44 additions & 0 deletions libs/libcommon/src/libcommon/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,50 @@ def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str
return dt.strftime(format)


def identify_datetime_format(datetime_string: str) -> Optional[str]:
    """
    Identify which of a fixed list of common `strptime` formats parses `datetime_string`.

    Candidate formats are tried in order and the first one that parses the whole
    string is returned. Because of this ordering, an ambiguous value such as
    "01-02-2024" is reported as day-first ("%d-%m-%Y"); the month-first variant is
    only returned when the day-first one fails to parse (e.g. "01-13-2024").

    Args:
        datetime_string (`str`): the string to inspect.

    Returns:
        `Optional[str]`: the first matching format, or `None` if no candidate matches.
    """
    common_formats = (
        # ISO-like, "T" separator
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S.%f",
        # dash-separated dates
        "%Y-%m-%d %H:%M:%S%z",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d",
        "%d-%m-%Y %H:%M:%S%z",
        "%d-%m-%Y %H:%M:%S",
        "%d-%m-%Y %H:%M",
        "%d-%m-%Y",
        "%m-%d-%Y %H:%M:%S%z",
        "%m-%d-%Y %H:%M:%S",
        "%m-%d-%Y %H:%M",
        "%m-%d-%Y",
        # slash-separated dates
        "%Y/%m/%d %H:%M:%S%z",
        "%Y/%m/%d %H:%M:%S",
        "%Y/%m/%d %H:%M",
        "%Y/%m/%d",
        "%d/%m/%Y %H:%M:%S%z",
        "%d/%m/%Y %H:%M:%S",
        "%d/%m/%Y %H:%M",
        "%d/%m/%Y",
        "%m/%d/%Y %H:%M:%S%z",
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M",
        "%m/%d/%Y",
        # dates with a spelled-out month name
        "%B %d, %Y",
        "%d %B %Y",
    )

    for fmt in common_formats:
        try:
            datetime.strptime(datetime_string, fmt)
        except ValueError:
            # This candidate does not describe the string; try the next one.
            continue
        else:
            return fmt

    # No candidate matched: say so explicitly instead of falling off the end.
    return None


def get_duration(started_at: datetime) -> float:
"""
Get time in seconds that has passed from `started_at` until now.
Expand Down
25 changes: 19 additions & 6 deletions services/worker/src/worker/statistics_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from libcommon.exceptions import (
StatisticsComputationError,
)
from libcommon.utils import datetime_to_string, is_datetime
from libcommon.utils import datetime_to_string, identify_datetime_format, is_datetime
from PIL import Image
from tqdm.contrib.concurrent import thread_map

Expand Down Expand Up @@ -478,11 +478,19 @@ def is_class(n_unique: int, n_samples: int) -> bool:
) or n_unique <= NUM_BINS

@staticmethod
def is_datetime(data: pl.DataFrame, column_name: str) -> tuple[bool, Optional[str]]:
    """
    Check if the first 1000 non-null samples in a column match a datetime format.

    Args:
        data (`pl.DataFrame`): dataframe containing the column.
        column_name (`str`): name of the column to inspect.

    Returns:
        `tuple[bool, Optional[str]]`: `(True, format)` if every sampled value is a
        datetime and they all resolve to the same format (`format` is `None` when
        the format could not be identified, or when there are no non-null samples);
        `(False, None)` if at least one sampled value is not a datetime.

    Raises:
        `StatisticsComputationError`: if the sampled values resolve to more than
        one datetime format.
    """
    values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()

    if not values:
        # No non-null samples: `all()` over an empty list is True, so keep reporting
        # the column as datetime, but there is no format to infer. Without this guard
        # the empty format set below would be reported as "multiple formats".
        return True, None

    # `is_datetime` here resolves to the module-level helper, not this static method.
    if not all(is_datetime(value) for value in values):
        return False, None

    formats = {identify_datetime_format(value) for value in values}
    if len(formats) == 1:
        return True, next(iter(formats))
    raise StatisticsComputationError("Multiple datetime formats detected. ")

@classmethod
def compute_transformed_data(
Expand All @@ -504,11 +512,13 @@ def _compute_statistics(
) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
if cls.is_datetime(data, column_name):
_is_datetime, datetime_format = cls.is_datetime(data, column_name)
if _is_datetime:
datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
data.select(pl.col(column_name).cast(pl.Datetime)),
data,
column_name=column_name,
n_samples=n_samples,
format=datetime_format,
)
return datetime_stats

Expand Down Expand Up @@ -765,6 +775,7 @@ def _compute_statistics(
data: pl.DataFrame,
column_name: str,
n_samples: int,
format: Optional[str] = None,
) -> DatetimeStatisticsItem:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
if nan_count == n_samples: # all values are None
Expand All @@ -778,6 +789,8 @@ def _compute_statistics(
std=None,
histogram=None,
)
if isinstance(data[column_name].dtype, pl.String):
data = data.with_columns(pl.col(column_name).str.to_datetime(format=format))

min_date: datetime.datetime = data[column_name].min() # type: ignore # mypy infers type of datetime column .min() incorrectly
timedelta_column_name = f"{column_name}_timedelta"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -353,11 +353,11 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
("descriptive_statistics", None),
("descriptive_statistics_string_text", None),
("descriptive_statistics_string_text_partial", None),
("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
("audio_statistics", None),
("image_statistics", None),
("datetime_statistics", None),
("gated", None),
# ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
# ("audio_statistics", None),
# ("image_statistics", None),
# ("datetime_statistics", None),
# ("gated", None),
],
)
def test_compute(
Expand Down

0 comments on commit d2c37c6

Please sign in to comment.