Stats for datetimes #3007

Open. Wants to merge 52 commits into base: main.
Commits (52)
79790e0
compute stats for datetimes
polinaeterna Jul 31, 2024
12f78cc
Merge branch 'main' into datetime-stats
polinaeterna Jul 31, 2024
851ec1b
fix typing
polinaeterna Jul 31, 2024
3347c13
add testcase
polinaeterna Aug 1, 2024
0340b54
moar tests: column with nulls and all nulls column
polinaeterna Aug 5, 2024
4cd6e0d
Merge branch 'main' into datetime-stats
polinaeterna Aug 5, 2024
434b2d8
add datetime to worker
polinaeterna Aug 8, 2024
2604587
add test
polinaeterna Aug 8, 2024
913f812
include timezone aware
polinaeterna Aug 9, 2024
06c1ae5
Merge branch 'main' into datetime-stats
polinaeterna Aug 12, 2024
7f7ecab
Merge branch 'main' into datetime-stats
polinaeterna Oct 14, 2024
d517393
refactor
polinaeterna Oct 14, 2024
7046d8b
fix
polinaeterna Oct 14, 2024
945dff0
do not typecheck dateutil
polinaeterna Oct 14, 2024
d91d365
Merge branch 'main' into datetime-stats
polinaeterna Dec 20, 2024
bdec2e4
fix
polinaeterna Dec 23, 2024
f9ffe82
more tests
polinaeterna Dec 23, 2024
d2c37c6
fix string to datetime conversion: add format inferring
polinaeterna Dec 26, 2024
658719e
fix style
polinaeterna Dec 26, 2024
5c2d94a
fix check for datetime
polinaeterna Dec 27, 2024
359a30b
minor
polinaeterna Dec 27, 2024
0744e07
mypy
polinaeterna Dec 27, 2024
53e2100
add testcase
polinaeterna Jan 6, 2025
a61108f
Merge branch 'main' into datetime-stats
polinaeterna Jan 7, 2025
c63e70e
Merge branch 'main' into datetime-stats
polinaeterna Jan 8, 2025
70197aa
Merge branch 'datetime-stats' of github.com:huggingface/datasets-serv…
polinaeterna Jan 8, 2025
3df6264
fix?
polinaeterna Jan 8, 2025
812bf36
add example to docs
polinaeterna Jan 8, 2025
c68efb7
fix + add tz string (%Z) to formats
polinaeterna Jan 9, 2025
351ef5c
test for string timezone
polinaeterna Jan 9, 2025
787ad3b
try to debug
polinaeterna Jan 10, 2025
5163500
test identify_datetime_format
polinaeterna Jan 10, 2025
033e29e
test datetime.strptime
polinaeterna Jan 13, 2025
349b651
test
polinaeterna Jan 13, 2025
6c60c27
Update services/worker/src/worker/statistics_utils.py
polinaeterna Jan 15, 2025
db10500
keep original timezone for string dates
polinaeterna Jan 15, 2025
8794b7a
let polars identify datetime format by itself
polinaeterna Jan 15, 2025
e0e7c91
do not display +0000 in timestamps (if timezone is UTC)
polinaeterna Jan 15, 2025
8afade1
remove utils test
polinaeterna Jan 15, 2025
341676c
refactor: identify datetime format manually only when polars failed
polinaeterna Jan 15, 2025
3b5d950
style
polinaeterna Jan 16, 2025
21977db
log formats in error message
polinaeterna Jan 16, 2025
0ee76bf
update openapi specs
polinaeterna Jan 16, 2025
b7fee0b
fallback to string stats if datetime didn't work
polinaeterna Jan 16, 2025
6a76dd9
fix test
polinaeterna Jan 16, 2025
f3eefea
update docs
polinaeterna Jan 16, 2025
a79eb79
Merge branch 'main' into datetime-stats
polinaeterna Jan 16, 2025
1df95ff
fix openapi specs
polinaeterna Jan 16, 2025
2f27846
Merge branch 'main' into datetime-stats
polinaeterna Jan 17, 2025
f9d7a8a
fix polars timezone switching
polinaeterna Jan 17, 2025
720aab9
Merge branch 'main' into datetime-stats
polinaeterna Jan 17, 2025
f84083f
Merge branch 'main' into datetime-stats
polinaeterna Jan 28, 2025
425 changes: 424 additions & 1 deletion docs/source/openapi.json

Large diffs are not rendered by default.

73 changes: 66 additions & 7 deletions docs/source/statistics.md
Original file line number Diff line number Diff line change
@@ -165,7 +165,7 @@ The response JSON contains three keys:

## Response structure by data type

Currently, statistics are supported for strings, float and integer numbers, lists, audio and image data and the special [`datasets.ClassLabel`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.ClassLabel) feature type of the [`datasets`](https://huggingface.co/docs/datasets/) library.
Currently, statistics are supported for strings, float and integer numbers, lists, datetimes, audio and image data and the special [`datasets.ClassLabel`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.ClassLabel) feature type of the [`datasets`](https://huggingface.co/docs/datasets/) library.

`column_type` in response can be one of the following values:

@@ -178,6 +178,7 @@ Currently, statistics are supported for strings, float and integer numbers, list
* `list` - for lists of any other data types (including lists)
* `audio` - for audio data
* `image` - for image data
* `datetime` - for datetime data

### `class_label`

@@ -216,7 +217,7 @@ This type represents categorical data encoded as [`ClassLabel`](https://huggingf

The following measures are returned for float data types:

* minimum, maximum, mean, and standard deviation values
* minimum, maximum, mean, median, and standard deviation values
* number and proportion of `null` and `NaN` values (`NaN` values are treated as `null`)
* histogram with 10 bins

@@ -273,7 +274,7 @@ The following measures are returned for float data types:

The following measures are returned for integer data types:

* minimum, maximum, mean, and standard deviation values
* minimum, maximum, mean, median, and standard deviation values
* number and proportion of `null` values
* histogram with less than or equal to 10 bins

@@ -377,7 +378,7 @@ If the proportion of unique values in a string column within requested split is

If string column does not satisfy the conditions to be treated as a `string_label`, it is considered to be a column containing texts and response contains statistics over text lengths which are calculated by character number. The following measures are computed:

* minimum, maximum, mean, and standard deviation of text lengths
* minimum, maximum, mean, median, and standard deviation of text lengths
* number and proportion of `null` values
* histogram of text lengths with 10 bins

@@ -434,7 +435,7 @@ If string column does not satisfy the conditions to be treated as a `string_labe

For lists, the distribution of their lengths is computed. The following measures are returned:

* minimum, maximum, mean, and standard deviation of lists lengths
* minimum, maximum, mean, median, and standard deviation of lists lengths
* number and proportion of `null` values
* histogram of lists lengths with up to 10 bins

@@ -480,7 +481,7 @@ Note that dictionaries of lists are not supported.

For audio data, the distribution of audio files durations is computed. The following measures are returned:

* minimum, maximum, mean, and standard deviation of audio files durations
* minimum, maximum, mean, median, and standard deviation of audio files durations
* number and proportion of `null` values
* histogram of audio files durations with 10 bins

@@ -539,7 +540,7 @@ For audio data, the distribution of audio files durations is computed. The follo

For image data, the distribution of images widths is computed. The following measures are returned:

* minimum, maximum, mean, and standard deviation of widths of image files
* minimum, maximum, mean, median, and standard deviation of widths of image files
* number and proportion of `null` values
* histogram of images widths with 10 bins

@@ -591,3 +592,61 @@ For image data, the distribution of images widths is computed. The following mea

</p>
</details>

### datetime

The distribution of datetime values is computed. The following measures are returned:

* minimum, maximum, mean, median, and standard deviation of datetimes represented as strings with precision up to seconds
* number and proportion of `null` values
* histogram of datetimes with 10 bins

<details><summary>Example </summary>
<p>

```json
{
"column_name": "date",
"column_type": "datetime",
"column_statistics": {
"nan_count": 0,
"nan_proportion": 0.0,
"min": "2013-05-18 04:54:11",
"max": "2013-06-20 10:01:41",
"mean": "2013-05-27 18:03:39",
"median": "2013-05-23 11:55:50",
"std": "11 days, 4:57:32.322450",
"histogram": {
"hist": [
318776,
393036,
173904,
0,
0,
0,
0,
0,
0,
206284
],
"bin_edges": [
"2013-05-18 04:54:11",
"2013-05-21 12:36:57",
"2013-05-24 20:19:43",
"2013-05-28 04:02:29",
"2013-05-31 11:45:15",
"2013-06-03 19:28:01",
"2013-06-07 03:10:47",
"2013-06-10 10:53:33",
"2013-06-13 18:36:19",
"2013-06-17 02:19:05",
"2013-06-20 10:01:41"
]
}
}
}
```

</p>
</details>
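Since all datetime statistics in the response are serialized as strings with second precision, they can be parsed back with Python's `datetime.strptime`. A small sketch, with the values copied from the example above:

```python
from datetime import datetime

FMT = "%Y-%m-%d %H:%M:%S"  # the second-precision format used in the response

stats = {
    "min": "2013-05-18 04:54:11",
    "max": "2013-06-20 10:01:41",
    "mean": "2013-05-27 18:03:39",
    "median": "2013-05-23 11:55:50",
}

# Parse every statistic back into a datetime object.
parsed = {key: datetime.strptime(value, FMT) for key, value in stats.items()}

# The whole column spans a bit more than a month.
print(parsed["max"] - parsed["min"])  # 33 days, 5:07:30
```

Note that `std` is a duration rather than a point in time, so it is serialized as a timedelta-style string and would need separate handling.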

1 change: 1 addition & 0 deletions libs/libcommon/pyproject.toml
@@ -76,6 +76,7 @@ module = [
"moto.*",
"aiobotocore.*",
"requests.*",
"dateutil.*"
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true
74 changes: 74 additions & 0 deletions libs/libcommon/src/libcommon/utils.py
@@ -15,6 +15,7 @@
import orjson
import pandas as pd
import pytz
from dateutil import parser
from huggingface_hub import constants, hf_hub_download
from requests.exceptions import ReadTimeout

@@ -93,6 +94,79 @@ def get_datetime(days: Optional[float] = None) -> datetime:
return date


def is_datetime(string: str) -> bool:
try:
parser.parse(string)
return True
except ValueError:
return False


def get_timezone(string: str) -> Any:
return parser.parse(string).tzinfo


def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
if dt.utcoffset() == timedelta(0):
format = "%Y-%m-%d %H:%M:%S" # do not display +0000
return dt.strftime(format)


def identify_datetime_format(datetime_string: str) -> Optional[str]:
# Common datetime formats
common_formats = [
"%Y-%m-%dT%H:%M:%S%Z",
"%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%S",
"%Y-%m-%dT%H:%M:%S.%f",
"%Y-%m-%d %H:%M:%S%Z",
"%Y-%m-%d %H:%M:%S%z",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%d %H:%M",
"%Y-%m-%d",
"%d-%m-%Y %H:%M:%S%Z",
"%d-%m-%Y %H:%M:%S%z",
"%d-%m-%Y %H:%M:%S",
"%d-%m-%Y %H:%M",
"%d-%m-%Y",
"%m-%d-%Y %H:%M:%S%Z",
"%m-%d-%Y %H:%M:%S%z",
"%m-%d-%Y %H:%M:%S",
"%m-%d-%Y %H:%M",
"%m-%d-%Y",
"%Y/%m/%d %H:%M:%S%Z",
"%Y/%m/%d %H:%M:%S%z",
"%Y/%m/%d %H:%M:%S",
"%Y/%m/%d %H:%M",
"%Y/%m/%d",
"%d/%m/%Y %H:%M:%S%Z",
"%d/%m/%Y %H:%M:%S%z",
"%d/%m/%Y %H:%M:%S",
"%d/%m/%Y %H:%M",
"%d/%m/%Y",
"%m/%d/%Y %H:%M:%S%Z",
"%m/%d/%Y %H:%M:%S%z",
"%m/%d/%Y %H:%M:%S",
"%m/%d/%Y %H:%M",
"%m/%d/%Y",
"%B %d, %Y",
"%d %B %Y",
"%m-%Y",
"%Y-%m",
"%m/%Y",
"%Y/%m",
"%Y",
]

for fmt in common_formats:
try:
_ = datetime.strptime(datetime_string, fmt)
return fmt
except ValueError:
continue
return None


def get_duration(started_at: datetime) -> float:
"""
Get time in seconds that has passed from `started_at` until now.
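The format list in `identify_datetime_format` is tried in order, so ordering doubles as a tie-breaker for ambiguous dates (e.g. `%d-%m-%Y` is tried before `%m-%d-%Y`). A standalone sketch of the same first-match loop, using a trimmed, hypothetical candidate list rather than the full one from `libcommon`:

```python
from datetime import datetime
from typing import Optional


def first_matching_format(value: str, formats: list[str]) -> Optional[str]:
    """Return the first strptime format that parses `value` in full, else None."""
    for fmt in formats:
        try:
            datetime.strptime(value, fmt)
            return fmt
        except ValueError:
            continue
    return None


CANDIDATES = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y"]

print(first_matching_format("2024-01-31 10:15:00", CANDIDATES))  # %Y-%m-%d %H:%M:%S
print(first_matching_format("01/02/2024", CANDIDATES))           # %d/%m/%Y, not %m/%d/%Y
print(first_matching_format("not a date", CANDIDATES))           # None
```

The second call shows the ambiguity: `01/02/2024` parses under both slash formats, and the day-first format wins only because it comes first in the list.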
54 changes: 54 additions & 0 deletions services/worker/README.md
@@ -116,6 +116,7 @@ The response has three fields: `num_examples`, `statistics`, and `partial`. `par
* `list` - for lists of other data types (including lists)
* `audio` - for audio data
* `image` - for image data
* `datetime` - for datetime data

`column_statistics` content depends on the feature type, see examples below.
##### class_label
@@ -591,6 +592,59 @@ Shows distribution of image files widths.
</p>
</details>


##### datetime

Shows distribution of datetimes.

<details><summary>example: </summary>
<p>

```python
{
"column_name": "date",
"column_type": "datetime",
"column_statistics": {
"nan_count": 0,
"nan_proportion": 0.0,
"min": "2013-05-18 04:54:11",
"max": "2013-06-20 10:01:41",
"mean": "2013-05-27 18:03:39",
"median": "2013-05-23 11:55:50",
"std": "11 days, 4:57:32.322450",
"histogram": {
"hist": [
318776,
393036,
173904,
0,
0,
0,
0,
0,
0,
206284
],
"bin_edges": [
"2013-05-18 04:54:11",
"2013-05-21 12:36:57",
"2013-05-24 20:19:43",
"2013-05-28 04:02:29",
"2013-05-31 11:45:15",
"2013-06-03 19:28:01",
"2013-06-07 03:10:47",
"2013-06-10 10:53:33",
"2013-06-13 18:36:19",
"2013-06-17 02:19:05",
"2013-06-20 10:01:41"
]
}
}
}
```
</p>
</details>
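Aggregates like `mean` and `std` are not defined directly on datetimes; one way to picture what is being computed (a sketch only, not the worker's actual polars-based code) is to aggregate Unix timestamps and convert the result back:

```python
import statistics
from datetime import datetime, timedelta, timezone

dates = [
    datetime(2013, 5, 18, 4, 54, 11, tzinfo=timezone.utc),
    datetime(2013, 5, 23, 11, 55, 50, tzinfo=timezone.utc),
    datetime(2013, 6, 20, 10, 1, 41, tzinfo=timezone.utc),
]

# Aggregate in timestamp space, then map the mean back to a datetime.
timestamps = [d.timestamp() for d in dates]
mean_dt = datetime.fromtimestamp(statistics.mean(timestamps), tz=timezone.utc)
std = timedelta(seconds=statistics.pstdev(timestamps))  # std stays a duration

print(mean_dt.strftime("%Y-%m-%d %H:%M:%S"))
print(std)
```

This also explains why `std` in the response is a duration string while every other statistic is a point in time.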

### Splits worker

The `splits` worker does not need any additional configuration.
@@ -39,6 +39,7 @@
AudioColumn,
BoolColumn,
ClassLabelColumn,
DatetimeColumn,
FloatColumn,
ImageColumn,
IntColumn,
@@ -57,7 +58,15 @@ class SplitDescriptiveStatisticsResponse(TypedDict):


SupportedColumns = Union[
ClassLabelColumn, IntColumn, FloatColumn, StringColumn, BoolColumn, ListColumn, AudioColumn, ImageColumn
ClassLabelColumn,
IntColumn,
FloatColumn,
StringColumn,
BoolColumn,
ListColumn,
AudioColumn,
ImageColumn,
DatetimeColumn,
]


@@ -215,29 +224,34 @@ def _column_from_feature(
return ListColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if isinstance(dataset_feature, dict):
if dataset_feature.get("_type") == "ClassLabel":
_type = dataset_feature.get("_type")
if _type == "ClassLabel":
return ClassLabelColumn(
feature_name=dataset_feature_name, n_samples=num_examples, feature_dict=dataset_feature
)

if dataset_feature.get("_type") == "Audio":
if _type == "Audio":
return AudioColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Image":
if _type == "Image":
return ImageColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Value":
if dataset_feature.get("dtype") in INTEGER_DTYPES:
if _type == "Value":
dtype = dataset_feature.get("dtype", "")
if dtype in INTEGER_DTYPES:
return IntColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in FLOAT_DTYPES:
if dtype in FLOAT_DTYPES:
return FloatColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in STRING_DTYPES:
if dtype in STRING_DTYPES:
return StringColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") == "bool":
if dtype == "bool":
return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dtype.startswith("timestamp"):
return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
return None

columns: list[SupportedColumns] = []
Expand All @@ -249,7 +263,7 @@ def _column_from_feature(
if not columns:
raise NoSupportedFeaturesError(
"No columns for statistics computation found. Currently supported feature types are: "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, list/Sequence and bool. "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, Image, Audio, list/Sequence, datetime and bool. "
)

column_names_str = ", ".join([column.name for column in columns])
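The new `dtype.startswith("timestamp")` branch matches the Arrow-style dtype strings used by `datasets` `Value` features, which encode a time unit and an optional timezone. A minimal sketch of how the prefix check classifies such strings (the sample dtypes are illustrative, not taken from a real dataset):

```python
# Arrow-style dtype strings as they appear in a `Value` feature's "dtype" field.
# Unit ("s", "ms", "us", "ns") and the optional tz suffix vary per dataset.
DTYPES = ["int64", "timestamp[s]", "timestamp[ms]", "timestamp[us, tz=UTC]", "string"]

# The same prefix check the dispatcher uses to route to DatetimeColumn.
datetime_dtypes = [d for d in DTYPES if d.startswith("timestamp")]
print(datetime_dtypes)  # ['timestamp[s]', 'timestamp[ms]', 'timestamp[us, tz=UTC]']
```

The prefix check keeps the dispatcher agnostic to unit and timezone, at the cost of accepting any future `timestamp[...]` variant as well.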