Avoid redundant names/descriptions (#3106)

* Avoid redundant names/descriptions
* Update test_croissant_utils
huggingface · Dec 3, 2024 · 539b437
1 parent 282364e
commit 539b437
Show file tree

Hide file tree

Showing 4 changed files with 1 addition and 25 deletions.
diff --git a/libs/libcommon/src/libcommon/croissant_utils.py b/libs/libcommon/src/libcommon/croissant_utils.py
@@ -76,8 +76,6 @@ def feature_to_croissant_field(
         return {
             "@type": "cr:Field",
             "@id": field_name,
-            "name": field_name,
-            "description": f"Column '{column}' from the Hugging Face parquet file.",
             "dataType": HF_TO_CROISSANT_VALUE_TYPE[feature.dtype],
             "source": get_source(distribution_name, column, add_transform, json_path),
         }
@@ -90,18 +88,13 @@ def feature_to_croissant_field(
         return {
             "@type": "cr:Field",
             "@id": field_name,
-            "name": field_name,
-            "description": f"Image column '{column}' from the Hugging Face parquet file.",
             "dataType": "sc:ImageObject",
             "source": source,
         }
     elif isinstance(feature, ClassLabel):
         return {
             "@type": "cr:Field",
             "@id": field_name,
-            "name": field_name,
-            "description": f"ClassLabel column '{column}' from the Hugging Face parquet file.\nLabels:\n"
-            + ", ".join(f"{name} ({i})" for i, name in enumerate(feature.names)),
             "dataType": "sc:Integer",
             "source": get_source(distribution_name, column, add_transform, json_path),
         }
@@ -110,8 +103,6 @@ def feature_to_croissant_field(
         return {
             "@type": "cr:Field",
             "@id": field_name,
-            "name": field_name,
-            "description": f"Column '{column}' from the Hugging Face parquet file.",
             "subField": [
                 feature_to_croissant_field(
                     distribution_name,

diff --git a/libs/libcommon/tests/test_croissant_utils.py b/libs/libcommon/tests/test_croissant_utils.py
@@ -38,8 +38,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N
             {
                 "@type": "cr:Field",
                 "@id": "field_name",
-                "name": "field_name",
-                "description": "Column 'column_name' from the Hugging Face parquet file.",
                 "dataType": "sc:Integer",
                 "source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}},
             },
@@ -49,8 +47,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N
             {
                 "@type": "cr:Field",
                 "@id": "field_name",
-                "name": "field_name",
-                "description": "Column 'column_name' from the Hugging Face parquet file.",
                 "dataType": "sc:Integer",
                 "source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}},
                 "repeated": True,
@@ -61,8 +57,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N
             {
                 "@type": "cr:Field",
                 "@id": "field_name",
-                "name": "field_name",
-                "description": "Column 'column_name' from the Hugging Face parquet file.",
                 "dataType": "sc:Integer",
                 "source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}},
                 "repeated": True,

diff --git a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py
@@ -82,8 +82,6 @@ def get_croissant_crumbs_from_dataset_infos(
                 {
                     "@type": "cr:FileSet",
                     "@id": distribution_name,
-                    "name": distribution_name,
-                    "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
                     "containedIn": {"@id": repo_name},
                     "encodingFormat": "application/x-parquet",
                     "includes": f"{config}/*/*.parquet",
@@ -99,8 +97,6 @@ def get_croissant_crumbs_from_dataset_infos(
             {
                 "@type": "cr:Field",
                 "@id": f"{split_record_set_name}/split_name",
-                "name": "split_name",
-                "description": "The name of the split.",
                 "dataType": "sc:Text",
             }
         )
@@ -124,8 +120,6 @@ def get_croissant_crumbs_from_dataset_infos(
             {
                 "@type": "cr:Field",
                 "@id": f"{record_set_name}/split",
-                "name": f"{record_set_name}/split",
-                "description": "Split to which the example belongs to.",
                 "dataType": "sc:Text",
                 "source": {
                     "fileSet": {"@id": distribution_name},
@@ -160,7 +154,6 @@ def get_croissant_crumbs_from_dataset_infos(
                 {
                     "@type": "cr:RecordSet",
                     "@id": record_set_name,
-                    "name": record_set_name,
                     "description": description,
                     "field": fields,
                 }

diff --git a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py
@@ -124,8 +124,6 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None:
         assert croissant_crumbs["recordSet"][i]["dataType"] == "cr:Split"
         assert croissant_crumbs["recordSet"][i]["key"]["@id"].endswith("name")
     assert croissant_crumbs["recordSet"][1]["@type"] == croissant_crumbs["recordSet"][3]["@type"] == "cr:RecordSet"
-    assert croissant_crumbs["recordSet"][1]["name"] == "record_set_user_squad_with_space"
-    assert croissant_crumbs["recordSet"][3]["name"] == "record_set_user_squad_with_space_0"
     assert isinstance(croissant_crumbs["recordSet"][1]["field"], list)
     assert isinstance(squad_info["features"], dict)
     assert "skipped column" not in croissant_crumbs["recordSet"][1]["description"]
@@ -147,7 +145,7 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None:
                     assert sub_field["source"]["fileSet"]["@id"]
                     assert "extract" in sub_field["source"]
                     assert "transform" in sub_field["source"]
-            if field["description"] == "Split to which the example belongs to.":
+            if field["@id"].endswith("split"):
                 assert "regex" in field["source"]["transform"]
                 assert field["source"]["extract"]["fileProperty"] == "fullpath"
                 assert field["references"]["field"]["@id"] == croissant_crumbs["recordSet"][i - 1]["field"][0]["@id"]