Skip to content

Commit

Permalink
Avoid redundant names/descriptions (#3106)
Browse files: browse the repository at this point in the history
* Avoid redundant names/descriptions

* Update test_croissant_utils
  • Loading branch information
ccl-core authored Dec 3, 2024
1 parent 282364e commit 539b437
Show file tree
Hide file tree
Showing 4 changed files with 1 addition and 25 deletions.
9 changes: 0 additions & 9 deletions libs/libcommon/src/libcommon/croissant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ def feature_to_croissant_field(
return {
"@type": "cr:Field",
"@id": field_name,
"name": field_name,
"description": f"Column '{column}' from the Hugging Face parquet file.",
"dataType": HF_TO_CROISSANT_VALUE_TYPE[feature.dtype],
"source": get_source(distribution_name, column, add_transform, json_path),
}
Expand All @@ -90,18 +88,13 @@ def feature_to_croissant_field(
return {
"@type": "cr:Field",
"@id": field_name,
"name": field_name,
"description": f"Image column '{column}' from the Hugging Face parquet file.",
"dataType": "sc:ImageObject",
"source": source,
}
elif isinstance(feature, ClassLabel):
return {
"@type": "cr:Field",
"@id": field_name,
"name": field_name,
"description": f"ClassLabel column '{column}' from the Hugging Face parquet file.\nLabels:\n"
+ ", ".join(f"{name} ({i})" for i, name in enumerate(feature.names)),
"dataType": "sc:Integer",
"source": get_source(distribution_name, column, add_transform, json_path),
}
Expand All @@ -110,8 +103,6 @@ def feature_to_croissant_field(
return {
"@type": "cr:Field",
"@id": field_name,
"name": field_name,
"description": f"Column '{column}' from the Hugging Face parquet file.",
"subField": [
feature_to_croissant_field(
distribution_name,
Expand Down
6 changes: 0 additions & 6 deletions libs/libcommon/tests/test_croissant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N
{
"@type": "cr:Field",
"@id": "field_name",
"name": "field_name",
"description": "Column 'column_name' from the Hugging Face parquet file.",
"dataType": "sc:Integer",
"source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}},
},
Expand All @@ -49,8 +47,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N
{
"@type": "cr:Field",
"@id": "field_name",
"name": "field_name",
"description": "Column 'column_name' from the Hugging Face parquet file.",
"dataType": "sc:Integer",
"source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}},
"repeated": True,
Expand All @@ -61,8 +57,6 @@ def test_truncate_features_from_croissant_crumbs_response(num_columns: int) -> N
{
"@type": "cr:Field",
"@id": "field_name",
"name": "field_name",
"description": "Column 'column_name' from the Hugging Face parquet file.",
"dataType": "sc:Integer",
"source": {"fileSet": {"@id": "distribution_name"}, "extract": {"column": "column_name"}},
"repeated": True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,6 @@ def get_croissant_crumbs_from_dataset_infos(
{
"@type": "cr:FileSet",
"@id": distribution_name,
"name": distribution_name,
"description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
"containedIn": {"@id": repo_name},
"encodingFormat": "application/x-parquet",
"includes": f"{config}/*/*.parquet",
Expand All @@ -99,8 +97,6 @@ def get_croissant_crumbs_from_dataset_infos(
{
"@type": "cr:Field",
"@id": f"{split_record_set_name}/split_name",
"name": "split_name",
"description": "The name of the split.",
"dataType": "sc:Text",
}
)
Expand All @@ -124,8 +120,6 @@ def get_croissant_crumbs_from_dataset_infos(
{
"@type": "cr:Field",
"@id": f"{record_set_name}/split",
"name": f"{record_set_name}/split",
"description": "Split to which the example belongs to.",
"dataType": "sc:Text",
"source": {
"fileSet": {"@id": distribution_name},
Expand Down Expand Up @@ -160,7 +154,6 @@ def get_croissant_crumbs_from_dataset_infos(
{
"@type": "cr:RecordSet",
"@id": record_set_name,
"name": record_set_name,
"description": description,
"field": fields,
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,6 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None:
assert croissant_crumbs["recordSet"][i]["dataType"] == "cr:Split"
assert croissant_crumbs["recordSet"][i]["key"]["@id"].endswith("name")
assert croissant_crumbs["recordSet"][1]["@type"] == croissant_crumbs["recordSet"][3]["@type"] == "cr:RecordSet"
assert croissant_crumbs["recordSet"][1]["name"] == "record_set_user_squad_with_space"
assert croissant_crumbs["recordSet"][3]["name"] == "record_set_user_squad_with_space_0"
assert isinstance(croissant_crumbs["recordSet"][1]["field"], list)
assert isinstance(squad_info["features"], dict)
assert "skipped column" not in croissant_crumbs["recordSet"][1]["description"]
Expand All @@ -147,7 +145,7 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None:
assert sub_field["source"]["fileSet"]["@id"]
assert "extract" in sub_field["source"]
assert "transform" in sub_field["source"]
if field["description"] == "Split to which the example belongs to.":
if field["@id"].endswith("split"):
assert "regex" in field["source"]["transform"]
assert field["source"]["extract"]["fileProperty"] == "fullpath"
assert field["references"]["field"]["@id"] == croissant_crumbs["recordSet"][i - 1]["field"][0]["@id"]
Expand Down

0 comments on commit 539b437

Please sign in to comment.