diff --git a/assets/compile_stats.py b/assets/compile_stats.py
index 75b4cca..cb2df46 100644
--- a/assets/compile_stats.py
+++ b/assets/compile_stats.py
@@ -186,13 +186,15 @@ def to_name_subset(name):
 
 
 def to_language_name_subset(name, subset=None):
+    _languages = ["fr", "en", "de", "es", "it"]
     if subset is None:
         name, subset = to_name_subset(name)
-    for lan in "fr", "en", "de", "es", "it":
+    for lan in _languages:
         subset2 = subset.rstrip(":.0123456789")
         if subset.startswith(lan) and (len(subset) == len(lan) or subset[len(lan)] in ".:-"):
-            if "-" in subset and len(subset2) == 5:
-                subset = subset2
+            if len(subset2) >= 5 and subset[len(lan)] in ".:-" and subset2[3:5] in _languages:
+                # multi-lingual
+                subset = subset2[:5]
             lan = subset
             subset = subset[len(lan) :].strip(":.")
             subset = subset.strip(":_")
diff --git a/assets/hugging_face/README_dataset_header.yaml b/assets/hugging_face/README_dataset_header.yaml
index df81b44..d3d429b 100644
--- a/assets/hugging_face/README_dataset_header.yaml
+++ b/assets/hugging_face/README_dataset_header.yaml
@@ -1,49 +1,403 @@
 pretty_name: Lucie Training Dataset
 license: cc-by-nc-sa-4.0
 language:
-  - en
-  - fr
-  - de
-  - es
-  - it
-  - code
+- en
+- fr
+- de
+- es
+- it
+- code
 multilinguality:
-  - multilingual
+- multilingual
 task_categories:
-  - text-generation
-  - text2text-generation
+- text-generation
+- text2text-generation
 task_ids:
-  - language-modeling
+- language-modeling
 tags:
-  - text-generation
-  - conditional-text-generation
+- text-generation
+- conditional-text-generation
+size_categories:
+- n>1T
 viewer: true
 configs:
-  - config_name: default
-    data_files:
-      - split: train
-        path: data/*/*/*parquet
-  - config_name: en
-    data_files:
-      - split: train
-        path: data/*/en/*parquet
-  - config_name: fr
-    data_files:
-      - split: train
-        path: data/*/fr/*parquet
-  - config_name: de
-    data_files:
-      - split: train
-        path: data/*/de/*parquet
-  - config_name: es
-    data_files:
-      - split: train
-        path: data/*/es/*parquet
-  - config_name: it
-    data_files:
-      - split: train
-        path: data/*/it/*parquet
-  - config_name: code
-    data_files:
-      - split: train
-        path: data/*/code/*parquet
+- config_name: default
+  data_files:
+  - path: data/*/*/*/*parquet
+    split: train
+- config_name: en
+  data_files:
+  - path: data/natural/en/*/*parquet
+    split: train
+- config_name: fr
+  data_files:
+  - path: data/natural/fr/*/*parquet
+    split: train
+- config_name: de
+  data_files:
+  - path: data/natural/de/*/*parquet
+    split: train
+- config_name: es
+  data_files:
+  - path: data/natural/es/*/*parquet
+    split: train
+- config_name: it
+  data_files:
+  - path: data/natural/it/*/*parquet
+    split: train
+- config_name: de,fr
+  data_files:
+  - path: data/natural/de-fr/*/*.parquet
+    split: train
+- config_name: es,en
+  data_files:
+  - path: data/natural/es-en/*/*.parquet
+    split: train
+- config_name: fr,en
+  data_files:
+  - path: data/natural/fr-en/*/*.parquet
+    split: train
+- config_name: it,en
+  data_files:
+  - path: data/natural/it-en/*/*.parquet
+    split: train
+- config_name: code
+  data_files:
+  - path: data/code/*/*/*parquet
+    split: train
+- config_name: code-assembly
+  data_files:
+  - path: data/code/assembly/*/*.parquet
+    split: train
+- config_name: code-c
+  data_files:
+  - path: data/code/c/*/*.parquet
+    split: train
+- config_name: code-c#
+  data_files:
+  - path: data/code/c#/*/*.parquet
+    split: train
+- config_name: code-c++
+  data_files:
+  - path: data/code/c++/*/*.parquet
+    split: train
+- config_name: code-clojure
+  data_files:
+  - path: data/code/clojure/*/*.parquet
+    split: train
+- config_name: code-dart
+  data_files:
+  - path: data/code/dart/*/*.parquet
+    split: train
+- config_name: code-elixir
+  data_files:
+  - path: data/code/elixir/*/*.parquet
+    split: train
+- config_name: code-erlang
+  data_files:
+  - path: data/code/erlang/*/*.parquet
+    split: train
+- config_name: code-fortran
+  data_files:
+  - path: data/code/fortran/*/*.parquet
+    split: train
+- config_name: code-go
+  data_files:
+  - path: data/code/go/*/*.parquet
+    split: train
+- config_name: code-haskell
+  data_files:
+  - path: data/code/haskell/*/*.parquet
+    split: train
+- config_name: code-java
+  data_files:
+  - path: data/code/java/*/*.parquet
+    split: train
+- config_name: code-javascript
+  data_files:
+  - path: data/code/javascript/*/*.parquet
+    split: train
+- config_name: code-julia
+  data_files:
+  - path: data/code/julia/*/*.parquet
+    split: train
+- config_name: code-kotlin
+  data_files:
+  - path: data/code/kotlin/*/*.parquet
+    split: train
+- config_name: code-lua
+  data_files:
+  - path: data/code/lua/*/*.parquet
+    split: train
+- config_name: code-mathematica
+  data_files:
+  - path: data/code/mathematica/*/*.parquet
+    split: train
+- config_name: code-matlab
+  data_files:
+  - path: data/code/matlab/*/*.parquet
+    split: train
+- config_name: code-ocaml
+  data_files:
+  - path: data/code/ocaml/*/*.parquet
+    split: train
+- config_name: code-perl
+  data_files:
+  - path: data/code/perl/*/*.parquet
+    split: train
+- config_name: code-php
+  data_files:
+  - path: data/code/php/*/*.parquet
+    split: train
+- config_name: code-python
+  data_files:
+  - path: data/code/python/*/*.parquet
+    split: train
+- config_name: code-r
+  data_files:
+  - path: data/code/r/*/*.parquet
+    split: train
+- config_name: code-racket
+  data_files:
+  - path: data/code/racket/*/*.parquet
+    split: train
+- config_name: code-ruby
+  data_files:
+  - path: data/code/ruby/*/*.parquet
+    split: train
+- config_name: code-rust
+  data_files:
+  - path: data/code/rust/*/*.parquet
+    split: train
+- config_name: code-scala
+  data_files:
+  - path: data/code/scala/*/*.parquet
+    split: train
+- config_name: code-swift
+  data_files:
+  - path: data/code/swift/*/*.parquet
+    split: train
+- config_name: code-tex
+  data_files:
+  - path: data/code/tex/*/*.parquet
+    split: train
+- config_name: code-typescript
+  data_files:
+  - path: data/code/typescript/*/*.parquet
+    split: train
+- config_name: AmendementsParlement
+  data_files:
+  - path: data/natural/*/AmendementsParlement/*.parquet
+    split: train
+- config_name: AmericanStories
+  data_files:
+  - path: data/natural/*/AmericanStories/*.parquet
+    split: train
+- config_name: Claire
+  data_files:
+  - path: data/natural/*/Claire/*.parquet
+    split: train
+- config_name: Claire-en
+  data_files:
+  - path: data/natural/en/Claire/*.parquet
+    split: train
+- config_name: Claire-fr
+  data_files:
+  - path: data/natural/fr/Claire/*.parquet
+    split: train
+- config_name: CroissantAligned
+  data_files:
+  - path: data/natural/*/CroissantAligned/*.parquet
+    split: train
+- config_name: DiscoursPublics
+  data_files:
+  - path: data/natural/*/DiscoursPublics/*.parquet
+    split: train
+- config_name: Europarl
+  data_files:
+  - path: data/natural/*/Europarl/*.parquet
+    split: train
+- config_name: Europarl-de
+  data_files:
+  - path: data/natural/de/Europarl/*.parquet
+    split: train
+- config_name: Europarl-en
+  data_files:
+  - path: data/natural/en/Europarl/*.parquet
+    split: train
+- config_name: Europarl-es
+  data_files:
+  - path: data/natural/es/Europarl/*.parquet
+    split: train
+- config_name: Europarl-fr
+  data_files:
+  - path: data/natural/fr/Europarl/*.parquet
+    split: train
+- config_name: EuroparlAligned
+  data_files:
+  - path: data/natural/*/EuroparlAligned/*.parquet
+    split: train
+- config_name: EuroparlAligned-de,fr
+  data_files:
+  - path: data/natural/de-fr/EuroparlAligned/*.parquet
+    split: train
+- config_name: EuroparlAligned-es,en
+  data_files:
+  - path: data/natural/es-en/EuroparlAligned/*.parquet
+    split: train
+- config_name: EuroparlAligned-fr,en
+  data_files:
+  - path: data/natural/fr-en/EuroparlAligned/*.parquet
+    split: train
+- config_name: EuroparlAligned-it,en
+  data_files:
+  - path: data/natural/it-en/EuroparlAligned/*.parquet
+    split: train
+- config_name: Eurovoc
+  data_files:
+  - path: data/natural/*/Eurovoc/*.parquet
+    split: train
+- config_name: Eurovoc-de
+  data_files:
+  - path: data/natural/de/Eurovoc/*.parquet
+    split: train
+- config_name: Eurovoc-en
+  data_files:
+  - path: data/natural/en/Eurovoc/*.parquet
+    split: train
+- config_name: Eurovoc-es
+  data_files:
+  - path: data/natural/es/Eurovoc/*.parquet
+    split: train
+- config_name: Eurovoc-it
+  data_files:
+  - path: data/natural/it/Eurovoc/*.parquet
+    split: train
+- config_name: FineWebEdu
+  data_files:
+  - path: data/natural/*/FineWebEdu/*.parquet
+    split: train
+- config_name: GallicaMonographies
+  data_files:
+  - path: data/natural/*/GallicaMonographies/*.parquet
+    split: train
+- config_name: GallicaPress
+  data_files:
+  - path: data/natural/*/GallicaPress/*.parquet
+    split: train
+- config_name: Gutenberg
+  data_files:
+  - path: data/natural/*/Gutenberg/*.parquet
+    split: train
+- config_name: Gutenberg-de
+  data_files:
+  - path: data/natural/de/Gutenberg/*.parquet
+    split: train
+- config_name: Gutenberg-en
+  data_files:
+  - path: data/natural/en/Gutenberg/*.parquet
+    split: train
+- config_name: Gutenberg-es
+  data_files:
+  - path: data/natural/es/Gutenberg/*.parquet
+    split: train
+- config_name: Gutenberg-fr
+  data_files:
+  - path: data/natural/fr/Gutenberg/*.parquet
+    split: train
+- config_name: Gutenberg-it
+  data_files:
+  - path: data/natural/it/Gutenberg/*.parquet
+    split: train
+- config_name: HAL
+  data_files:
+  - path: data/natural/*/HAL/*.parquet
+    split: train
+- config_name: MathPile
+  data_files:
+  - path: data/natural/*/MathPile/*.parquet
+    split: train
+- config_name: OpenData
+  data_files:
+  - path: data/natural/*/OpenData/*.parquet
+    split: train
+- config_name: OpenEdition
+  data_files:
+  - path: data/natural/*/OpenEdition/*.parquet
+    split: train
+- config_name: PeS2o
+  data_files:
+  - path: data/natural/*/PeS2o/*.parquet
+    split: train
+- config_name: Persee
+  data_files:
+  - path: data/natural/*/Persee/*.parquet
+    split: train
+- config_name: RedPajama
+  data_files:
+  - path: data/natural/*/RedPajama/*.parquet
+    split: train
+- config_name: RedPajama-de
+  data_files:
+  - path: data/natural/de/RedPajama/*.parquet
+    split: train
+- config_name: RedPajama-es
+  data_files:
+  - path: data/natural/es/RedPajama/*.parquet
+    split: train
+- config_name: RedPajama-fr
+  data_files:
+  - path: data/natural/fr/RedPajama/*.parquet
+    split: train
+- config_name: RedPajama-it
+  data_files:
+  - path: data/natural/it/RedPajama/*.parquet
+    split: train
+- config_name: Stac
+  data_files:
+  - path: data/natural/*/Stac/*.parquet
+    split: train
+- config_name: TheStack
+  data_files:
+  - path: data/code/*/TheStack/*.parquet
+    split: train
+- config_name: Theses
+  data_files:
+  - path: data/natural/*/Theses/*.parquet
+    split: train
+- config_name: Wikipedia
+  data_files:
+  - path: data/natural/*/Wikipedia/*.parquet
+    split: train
+- config_name: Wikipedia-de
+  data_files:
+  - path: data/natural/de/Wikipedia/*.parquet
+    split: train
+- config_name: Wikipedia-en
+  data_files:
+  - path: data/natural/en/Wikipedia/*.parquet
+    split: train
+- config_name: Wikipedia-es
+  data_files:
+  - path: data/natural/es/Wikipedia/*.parquet
+    split: train
+- config_name: Wikipedia-fr
+  data_files:
+  - path: data/natural/fr/Wikipedia/*.parquet
+    split: train
+- config_name: Wikipedia-it
+  data_files:
+  - path: data/natural/it/Wikipedia/*.parquet
+    split: train
+- config_name: Wikisource
+  data_files:
+  - path: data/natural/*/Wikisource/*.parquet
+    split: train
+- config_name: Wiktionary
+  data_files:
+  - path: data/natural/*/Wiktionary/*.parquet
+    split: train
+- config_name: YouTube
+  data_files:
+  - path: data/natural/*/YouTube/*.parquet
+    split: train
diff --git a/assets/hugging_face/hf_upload_dataset.py b/assets/hugging_face/hf_upload_dataset.py
index 97bf7f9..4a6c13f 100644
--- a/assets/hugging_face/hf_upload_dataset.py
+++ b/assets/hugging_face/hf_upload_dataset.py
@@ -36,17 +36,60 @@ def dump_dataset_config():
+    # Trick to dump OrderedDict
     def represent_ordereddict(dumper, data):
         return dumper.represent_dict(data.items())
 
     yaml.add_representer(OrderedDict, represent_ordereddict, Dumper=yaml.SafeDumper)
+
+    # TODO: reload the original file and merge it (in case it was modified in the meantime)?
+
     with open(_readme_header_file, "w") as f:
         yaml.dump(OrderedDict(_dataset_header), f, Dumper=yaml.SafeDumper, default_flow_style=False)
 
 
+def sort_config_key(name):
+    """
+    Return a tuple where the name is prefixed by an index indicating the order (among several configs).
+
+    Example use:
+        sorted(..., key=lambda config: sort_config_key(config["config_name"]))
+
+        # 1. (0) default
+        # 2. (1 ... N) en, fr, de, ...  # natural languages
+        # 3. (N + 1) multi-lingual
+        # 4. (N + 2) code
+        # 5. (N + 3) python, c++, ...  # programming languages
+        # 6. (N + 4) individual subsets
+    """
+    _languages = ["en", "fr", "de", "es", "it"]
+    N = len(_languages)
+    if any(name.startswith(f"{language},") for language in _languages):
+        # Bilingual settings
+        order_idx = N + 1
+    elif name.startswith("code-"):
+        # Programming languages
+        order_idx = N + 3
+        name = name.split("-")[1]
+    else:
+        order_idx = (
+            {lang: i + 1 for i, lang in enumerate(_languages)}
+            | {
+                "default": 0,
+                "code": N + 2,
+            }
+        ).get(name, N + 4)  # individual subsets will come at the end
+
+    return (order_idx, name)
+
+
 def to_language(name, **kwargs):
-    lan, _, __ = to_language_name_subset(name, **kwargs)
-    return lan
+    lan, _, sub = to_language_name_subset(name, **kwargs)
+    lan_type = "natural"
+    if lan == "code":
+        lan_type = "code"  # "programming"
+        lan = sub
+    return lan_type, lan
 
 
 def to_source_and_id_func(name, **kwargs):
@@ -221,6 +264,9 @@ def get_union(metadatas, desc=None):
     parser.add_argument(
         "--clean", default=False, action="store_true", help="Clean the parquet after they have been uploaded"
     )
+    parser.add_argument(
+        "--update_each", type=int, default=20, help="Upload every N parquet files (to avoid too frequent uploads)"
+    )
     args = parser.parse_args()
 
     metadata_fields = {
@@ -258,6 +304,7 @@ def get_union(metadatas, desc=None):
         (os.path.splitext(args.collect_metadata)[0] + "_types.json") if args.collect_metadata else None
     )
     do_upload = args.repository and not args.collect_metadata
+    must_update_readme = False
 
     if args.collect_metadata:
         if os.path.exists(args.collect_metadata):
@@ -287,49 +334,75 @@ def dump_metadata(metadatas, examples):
     progress_bar = tqdm.tqdm(all_datas.items())
     previous_pseudo = None
     hf_api = None
-    for dataset_name, dataset in progress_bar:
-        dataset_pseudo = dataset_name.split("-")[0]
-        if previous_pseudo != dataset_pseudo:
-            previous_pseudo = dataset_pseudo
-            random.seed(1234)  # Hack. Try to reproduce same randomness as when tokenizing
-
-        # To yield dictionaries with metadata instead of just the text
-        language = to_language(dataset_name)
-        source, source_pseudo, update_dict_func = to_source_and_id_func(dataset_name)
-        assert source_pseudo and language and dataset_name
-        path_in_repo = f"{source_pseudo}/{language}/{dataset_name}.parquet"
-
-        multi_lingual_configs[source_pseudo] = multi_lingual_configs.get(source_pseudo, {}) | {
-            language: {
-                "config_name": f"{source_pseudo}/{language}",
-                "data_files": [{"split": "train", "path": f"data/{source_pseudo}/{language}/*.parquet"}],
-            }
-        }
-
-        config_names = [c["config_name"] for c in _dataset_header["configs"]]
-        for config in [
-            {
-                "config_name": source_pseudo,
-                "data_files": [{"split": "train", "path": f"data/{source_pseudo}/*/*.parquet"}],
-            },
-            {"config_name": language, "data_files": [{"split": "train", "path": f"data/*/{language}/*.parquet"}]},
-        ] + (
-            multi_lingual_configs.get(source_pseudo).values() if len(multi_lingual_configs[source_pseudo]) > 1 else []
-        ):
-            if config["config_name"] not in config_names:
-                _dataset_header["configs"].append(config)
+    parquet_finished, parquet_filename = True, None
+    parquet_files_created = []
+    lock_files = []
+    try:
+        for dataset_name, dataset in progress_bar:
+            progress_bar.set_description(f"Processing {dataset_name}...")
+            dataset_pseudo = dataset_name.split("-")[0]
+            if previous_pseudo != dataset_pseudo:
+                previous_pseudo = dataset_pseudo
+                random.seed(1234)  # Hack. Try to reproduce same randomness as when tokenizing
+
+            # To yield dictionaries with metadata instead of just the text
+            language_category, language = to_language(dataset_name)
+            source, source_pseudo, update_dict_func = to_source_and_id_func(dataset_name)
+            assert (
+                source_pseudo and language_category and language and dataset_name
+            ), f"{source=} -- {source_pseudo=} -- {language=} -- {language_category=} -- {dataset_name=}"
+            path_in_repo = f"data/{language_category}/{language}/{source_pseudo}/{dataset_name}.parquet"
+
+            language_configname = language.replace("-", ",")  # fr-en -> fr,en
+
+            if source != "TheStack":  # This one is already under code-*
+                multi_lingual_configs[source] = multi_lingual_configs.get(source, {}) | {
+                    language: {
+                        "config_name": f"{source}-{language_configname}",
+                        "data_files": [
+                            {"split": "train", "path": f"data/{language_category}/{language}/{source_pseudo}/*.parquet"}
+                        ],
+                    }
+                }
+
+            config_names = [c["config_name"] for c in _dataset_header["configs"]]
+            for config in [
+                {
+                    "config_name": source,
+                    "data_files": [{"split": "train", "path": f"data/{language_category}/*/{source_pseudo}/*.parquet"}],
+                },
+                {
+                    "config_name": language_configname
+                    if (language_category == "natural")
+                    else f"{language_category}-{language_configname}",
+                    "data_files": [{"split": "train", "path": f"data/{language_category}/{language}/*/*.parquet"}],
+                },
+            ] + (
+                list(multi_lingual_configs.get(source).values())
+                if len(multi_lingual_configs.get(source, [])) > 1
+                else []
+            ):
+                if config["config_name"] not in config_names:
+                    _dataset_header["configs"].append(config)
+                    _dataset_header["configs"] = sorted(
+                        _dataset_header["configs"], key=lambda x: sort_config_key(x["config_name"])
+                    )
+                    must_update_readme = True
+
+            if must_update_readme:
                 dump_dataset_config()
 
-        parquet_filename = os.path.join(args.folder, path_in_repo)
-        os.makedirs(os.path.dirname(parquet_filename), exist_ok=True)
+            parquet_filename = os.path.join(args.folder, path_in_repo)
+            os.makedirs(os.path.dirname(parquet_filename), exist_ok=True)
 
-        lock_file = parquet_filename + ".lock"
-        if os.path.exists(lock_file):
-            continue
-        with open(lock_file, "w") as f:
-            f.write("lock")
+            lock_file = parquet_filename + ".lock"
+            if do_upload:
+                if os.path.exists(lock_file):
+                    continue
+                with open(lock_file, "w") as f:
+                    f.write("lock")
+                lock_files.append(lock_file)
 
-        try:
             dataset.SetYieldMetadata(
                 uniformize_metadata=args.uniformize_metadata,
                 extra_metadata=dict(
@@ -339,10 +412,6 @@ def dump_metadata(metadatas, examples):
                 update_dict_func=update_dict_func,
             )
 
-            # if args.collect_metadata and source in metadatas:
-            #     continue
-            progress_bar.set_description(f"Processing {dataset_name}...")
-
             metadatas[source] = metadatas.get(source, {})
             examples[source] = examples.get(source, {})
 
@@ -352,92 +421,143 @@ def dump_metadata(metadatas, examples):
                     all_data[k] = []
 
             has_data = False
-            for i, sample in enumerate(dataset):
-                has_data = True
-                assert isinstance(sample, dict), f"Sample is not a dictionary: {type(sample)}"
-                assert "text" in sample and isinstance(sample["text"], str)
-
-                # Update metadata
-                if args.collect_metadata:
-                    # Update types
-                    metadatas[source] = get_union(
-                        [{k: get_type(sample[k]) for k in sorted(sample.keys()) if k != "text"}, metadatas[source]],
-                        desc=dataset_name,
-                    )
-                    # Update examples
-                    for k, v in sample.items():
-                        if v is None or k in ["text"]:
-                            continue
-                        if k not in examples[source]:
-                            examples[source][k] = get_example_preview(
-                                v, enforce_dict=False
-                            )  # True  # args.uniformize_metadata)
-                        if k not in examples[_UNION_KEY]:
-                            examples[_UNION_KEY][k] = get_example_preview(v)
-                    dump_metadata(metadatas, examples)
-                    if None not in metadatas[source].values() or i > 100:
-                        break
-
-                # Add sample data to (parquet) dataset
+            parquet_finished = do_upload and os.path.isfile(parquet_filename)
+            if parquet_finished:
+                print(f"Warning: Using existing parquet file {parquet_filename}")
+            else:
+                for i, sample in enumerate(dataset):
+                    has_data = True
+                    assert isinstance(sample, dict), f"Sample is not a dictionary: {type(sample)}"
+                    assert "text" in sample and isinstance(sample["text"], str)
+
+                    # Update metadata
+                    if args.collect_metadata:
+                        # Update types
+                        metadatas[source] = get_union(
+                            [{k: get_type(sample[k]) for k in sorted(sample.keys()) if k != "text"}, metadatas[source]],
+                            desc=dataset_name,
+                        )
+                        # Update examples
+                        for k, v in sample.items():
+                            if v is None or k in ["text"]:
+                                continue
+                            if k not in examples[source]:
+                                examples[source][k] = get_example_preview(
+                                    v, enforce_dict=False
+                                )  # True  # args.uniformize_metadata)
+                            if k not in examples[_UNION_KEY]:
+                                examples[_UNION_KEY][k] = get_example_preview(v)
+                        dump_metadata(metadatas, examples)
+                        if None not in metadatas[source].values() or i > 100:
+                            break
+
+                    # Add sample data to (parquet) dataset
+                    if do_upload:
+                        if not all_data:
+                            all_data = {k: [] for k in sample.keys()}
+                        for k, v in sample.items():
+                            if k not in all_data:
+                                assert all_data
+                                num_samples = len(all_data[list(all_data.keys())[0]])
+                                all_data[k] = [_DEFAULT_VALUE] * num_samples
+                            if v is None:
+                                v = _DEFAULT_VALUE
+                            all_data[k].append(v)
+                        for k in all_data:
+                            if k not in sample:
+                                all_data[k].append(_DEFAULT_VALUE)
+
+            if not has_data:
+                raise RuntimeError(f"Dataset {dataset_name} has no data")
+            parquet_finished = True
+
             if do_upload:
-                if not all_data:
-                    all_data = {k: [] for k in sample.keys()}
-                for k, v in sample.items():
-                    if k not in all_data:
-                        assert all_data
-                        num_samples = len(all_data[list(all_data.keys())[0]])
-                        all_data[k] = [_DEFAULT_VALUE] * num_samples
-                    all_data[k].append(v)
-                for k in all_data:
-                    if k not in sample:
-                        all_data[k].append(_DEFAULT_VALUE)
-
-            if not has_data:
-                raise RuntimeError(f"Dataset {dataset_name} has no data")
+                # Dump parquet file
+                pd.DataFrame(all_data).to_parquet(parquet_filename)
 
             if do_upload:
-                # Dump parquet file
-                pd.DataFrame(all_data).to_parquet(parquet_filename)
-
-                # Dump to Hugging Face
-                if hf_api is None:
-                    hf_api, _ = connect_to_huggingface(args.repository, repo_type="dataset")
-
-                hf_api.upload_file(
-                    path_or_fileobj=parquet_filename,
-                    path_in_repo=f"data/{path_in_repo}",
-                    commit_message=f"Upload {os.path.splitext(path_in_repo)[0]}",
-                    repo_id=args.repository,
-                    repo_type="dataset",
-                    revision=None,
-                )
-
-                if args.clean:
-                    os.remove(parquet_filename)
-
-        except Exception as err:
-            os.remove(lock_file)
-            raise err
-
-    if do_upload:
-        # Create the README.md file
-        readme_content = "---\n"
-        with open(_readme_header_file) as f:
-            readme_content += f.read().strip() + "\n"
-        readme_content += "---\n"
-        with open(_readme_file_main) as f:
-            readme_content += "\n" + f.read().strip() + "\n"
-        tmp_file = os.path.join(tempfile.gettempdir(), "README.md")
-        with open(tmp_file, "w") as f:
-            f.write(readme_content)
-
-        hf_api.upload_file(
-            path_or_fileobj=tmp_file,
-            path_in_repo="README.md",
-            commit_message="Upload README.md",
-            repo_id=args.repository,
-            repo_type="dataset",
-            revision=None,
-        )
+                parquet_files_created.append(parquet_filename)
+                if len(parquet_files_created) >= args.update_each:
+                    # Dump to Hugging Face
+                    if hf_api is None:
+                        hf_api, _ = connect_to_huggingface(args.repository, repo_type="dataset")
+
+                    if len(parquet_files_created) == 1:
+                        hf_api.upload_file(
+                            path_or_fileobj=parquet_filename,
+                            path_in_repo=path_in_repo,
+                            commit_message=f"Upload {source}",
+                            repo_id=args.repository,
+                            repo_type="dataset",
+                            revision=None,
+                        )
+                    else:
+                        hf_api.upload_folder(
+                            folder_path=args.folder,
+                            commit_message="Upload data",
+                            ignore_patterns=["*.lock"],
+                            repo_id=args.repository,
+                            repo_type="dataset",
+                            revision=None,
+                        )
+
+                    if args.clean:
+                        for f in parquet_files_created:
+                            os.remove(f)
+
+                    parquet_files_created = []
+                    lock_files = []
+
+    except Exception as err:
+        if not parquet_finished and os.path.exists(parquet_filename):
+            os.remove(parquet_filename)
+        for f in lock_files:
+            os.remove(f)
+        raise err
+
+    if len(parquet_files_created):
+        # Dump the last ones
+
+        if hf_api is None:
+            hf_api, _ = connect_to_huggingface(args.repository, repo_type="dataset")
+
+        hf_api.upload_folder(
+            folder_path=args.folder,
+            commit_message="Upload data",
+            ignore_patterns=["*.lock"],
+            repo_id=args.repository,
+            repo_type="dataset",
+            revision=None,
+        )
+
+        if args.clean:
+            for f in parquet_files_created:
+                os.remove(f)
+
+    # TODO: for now we don't automatically update the README.md (too dangerous)
+    # if must_update_readme:
+    if False:
+        if hf_api is None:
+            hf_api, _ = connect_to_huggingface(args.repository, repo_type="dataset")
+
+        # Create the README.md file
+        readme_content = "---\n"
+        with open(_readme_header_file) as f:
+            readme_content += f.read().strip() + "\n"
+        readme_content += "---\n"
+        with open(_readme_file_main) as f:
+            readme_content += "\n" + f.read().strip() + "\n"
+        tmp_file = os.path.join(tempfile.gettempdir(), "README.md")
+        with open(tmp_file, "w") as f:
+            f.write(readme_content)
+
+        hf_api.upload_file(
+            path_or_fileobj=tmp_file,
+            path_in_repo="README.md",
+            commit_message="Update README.md",
+            repo_id=args.repository,
+            repo_type="dataset",
+            revision=None,
+        )
-        os.remove(tmp_file)
+        os.remove(tmp_file)
diff --git a/tokenization/data.py b/tokenization/data.py
index e813b17..035128a 100644
--- a/tokenization/data.py
+++ b/tokenization/data.py
@@ -336,7 +336,7 @@ def decompose_datasets(dataset, parquet_level=False, return_json_file_if_possibl
                     streaming=True,
                     split="train",
                 ),
-                name=f"{dataset.name}" + (f"{i:03d}" if use_suffix else ""),
+                name=f"{dataset.name}:" + (f"{i:03d}" if use_suffix else ""),
                 key=dataset.key,
                 preprocess=dataset.preprocess,
                 postprocess=dataset.postprocess,
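
Note (illustrative, not part of the patch): the ordering that sort_config_key() is meant to impose on the regenerated `configs` list can be checked in isolation. The sketch below restates the same rule on a hypothetical list of config names; only sorted() and the key function are involved.

# Illustrative sketch only -- restates the ordering rule of sort_config_key() in the patch above.
_LANGUAGES = ["en", "fr", "de", "es", "it"]
N = len(_LANGUAGES)

def sort_config_key(name):
    # (0) default, (1..N) natural languages, (N+1) bilingual pairs,
    # (N+2) code, (N+3) programming languages, (N+4) individual subsets
    if any(name.startswith(f"{lang},") for lang in _LANGUAGES):
        return (N + 1, name)  # e.g. "fr,en"
    if name.startswith("code-"):
        return (N + 3, name.split("-", 1)[1])  # e.g. "code-python" -> "python"
    order = {lang: i + 1 for i, lang in enumerate(_LANGUAGES)} | {"default": 0, "code": N + 2}
    return (order.get(name, N + 4), name)  # anything else is an individual subset

names = ["Wikipedia", "code-python", "fr", "code", "default", "fr,en", "en", "Gutenberg-de"]
print(sorted(names, key=sort_config_key))
# -> ['default', 'en', 'fr', 'fr,en', 'code', 'code-python', 'Gutenberg-de', 'Wikipedia']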
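
For the new repository layout, each parquet file is uploaded under data/<language_category>/<language>/<source_pseudo>/<dataset_name>.parquet, and the regenerated config globs select slices of that tree. The sketch below uses made-up file names and fnmatch as a rough stand-in for the Hub's glob resolution, just to show which configs would pick up a given file.

# Illustrative sketch only -- file names are made up; fnmatch is an approximation of the Hub's globbing.
import fnmatch

files = [
    "data/natural/fr/Wikipedia/Wikipedia-fr-001.parquet",
    "data/natural/fr-en/EuroparlAligned/EuroparlAligned-fr-en.parquet",
    "data/code/python/TheStack/TheStack-python-042.parquet",
]
globs = {
    "default": "data/*/*/*/*.parquet",
    "fr": "data/natural/fr/*/*.parquet",
    "fr,en": "data/natural/fr-en/*/*.parquet",
    "code-python": "data/code/python/*/*.parquet",
    "Wikipedia": "data/natural/*/Wikipedia/*.parquet",
}
for f in files:
    print(f, "->", [name for name, pattern in globs.items() if fnmatch.fnmatch(f, pattern)])
# data/natural/fr/Wikipedia/...      -> ['default', 'fr', 'Wikipedia']
# data/natural/fr-en/EuroparlAligned/... -> ['default', 'fr,en']
# data/code/python/TheStack/...      -> ['default', 'code-python']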