From 8b5cec0782a35bde8c5fdfdb124fea0db16d8f9b Mon Sep 17 00:00:00 2001 From: "agustin-martin.picard" Date: Tue, 20 Aug 2024 15:38:57 +0200 Subject: [PATCH] chore: clean-up a conflict from the rebase --- assets/compile_stats.py | 129 +--------------------------------------- 1 file changed, 1 insertion(+), 128 deletions(-) diff --git a/assets/compile_stats.py b/assets/compile_stats.py index 03e1fa6..f1061bf 100644 --- a/assets/compile_stats.py +++ b/assets/compile_stats.py @@ -2,133 +2,6 @@ import json import os -text_types = { - "ocr": [ - "AmericanStories", - "Eurovoc", - "GallicaPress", - "GallicaMonographies", - "HAL", - # 'OtherFr', - "Persee", - "Theses", - ], - "mixed": ["PeS2o"], -} - -datasets_categories = { - "technical": [ - "HAL", - "NIH_ExPorter", - "OpenEdition", - "Persee", - "PeS2o", - "PhilPapers", - "Theses", - "USPTO_Backgrounds", - ], - "legi_written": [ - "amendements_parlement", - "Eurovoc.de", - "Eurovoc.en", - "Eurovoc.es", - "Eurovoc.it", - "FreeLaw", - "LEGI", - "OpenData", - "questions_ecrites_parlement", - ], - "legi_spoken": [ - "DiscoursPublics", - "interventions_parlement", - ], - "legi_dialogue": [ - "AssembleeNationale", - "Europarl.en", - "Europarl.es", - "Europarl.de", - "Europarl.fr", - "FREDSum", - "Senat", - ], - "dialogue": [ - "Claire.en", - "Claire.fr", - "Stac", - "ValidatedYouTube.fr", - ], - "book": [ - "GallicaMonographies", - "Gutenberg.en", - "Gutenberg.de", - "Gutenberg.it", - "Gutenberg.es", - "Gutenberg.fr", - ], - "newspaper": [ - "AmericanStories", - "GallicaPress", - ], - "forum": [ - "StackExchange", - "Ubuntu_IRC", - ], - "wiki": [ - "Wikiother.fr", - "Wikipedia.en", - "Wikipedia.es", - "Wikipedia.de", - "Wikipedia.it", - "Wikipedia.fr", - ], - "programming": [ - "TheStack", - ], - "math": [ - "DM_Mathematics", - "MathPile", - ], - "aligned": [ - "CroissantAligned", - "EuroparlAligned.fr-en", - "EuroparlAligned.es-en", - "EuroparlAligned.it-en", - "EuroparlAligned.de-fr", - ] -} - - -def _norm_string(s): - return s.lower().replace("_", " ") - - -text_types = {k: [_norm_string(x) for x in v] for k, v in text_types.items()} -datasets_categories = {k: [_norm_string(x) for x in v] for k, v in datasets_categories.items()} - - -def is_ocr_dataset(name, subset): - if name in ["---", "", None]: - return "" - name = _norm_string(name) - res = "false" - if any(d in name for d in text_types["ocr"]): - res = "true" - if any(d in name for d in text_types["mixed"]): - res = "mixed" - return res - - -def get_dataset_category(name, subset): - if name in ["---", "", None]: - return "" - name = _norm_string(name) - if name in ["pile", "otherfr"] and subset: - name = _norm_string(subset) - for cat, datasets in datasets_categories.items(): - if name in datasets: - return cat - return None - # Ignore datasets def ignore_datasets(name): @@ -706,4 +579,4 @@ def sort_function(row): for row in rows_detailed: row = compute_extra_stats(row, tokencount_folder) row = format_stats_display(row, ONLY_DETAILED) - writer.writerow(row) + writer.writerow(row) \ No newline at end of file