Skip to content

Commit

Permalink
chore: clean-up a conflict from the rebase
Browse files Browse the repository at this point in the history
  • Loading branch information
Agustin-Picard committed Aug 21, 2024
1 parent f904967 commit 8b5cec0
Showing 1 changed file with 1 addition and 128 deletions.
129 changes: 1 addition & 128 deletions assets/compile_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,133 +2,6 @@
import json
import os

# Dataset names grouped by text type. is_ocr_dataset looks up the "ocr" and
# "mixed" lists of this table to decide what to report for a dataset name.
text_types = dict(
    ocr=[
        "AmericanStories",
        "Eurovoc",
        "GallicaPress",
        "GallicaMonographies",
        "HAL",
        # 'OtherFr',
        "Persee",
        "Theses",
    ],
    mixed=[
        "PeS2o",
    ],
)

# Mapping from a category label to the dataset names filed under it.
# get_dataset_category scans this table in insertion order and returns the
# first category whose list contains the (normalized) dataset name.
datasets_categories = {
    "technical": [
        "HAL",
        "NIH_ExPorter",
        "OpenEdition",
        "Persee",
        "PeS2o",
        "PhilPapers",
        "Theses",
        "USPTO_Backgrounds",
    ],
    "legi_written": [
        "amendements_parlement",
        "Eurovoc.de",
        "Eurovoc.en",
        "Eurovoc.es",
        "Eurovoc.it",
        "FreeLaw",
        "LEGI",
        "OpenData",
        "questions_ecrites_parlement",
    ],
    "legi_spoken": ["DiscoursPublics", "interventions_parlement"],
    "legi_dialogue": [
        "AssembleeNationale",
        "Europarl.en",
        "Europarl.es",
        "Europarl.de",
        "Europarl.fr",
        "FREDSum",
        "Senat",
    ],
    "dialogue": ["Claire.en", "Claire.fr", "Stac", "ValidatedYouTube.fr"],
    "book": [
        "GallicaMonographies",
        "Gutenberg.en",
        "Gutenberg.de",
        "Gutenberg.it",
        "Gutenberg.es",
        "Gutenberg.fr",
    ],
    "newspaper": ["AmericanStories", "GallicaPress"],
    "forum": ["StackExchange", "Ubuntu_IRC"],
    "wiki": [
        "Wikiother.fr",
        "Wikipedia.en",
        "Wikipedia.es",
        "Wikipedia.de",
        "Wikipedia.it",
        "Wikipedia.fr",
    ],
    "programming": ["TheStack"],
    "math": ["DM_Mathematics", "MathPile"],
    "aligned": [
        "CroissantAligned",
        "EuroparlAligned.fr-en",
        "EuroparlAligned.es-en",
        "EuroparlAligned.it-en",
        "EuroparlAligned.de-fr",
    ],
}


def _norm_string(s):
return s.lower().replace("_", " ")


# Rebuild both lookup tables with every dataset name passed through
# _norm_string, so lookups can compare against the normalized spelling.
text_types = {
    kind: list(map(_norm_string, names)) for kind, names in text_types.items()
}
datasets_categories = {
    category: list(map(_norm_string, names))
    for category, names in datasets_categories.items()
}


def is_ocr_dataset(name, subset):
    """Classify a dataset name against the ``text_types`` lists.

    The name is normalized with ``_norm_string`` and each list entry is
    tested as a substring of it.

    Args:
        name: Dataset name; ``"---"``, ``""`` and ``None`` are placeholders.
        subset: Accepted for signature compatibility; not used here.

    Returns:
        ``""`` for a placeholder name, ``"mixed"`` when an entry of
        ``text_types["mixed"]`` occurs in the name, otherwise ``"true"``
        when an entry of ``text_types["ocr"]`` occurs in it, else ``"false"``.
    """
    if name in ("---", "", None):
        return ""

    normalized = _norm_string(name)
    matches_ocr = any(entry in normalized for entry in text_types["ocr"])
    matches_mixed = any(entry in normalized for entry in text_types["mixed"])

    # A "mixed" match takes precedence over an "ocr" match.
    if matches_mixed:
        return "mixed"
    return "true" if matches_ocr else "false"


def get_dataset_category(name, subset):
    """Look up the category a dataset is filed under in ``datasets_categories``.

    Args:
        name: Dataset name; ``"---"``, ``""`` and ``None`` are placeholders.
        subset: Subset name. When the normalized dataset name is ``"pile"``
            or ``"otherfr"`` and a subset is given, the subset is looked up
            instead of the dataset name.

    Returns:
        ``""`` for a placeholder name, the first category whose list contains
        the normalized lookup key, or ``None`` when no category contains it.
    """
    if name in ("---", "", None):
        return ""

    key = _norm_string(name)
    if key in ("pile", "otherfr") and subset:
        key = _norm_string(subset)

    return next(
        (
            category
            for category, members in datasets_categories.items()
            if key in members
        ),
        None,
    )


# Ignore datasets
def ignore_datasets(name):
Expand Down Expand Up @@ -706,4 +579,4 @@ def sort_function(row):
for row in rows_detailed:
row = compute_extra_stats(row, tokencount_folder)
row = format_stats_display(row, ONLY_DETAILED)
writer.writerow(row)
writer.writerow(row)

0 comments on commit 8b5cec0

Please sign in to comment.