Skip to content

Commit

Permalink
Add precision in numbers when needed
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeronymous committed Nov 6, 2024
1 parent d1eaadb commit bfa03ec
Show file tree
Hide file tree
Showing 7 changed files with 295 additions and 271 deletions.
42 changes: 32 additions & 10 deletions assets/compile_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ def to_language_name_subset(name, subset=None):
SORT_BY = "#words"


def must_be_skipped(data):
    """Tell whether a stats row should be left out of the written output.

    Args:
        data: Mapping holding the statistics of one row; it must contain
            the "B words" key (a missing key raises ``KeyError``).

    Returns:
        True when the "B words" value compares equal to zero, else False.
    """
    word_count = data["B words"]
    return word_count == 0


def compute_extra_stats(data, tokencount_folder):
if "B words" in data:
return data
Expand Down Expand Up @@ -310,38 +314,52 @@ def get_stat_names(compute_token_stats=True):
return list(dummy.keys())


def precision_at_least(x, prec=3, length=8):
    """Format a number with enough decimals to show three significant digits.

    The value is rendered in fixed-point notation with ``prec`` decimals.
    When the value is so small that, rounded to ``prec`` decimals, it would
    show fewer than three significant digits (i.e. it is below
    ``100 * 10**-prec``), the number of decimals is increased until it does.

    Args:
        x: Number to format.
        prec: Minimum number of decimals.
        length: Minimum field width; the result is right-aligned and padded
            with spaces (it can be longer when more decimals are needed).

    Returns:
        The formatted string, e.g. ``"   1.500"`` or ``"  0.0611"``.
    """
    if x == 0:
        # Zero never reaches the threshold below, so it is handled up front.
        return f"{0:{length}.{prec}f}"
    # The magnitude is compared so that negative values terminate as well.
    # A NaN makes the comparison False, so the loop exits immediately, and
    # once 10**-prec underflows to 0.0 the loop also stops for tiny values.
    while round(abs(x), prec) < 100 * (10**-prec):
        prec += 1
    # The requested width is kept whatever the final number of decimals.
    return f"{x:{length}.{prec}f}"


def precision_at_least_func(prec=3, length=8):
    """Build a one-argument formatter bound to a precision and a width.

    Args:
        prec: Minimum number of decimals forwarded to ``precision_at_least``.
        length: Minimum field width forwarded to ``precision_at_least``.

    Returns:
        A callable taking a single number and returning its formatted string.
    """

    def formatter(x):
        return precision_at_least(x, prec, length)

    return formatter


def format_stats_display(data, main=True):
for name, format in [
("language", "{:<9s}"),
("name", "{:<21s}"),
("subset", "{:<12s}" if main else "{:<28s}"),
("ocr", "{:<6s}"),
("category", "{:<13s}"),
("M docs", "{:8.3f}"),
("B words", "{:8.3f}"),
("B chars", "{:8.3f}"),
("B tokens", "{:9.3f}"),
("M docs", precision_at_least_func()), # "{:8.3f}"),
("B words", precision_at_least_func()), # "{:8.3f}"),
("B chars", precision_at_least_func()), # "{:8.3f}"),
("B tokens", precision_at_least_func(length=9)), # "{:9.3f}"),
("#words/doc", "{:11.0f}"),
("#chars/page", "{:11.0f}"),
("#chars/word", "{:11.1f}"),
("#tokens/words", "{:12.2f}"),
("#chars/tokens", "{:12.2f}"),
]:
format_func = format.format if isinstance(format, str) else format
format_str = format if isinstance(format, str) else "{:" + str(len(format(1))) + ".3f}"
if name in data.keys():
val = data[name]
if isinstance(val, str) or val is None:
if val is None:
val = " "
if format.endswith("f}"):
length = int(format[2:-2].split(".")[0])
format = f"{{:>{length}s}}"
elif format.endswith("}"):
length = int(format[2:-1].strip("<>s"))
if format_str.endswith("f}"):
length = int(format_str[2:-2].split(".")[0])
format_func = f"{{:>{length}s}}".format
elif format_str.endswith("}"):
length = int(format_str[2:-1].strip("<>s"))
if len(val) > length:
# val = val[:length]
val = val.strip("_ ")
try:
data[name] = format.format(val)
data[name] = format_func(val)
except Exception as err:
raise RuntimeError(f"Error formatting {name}={val} with {format=}") from err
return data
Expand Down Expand Up @@ -576,6 +594,8 @@ def sort_function(row):
writer.writerow(header_with_spaces)
for row in rows:
row = compute_extra_stats(row, tokencount_folder)
if must_be_skipped(row):
continue
row = format_stats_display(row, ONLY_DETAILED)
writer.writerow(row)

Expand All @@ -584,5 +604,7 @@ def sort_function(row):
writer.writerow(header_with_spaces)
for row in rows_detailed:
row = compute_extra_stats(row, tokencount_folder)
if must_be_skipped(row):
continue
row = format_stats_display(row, ONLY_DETAILED)
writer.writerow(row)
31 changes: 15 additions & 16 deletions assets/hugging_face/README_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,15 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
<td>47.758</td>
<td></td>
</tr>
<tr>
<!-- <tr>
<td><a href="#persee"><strong>Persee</strong></a></td>
<td><strong>fr</strong></td>
<td>1.094</td>
<td>3.25</td>
<td>5.754</td>
<td>20.314</td>
<td></td>
</tr>
</tr> -->
<tr>
<td><a href="#pile-uncopyrighted"><strong>Pile (USPTO_Backgrounds)</strong></a></td>
<td><strong>en</strong></td>
Expand Down Expand Up @@ -529,21 +529,21 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
<td>0.818</td>
<td>1.161</td>
<td>4.709</td>
<td><strong>DialogStudio</strong> (0.061 B words), <strong>BNC</strong> (0.011 B words), <strong>OANC</strong> (0.005 B words), <strong>AMI</strong> (0.001 B words), <strong>DailyDialog</strong> (0.001 B words), <strong>ICSI</strong> (0.001 B words)</td>
<td><strong>DialogStudio</strong> (0.0611 B words), <strong>BNC</strong> (0.0107 B words), <strong>OANC</strong> (0.00483 B words), <strong>DailyDialog</strong> (0.00127 B words), <strong>ICSI</strong> (0.000888 B words), <strong>AMI</strong> (0.000798 B words)</td>
</tr>
<tr>
<td><a href="#claire-french-and-english"><strong>Claire</strong></a></td>
<td><strong>fr</strong></td>
<td>0.037</td>
<td>0.209</td>
<td>0.31</td>
<td>1.313</td>
<td><strong>Senat</strong> (0.051 B words), <strong>Theatre</strong> (0.017 B words), <strong>ESLO</strong> (0.005 B words), <strong>CFPP</strong> (0.001 B words), <strong>OFROM</strong> (0.001 B words), <strong>ORFEO</strong> (0.001 B words), <strong>PFC</strong> (0.001 B words), <strong>SUMM</strong> (0.001 B words), <strong>TCOF</strong> (0.001 B words), <strong>ACSYNT</strong>, <strong>CID</strong>, <strong>CLAPI</strong>, <strong>FREDSum</strong>, <strong>LINAGORA</strong>, <strong>OTG</strong>, <strong>ParisStories</strong>, <strong>Rhapsodie</strong>, <strong>UBS</strong></td>
<td>0.039</td>
<td>0.21</td>
<td>0.311</td>
<td>1.314</td>
<td><strong>Senat</strong> (0.0515 B words), <strong>Theatre</strong> (0.0168 B words), <strong>ESLO</strong> (0.0051 B words), <strong>ORFEO</strong> (0.00147 B words), <strong>SUMM</strong> (0.00127 B words), <strong>TCOF</strong> (0.000784 B words), <strong>CFPP</strong> (0.000646 B words), <strong>OFROM</strong> (0.000593 B words), <strong>PFC</strong> (0.000506 B words), <strong>FREDSum</strong> (0.000392 B words), <strong>CLAPI</strong> (0.000144 B words), <strong>CID</strong> (0.00012 B words), <strong>LINAGORA</strong> (0.000106 B words), <strong>ACSYNT</strong> (6.12e-05 B words), <strong>OTG</strong> (3.01e-05 B words), <strong>Rhapsodie</strong> (2.74e-05 B words), <strong>ParisStories</strong> (2.71e-05 B words), <strong>UBS</strong> (7.87e-06 B words)</td>
</tr>
<tr>
<td><a href="#youtube"><strong>YouTube</strong></a></td>
<td><strong>fr</strong></td>
<td>0.038</td>
<td>0.037</td>
<td>0.145</td>
<td>0.336</td>
<td>1.003</td>
Expand All @@ -562,20 +562,20 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
<td colspan="7"><h4 id=legislative-transcripts>Legislative Transcripts</h4></td></tr>
<tr>
<td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
<td><strong>es</strong></td>
<td><strong>de</strong></td>
<td>0.01</td>
<td>0.052</td>
<td>0.045</td>
<td>0.073</td>
<td>0.325</td>
<td>0.327</td>
<td></td>
</tr>
<tr>
<td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
<td><strong>de</strong></td>
<td><strong>es</strong></td>
<td>0.01</td>
<td>0.045</td>
<td>0.052</td>
<td>0.073</td>
<td>0.327</td>
<td>0.325</td>
<td></td>
</tr>
<tr>
Expand Down Expand Up @@ -890,7 +890,6 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
</table>



### Details on Data Sources

#### AmendementsParlement
Expand Down
26 changes: 13 additions & 13 deletions assets/hugging_face/README_dataset_table.html
Original file line number Diff line number Diff line change
Expand Up @@ -452,21 +452,21 @@
<td>0.818</td>
<td>1.161</td>
<td>4.709</td>
<td><strong>DialogStudio</strong> (0.061 B words), <strong>BNC</strong> (0.011 B words), <strong>OANC</strong> (0.005 B words), <strong>AMI</strong> (0.001 B words), <strong>DailyDialog</strong> (0.001 B words), <strong>ICSI</strong> (0.001 B words)</td>
<td><strong>DialogStudio</strong> (0.0611 B words), <strong>BNC</strong> (0.0107 B words), <strong>OANC</strong> (0.00483 B words), <strong>DailyDialog</strong> (0.00127 B words), <strong>ICSI</strong> (0.000888 B words), <strong>AMI</strong> (0.000798 B words)</td>
</tr>
<tr>
<td><a href="#claire-french-and-english"><strong>Claire</strong></a></td>
<td><strong>fr</strong></td>
<td>0.037</td>
<td>0.209</td>
<td>0.31</td>
<td>1.313</td>
<td><strong>Senat</strong> (0.051 B words), <strong>Theatre</strong> (0.017 B words), <strong>ESLO</strong> (0.005 B words), <strong>CFPP</strong> (0.001 B words), <strong>OFROM</strong> (0.001 B words), <strong>ORFEO</strong> (0.001 B words), <strong>PFC</strong> (0.001 B words), <strong>SUMM</strong> (0.001 B words), <strong>TCOF</strong> (0.001 B words), <strong>ACSYNT</strong>, <strong>CID</strong>, <strong>CLAPI</strong>, <strong>FREDSum</strong>, <strong>LINAGORA</strong>, <strong>OTG</strong>, <strong>ParisStories</strong>, <strong>Rhapsodie</strong>, <strong>UBS</strong></td>
<td>0.039</td>
<td>0.21</td>
<td>0.311</td>
<td>1.314</td>
<td><strong>Senat</strong> (0.0515 B words), <strong>Theatre</strong> (0.0168 B words), <strong>ESLO</strong> (0.0051 B words), <strong>ORFEO</strong> (0.00147 B words), <strong>SUMM</strong> (0.00127 B words), <strong>TCOF</strong> (0.000784 B words), <strong>CFPP</strong> (0.000646 B words), <strong>OFROM</strong> (0.000593 B words), <strong>PFC</strong> (0.000506 B words), <strong>FREDSum</strong> (0.000392 B words), <strong>CLAPI</strong> (0.000144 B words), <strong>CID</strong> (0.00012 B words), <strong>LINAGORA</strong> (0.000106 B words), <strong>ACSYNT</strong> (6.12e-05 B words), <strong>OTG</strong> (3.01e-05 B words), <strong>Rhapsodie</strong> (2.74e-05 B words), <strong>ParisStories</strong> (2.71e-05 B words), <strong>UBS</strong> (7.87e-06 B words)</td>
</tr>
<tr>
<td><a href="#youtube"><strong>YouTube</strong></a></td>
<td><strong>fr</strong></td>
<td>0.038</td>
<td>0.037</td>
<td>0.145</td>
<td>0.336</td>
<td>1.003</td>
Expand All @@ -485,20 +485,20 @@
<td colspan="7"><h4 id=legislative-transcripts>Legislative Transcripts</h4></td></tr>
<tr>
<td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
<td><strong>es</strong></td>
<td><strong>de</strong></td>
<td>0.01</td>
<td>0.052</td>
<td>0.045</td>
<td>0.073</td>
<td>0.325</td>
<td>0.327</td>
<td></td>
</tr>
<tr>
<td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
<td><strong>de</strong></td>
<td><strong>es</strong></td>
<td>0.01</td>
<td>0.045</td>
<td>0.052</td>
<td>0.073</td>
<td>0.327</td>
<td>0.325</td>
<td></td>
</tr>
<tr>
Expand Down
8 changes: 4 additions & 4 deletions assets/hugging_face/README_dataset_table.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@
| [**Pile (StackExchange)**](#pile-uncopyrighted) | **en** | 15.269 | 4.534 | 10.275 | 33.609 | |
| [**Pile (Ubuntu_IRC)**](#pile-uncopyrighted) | **en** | 0.01 | 0.867 | 2.159 | 5.61 | |
| ***Dialogue*** |||||||
| [**Claire**](#claire-french-and-english) | **en** | 0.949 | 0.818 | 1.161 | 4.709 | **DialogStudio** (0.061 B words), **BNC** (0.011 B words), **OANC** (0.005 B words), **AMI** (0.001 B words), **DailyDialog** (0.001 B words), **ICSI** (0.001 B words) |
| [**Claire**](#claire-french-and-english) | **fr** | 0.037 | 0.209 | 0.31 | 1.313 | **Senat** (0.051 B words), **Theatre** (0.017 B words), **ESLO** (0.005 B words), **CFPP** (0.001 B words), **OFROM** (0.001 B words), **ORFEO** (0.001 B words), **PFC** (0.001 B words), **SUMM** (0.001 B words), **TCOF** (0.001 B words), **ACSYNT**, **CID**, **CLAPI**, **FREDSum**, **LINAGORA**, **OTG**, **ParisStories**, **Rhapsodie**, **UBS** |
| [**YouTube**](#youtube) | **fr** | 0.038 | 0.145 | 0.336 | 1.003 | |
| [**Claire**](#claire-french-and-english) | **en** | 0.949 | 0.818 | 1.161 | 4.709 | **DialogStudio** (0.0611 B words), **BNC** (0.0107 B words), **OANC** (0.00483 B words), **DailyDialog** (0.00127 B words), **ICSI** (0.000888 B words), **AMI** (0.000798 B words) |
| [**Claire**](#claire-french-and-english) | **fr** | 0.039 | 0.21 | 0.311 | 1.314 | **Senat** (0.0515 B words), **Theatre** (0.0168 B words), **ESLO** (0.0051 B words), **ORFEO** (0.00147 B words), **SUMM** (0.00127 B words), **TCOF** (0.000784 B words), **CFPP** (0.000646 B words), **OFROM** (0.000593 B words), **PFC** (0.000506 B words), **FREDSum** (0.000392 B words), **CLAPI** (0.000144 B words), **CID** (0.00012 B words), **LINAGORA** (0.000106 B words), **ACSYNT** (6.12e-05 B words), **OTG** (3.01e-05 B words), **Rhapsodie** (2.74e-05 B words), **ParisStories** (2.71e-05 B words), **UBS** (7.87e-06 B words) |
| [**YouTube**](#youtube) | **fr** | 0.037 | 0.145 | 0.336 | 1.003 | |
| [**Stac**](#stac) | **en** | 0.0 | 0.0 | 0.0 | 0.0 | |
| ***Legislative Transcripts*** |||||||
| [**Europarl**](#europarl-monolingual-and-parallel) | **es** | 0.01 | 0.052 | 0.073 | 0.325 | |
| [**Europarl**](#europarl-monolingual-and-parallel) | **de** | 0.01 | 0.045 | 0.073 | 0.327 | |
| [**Europarl**](#europarl-monolingual-and-parallel) | **es** | 0.01 | 0.052 | 0.073 | 0.325 | |
| [**Europarl**](#europarl-monolingual-and-parallel) | **fr** | 0.01 | 0.053 | 0.072 | 0.339 | |
| [**Europarl**](#europarl-monolingual-and-parallel) | **en** | 0.011 | 0.056 | 0.069 | 0.339 | |
| [**DiscoursPublics**](#discourspublics) | **fr** | 0.11 | 0.163 | 0.238 | 1.025 | |
Expand Down
8 changes: 6 additions & 2 deletions assets/hugging_face/generate_dataset_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,15 @@ def merge_stats(row1, row2, orig_name):
break
if subset:
info = f"**{subset}**"
if round(row2[_show_fields_in_details[0]], 3) > 0:
if True: # round(row2[_show_fields_in_details[0]], 3) > 0:
info += " ("
info += ", ".join([f"{row2[k]} {k}" for k in _show_fields_in_details])
info += ")"
sort_criterion = -row2[_sorting_field] if sort_by_count else subset
try:
subset_int = int(subset)
except ValueError:
subset_int = None
sort_criterion = -row2[_sorting_field] if sort_by_count else (subset if subset_int is None else subset_int)
extra[subset] = (sort_criterion, info)
for k, v in row2.items():
assert k in merged
Expand Down
Loading

0 comments on commit bfa03ec

Please sign in to comment.