Add precision in numbers when needed

OpenLLM-France · Nov 6, 2024 · bfa03ec · bfa03ec
1 parent d1eaadb
commit bfa03ec
Show file tree

Hide file tree

Showing 7 changed files with 295 additions and 271 deletions.
diff --git a/assets/compile_stats.py b/assets/compile_stats.py
@@ -257,6 +257,10 @@ def to_language_name_subset(name, subset=None):
 SORT_BY = "#words"
 
 
+def must_be_skipped(data):
+    """Tell whether a statistics row is empty, i.e. its "B words" value equals zero."""
+    billions_of_words = data["B words"]
+    return billions_of_words == 0
+
+
 def compute_extra_stats(data, tokencount_folder):
     if "B words" in data:
         return data
@@ -310,38 +314,52 @@ def get_stat_names(compute_token_stats=True):
     return list(dummy.keys())
 
 
+def precision_at_least(x, prec=3, length=8):
+    """Format ``x`` as fixed-point text with enough decimals to stay readable.
+
+    Starting from ``prec`` decimals, the precision grows until the rounded
+    magnitude reaches ``100 * 10**-prec`` (i.e. at least three significant
+    digits are shown), so small values are not displayed as ``0.000``.
+
+    Args:
+        x: Number to format.
+        prec: Minimum number of decimals.
+        length: Minimum field width; the text is right-aligned within it.
+
+    Returns:
+        The formatted string; it is wider than ``length`` when the digits
+        do not fit.
+    """
+    if x == 0:
+        return f"{0:{length}.{prec}f}"
+    # Compare on the magnitude so negative values terminate too, and loop
+    # instead of recursing so ``length`` is kept at every precision (the
+    # recursive form fell back to the default width after the first step).
+    # NaN fails the ``<`` test and is therefore formatted as-is.
+    while round(abs(x), prec) < 100 * (10**-prec):
+        prec += 1
+    return f"{x:{length}.{prec}f}"
+
+
+def precision_at_least_func(prec=3, length=8):
+    """Build a one-argument formatter bound to the given ``prec`` and ``length``."""
+
+    def formatter(x):
+        return precision_at_least(x, prec, length)
+
+    return formatter
+
+
 def format_stats_display(data, main=True):
     # Replace, in place, each known statistic of ``data`` with its fixed-width
     # display string, then return ``data``. ``main`` only selects the width of
     # the "subset" column (12 when true, 28 otherwise).
     # Raises RuntimeError (chained to the original error) when a value cannot
     # be formatted.
     for name, format in [
         ("language", "{:<9s}"),
         ("name", "{:<21s}"),
         ("subset", "{:<12s}" if main else "{:<28s}"),
         ("ocr", "{:<6s}"),
         ("category", "{:<13s}"),
-        ("M docs", "{:8.3f}"),
-        ("B words", "{:8.3f}"),
-        ("B chars", "{:8.3f}"),
-        ("B tokens", "{:9.3f}"),
+        ("M docs", precision_at_least_func()),  # "{:8.3f}"),
+        ("B words", precision_at_least_func()),  # "{:8.3f}"),
+        ("B chars", precision_at_least_func()),  # "{:8.3f}"),
+        ("B tokens", precision_at_least_func(length=9)),  # "{:9.3f}"),
         ("#words/doc", "{:11.0f}"),
         ("#chars/page", "{:11.0f}"),
         ("#chars/word", "{:11.1f}"),
         ("#tokens/words", "{:12.2f}"),
         ("#chars/tokens", "{:12.2f}"),
     ]:
         # A formatter is either a str.format template or a callable taking the value.
+        format_func = format.format if isinstance(format, str) else format
         # For a callable, rebuild a "{:N.3f}" template whose width N is the length
         # of a sample output, so the width parsing below applies to it as well.
+        format_str = format if isinstance(format, str) else "{:" + str(len(format(1))) + ".3f}"
         if name in data.keys():
             val = data[name]
             # Text and missing values cannot go through a numeric formatter.
             if isinstance(val, str) or val is None:
                 if val is None:
                     val = " "
-                if format.endswith("f}"):
-                    length = int(format[2:-2].split(".")[0])
-                    format = f"{{:>{length}s}}"
-                elif format.endswith("}"):
-                    length = int(format[2:-1].strip("<>s"))
                 # Numeric column: right-align the text within the numeric width.
+                if format_str.endswith("f}"):
+                    length = int(format_str[2:-2].split(".")[0])
+                    format_func = f"{{:>{length}s}}".format
                 # Text column: an over-long value is not truncated, only stripped
                 # of surrounding "_" and spaces.
+                elif format_str.endswith("}"):
+                    length = int(format_str[2:-1].strip("<>s"))
                     if len(val) > length:
                         # val = val[:length]
                         val = val.strip("_ ")
             try:
-                data[name] = format.format(val)
+                data[name] = format_func(val)
             # Any failure is re-raised with the field name, value and formatter.
             except Exception as err:
                 raise RuntimeError(f"Error formatting {name}={val} with {format=}") from err
     return data
@@ -576,6 +594,8 @@ def sort_function(row):
                 writer.writerow(header_with_spaces)
                 for row in rows:
                     row = compute_extra_stats(row, tokencount_folder)
+                    if must_be_skipped(row):
+                        continue
                     row = format_stats_display(row, ONLY_DETAILED)
                     writer.writerow(row)
 
@@ -584,5 +604,7 @@ def sort_function(row):
             writer.writerow(header_with_spaces)
             for row in rows_detailed:
                 row = compute_extra_stats(row, tokencount_folder)
+                if must_be_skipped(row):
+                    continue
                 row = format_stats_display(row, ONLY_DETAILED)
                 writer.writerow(row)
diff --git a/assets/hugging_face/README_dataset.md b/assets/hugging_face/README_dataset.md
@@ -184,15 +184,15 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
   <td>47.758</td>
   <td></td>
 </tr>
-<tr>
+<!-- <tr>
   <td><a href="#persee"><strong>Persee</strong></a></td>
   <td><strong>fr</strong></td>
   <td>1.094</td>
   <td>3.25</td>
   <td>5.754</td>
   <td>20.314</td>
   <td></td>
-</tr>
+</tr> -->
 <tr>
   <td><a href="#pile-uncopyrighted"><strong>Pile (USPTO_Backgrounds)</strong></a></td>
   <td><strong>en</strong></td>
@@ -529,21 +529,21 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
   <td>0.818</td>
   <td>1.161</td>
   <td>4.709</td>
-  <td><strong>DialogStudio</strong> (0.061 B words), <strong>BNC</strong> (0.011 B words), <strong>OANC</strong> (0.005 B words), <strong>AMI</strong> (0.001 B words), <strong>DailyDialog</strong> (0.001 B words), <strong>ICSI</strong> (0.001 B words)</td>
+  <td><strong>DialogStudio</strong> (0.0611 B words), <strong>BNC</strong> (0.0107 B words), <strong>OANC</strong> (0.00483 B words), <strong>DailyDialog</strong> (0.00127 B words), <strong>ICSI</strong> (0.000888 B words), <strong>AMI</strong> (0.000798 B words)</td>
 </tr>
 <tr>
   <td><a href="#claire-french-and-english"><strong>Claire</strong></a></td>
   <td><strong>fr</strong></td>
-  <td>0.037</td>
-  <td>0.209</td>
-  <td>0.31</td>
-  <td>1.313</td>
-  <td><strong>Senat</strong> (0.051 B words), <strong>Theatre</strong> (0.017 B words), <strong>ESLO</strong> (0.005 B words), <strong>CFPP</strong> (0.001 B words), <strong>OFROM</strong> (0.001 B words), <strong>ORFEO</strong> (0.001 B words), <strong>PFC</strong> (0.001 B words), <strong>SUMM</strong> (0.001 B words), <strong>TCOF</strong> (0.001 B words), <strong>ACSYNT</strong>, <strong>CID</strong>, <strong>CLAPI</strong>, <strong>FREDSum</strong>, <strong>LINAGORA</strong>, <strong>OTG</strong>, <strong>ParisStories</strong>, <strong>Rhapsodie</strong>, <strong>UBS</strong></td>
+  <td>0.039</td>
+  <td>0.21</td>
+  <td>0.311</td>
+  <td>1.314</td>
+  <td><strong>Senat</strong> (0.0515 B words), <strong>Theatre</strong> (0.0168 B words), <strong>ESLO</strong> (0.0051 B words), <strong>ORFEO</strong> (0.00147 B words), <strong>SUMM</strong> (0.00127 B words), <strong>TCOF</strong> (0.000784 B words), <strong>CFPP</strong> (0.000646 B words), <strong>OFROM</strong> (0.000593 B words), <strong>PFC</strong> (0.000506 B words), <strong>FREDSum</strong> (0.000392 B words), <strong>CLAPI</strong> (0.000144 B words), <strong>CID</strong> (0.00012 B words), <strong>LINAGORA</strong> (0.000106 B words), <strong>ACSYNT</strong> (6.12e-05 B words), <strong>OTG</strong> (3.01e-05 B words), <strong>Rhapsodie</strong> (2.74e-05 B words), <strong>ParisStories</strong> (2.71e-05 B words), <strong>UBS</strong> (7.87e-06 B words)</td>
 </tr>
 <tr>
   <td><a href="#youtube"><strong>YouTube</strong></a></td>
   <td><strong>fr</strong></td>
-  <td>0.038</td>
+  <td>0.037</td>
   <td>0.145</td>
   <td>0.336</td>
   <td>1.003</td>
@@ -562,20 +562,20 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
   <td colspan="7"><h4 id=legislative-transcripts>Legislative Transcripts</h4></td></tr>
 <tr>
   <td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
-  <td><strong>es</strong></td>
+  <td><strong>de</strong></td>
   <td>0.01</td>
-  <td>0.052</td>
+  <td>0.045</td>
   <td>0.073</td>
-  <td>0.325</td>
+  <td>0.327</td>
   <td></td>
 </tr>
 <tr>
   <td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
-  <td><strong>de</strong></td>
+  <td><strong>es</strong></td>
   <td>0.01</td>
-  <td>0.045</td>
+  <td>0.052</td>
   <td>0.073</td>
-  <td>0.327</td>
+  <td>0.325</td>
   <td></td>
 </tr>
 <tr>
@@ -890,7 +890,6 @@ Examples of metadata (except from `text`) are shown for each source in [metadata
 </table>
 
 
-
 ### Details on Data Sources
 
 #### AmendementsParlement

diff --git a/assets/hugging_face/README_dataset_table.html b/assets/hugging_face/README_dataset_table.html
@@ -452,21 +452,21 @@
   <td>0.818</td>
   <td>1.161</td>
   <td>4.709</td>
-  <td><strong>DialogStudio</strong> (0.061 B words), <strong>BNC</strong> (0.011 B words), <strong>OANC</strong> (0.005 B words), <strong>AMI</strong> (0.001 B words), <strong>DailyDialog</strong> (0.001 B words), <strong>ICSI</strong> (0.001 B words)</td>
+  <td><strong>DialogStudio</strong> (0.0611 B words), <strong>BNC</strong> (0.0107 B words), <strong>OANC</strong> (0.00483 B words), <strong>DailyDialog</strong> (0.00127 B words), <strong>ICSI</strong> (0.000888 B words), <strong>AMI</strong> (0.000798 B words)</td>
 </tr>
 <tr>
   <td><a href="#claire-french-and-english"><strong>Claire</strong></a></td>
   <td><strong>fr</strong></td>
-  <td>0.037</td>
-  <td>0.209</td>
-  <td>0.31</td>
-  <td>1.313</td>
-  <td><strong>Senat</strong> (0.051 B words), <strong>Theatre</strong> (0.017 B words), <strong>ESLO</strong> (0.005 B words), <strong>CFPP</strong> (0.001 B words), <strong>OFROM</strong> (0.001 B words), <strong>ORFEO</strong> (0.001 B words), <strong>PFC</strong> (0.001 B words), <strong>SUMM</strong> (0.001 B words), <strong>TCOF</strong> (0.001 B words), <strong>ACSYNT</strong>, <strong>CID</strong>, <strong>CLAPI</strong>, <strong>FREDSum</strong>, <strong>LINAGORA</strong>, <strong>OTG</strong>, <strong>ParisStories</strong>, <strong>Rhapsodie</strong>, <strong>UBS</strong></td>
+  <td>0.039</td>
+  <td>0.21</td>
+  <td>0.311</td>
+  <td>1.314</td>
+  <td><strong>Senat</strong> (0.0515 B words), <strong>Theatre</strong> (0.0168 B words), <strong>ESLO</strong> (0.0051 B words), <strong>ORFEO</strong> (0.00147 B words), <strong>SUMM</strong> (0.00127 B words), <strong>TCOF</strong> (0.000784 B words), <strong>CFPP</strong> (0.000646 B words), <strong>OFROM</strong> (0.000593 B words), <strong>PFC</strong> (0.000506 B words), <strong>FREDSum</strong> (0.000392 B words), <strong>CLAPI</strong> (0.000144 B words), <strong>CID</strong> (0.00012 B words), <strong>LINAGORA</strong> (0.000106 B words), <strong>ACSYNT</strong> (6.12e-05 B words), <strong>OTG</strong> (3.01e-05 B words), <strong>Rhapsodie</strong> (2.74e-05 B words), <strong>ParisStories</strong> (2.71e-05 B words), <strong>UBS</strong> (7.87e-06 B words)</td>
 </tr>
 <tr>
   <td><a href="#youtube"><strong>YouTube</strong></a></td>
   <td><strong>fr</strong></td>
-  <td>0.038</td>
+  <td>0.037</td>
   <td>0.145</td>
   <td>0.336</td>
   <td>1.003</td>
@@ -485,20 +485,20 @@
   <td colspan="7"><h4 id=legislative-transcripts>Legislative Transcripts</h4></td></tr>
 <tr>
   <td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
-  <td><strong>es</strong></td>
+  <td><strong>de</strong></td>
   <td>0.01</td>
-  <td>0.052</td>
+  <td>0.045</td>
   <td>0.073</td>
-  <td>0.325</td>
+  <td>0.327</td>
   <td></td>
 </tr>
 <tr>
   <td><a href="#europarl-monolingual-and-parallel"><strong>Europarl</strong></a></td>
-  <td><strong>de</strong></td>
+  <td><strong>es</strong></td>
   <td>0.01</td>
-  <td>0.045</td>
+  <td>0.052</td>
   <td>0.073</td>
-  <td>0.327</td>
+  <td>0.325</td>
   <td></td>
 </tr>
 <tr>

diff --git a/assets/hugging_face/README_dataset_table.md b/assets/hugging_face/README_dataset_table.md
@@ -56,13 +56,13 @@
 | [**Pile (StackExchange)**](#pile-uncopyrighted) | **en** | 15.269 | 4.534 | 10.275 | 33.609 |  |
 | [**Pile (Ubuntu_IRC)**](#pile-uncopyrighted) | **en** | 0.01 | 0.867 | 2.159 | 5.61 |  |
 | ***Dialogue*** |||||||
-| [**Claire**](#claire-french-and-english) | **en** | 0.949 | 0.818 | 1.161 | 4.709 | **DialogStudio** (0.061 B words), **BNC** (0.011 B words), **OANC** (0.005 B words), **AMI** (0.001 B words), **DailyDialog** (0.001 B words), **ICSI** (0.001 B words) |
-| [**Claire**](#claire-french-and-english) | **fr** | 0.037 | 0.209 | 0.31 | 1.313 | **Senat** (0.051 B words), **Theatre** (0.017 B words), **ESLO** (0.005 B words), **CFPP** (0.001 B words), **OFROM** (0.001 B words), **ORFEO** (0.001 B words), **PFC** (0.001 B words), **SUMM** (0.001 B words), **TCOF** (0.001 B words), **ACSYNT**, **CID**, **CLAPI**, **FREDSum**, **LINAGORA**, **OTG**, **ParisStories**, **Rhapsodie**, **UBS** |
-| [**YouTube**](#youtube) | **fr** | 0.038 | 0.145 | 0.336 | 1.003 |  |
+| [**Claire**](#claire-french-and-english) | **en** | 0.949 | 0.818 | 1.161 | 4.709 | **DialogStudio** (0.0611 B words), **BNC** (0.0107 B words), **OANC** (0.00483 B words), **DailyDialog** (0.00127 B words), **ICSI** (0.000888 B words), **AMI** (0.000798 B words) |
+| [**Claire**](#claire-french-and-english) | **fr** | 0.039 | 0.21 | 0.311 | 1.314 | **Senat** (0.0515 B words), **Theatre** (0.0168 B words), **ESLO** (0.0051 B words), **ORFEO** (0.00147 B words), **SUMM** (0.00127 B words), **TCOF** (0.000784 B words), **CFPP** (0.000646 B words), **OFROM** (0.000593 B words), **PFC** (0.000506 B words), **FREDSum** (0.000392 B words), **CLAPI** (0.000144 B words), **CID** (0.00012 B words), **LINAGORA** (0.000106 B words), **ACSYNT** (6.12e-05 B words), **OTG** (3.01e-05 B words), **Rhapsodie** (2.74e-05 B words), **ParisStories** (2.71e-05 B words), **UBS** (7.87e-06 B words) |
+| [**YouTube**](#youtube) | **fr** | 0.037 | 0.145 | 0.336 | 1.003 |  |
 | [**Stac**](#stac) | **en** | 0.0 | 0.0 | 0.0 | 0.0 |  |
 | ***Legislative Transcripts*** |||||||
-| [**Europarl**](#europarl-monolingual-and-parallel) | **es** | 0.01 | 0.052 | 0.073 | 0.325 |  |
 | [**Europarl**](#europarl-monolingual-and-parallel) | **de** | 0.01 | 0.045 | 0.073 | 0.327 |  |
+| [**Europarl**](#europarl-monolingual-and-parallel) | **es** | 0.01 | 0.052 | 0.073 | 0.325 |  |
 | [**Europarl**](#europarl-monolingual-and-parallel) | **fr** | 0.01 | 0.053 | 0.072 | 0.339 |  |
 | [**Europarl**](#europarl-monolingual-and-parallel) | **en** | 0.011 | 0.056 | 0.069 | 0.339 |  |
 | [**DiscoursPublics**](#discourspublics) | **fr** | 0.11 | 0.163 | 0.238 | 1.025 |  |

diff --git a/assets/hugging_face/generate_dataset_table.py b/assets/hugging_face/generate_dataset_table.py
@@ -121,11 +121,15 @@ def merge_stats(row1, row2, orig_name):
                     break
         if subset:
             info = f"**{subset}**"
-            if round(row2[_show_fields_in_details[0]], 3) > 0:
+            if True:  # round(row2[_show_fields_in_details[0]], 3) > 0:
                 info += " ("
                 info += ", ".join([f"{row2[k]} {k}" for k in _show_fields_in_details])
                 info += ")"
-            sort_criterion = -row2[_sorting_field] if sort_by_count else subset
+            try:
+                subset_int = int(subset)
+            except ValueError:
+                subset_int = None
+            sort_criterion = -row2[_sorting_field] if sort_by_count else (subset if subset_int is None else subset_int)
             extra[subset] = (sort_criterion, info)
         for k, v in row2.items():
             assert k in merged