FineWeb-2: multilingual, numpy 2.0, minhash improvements (#285)
* change fw quality to strict inequality

* bugfix for empty lines (breaking chinese samples)

* word tokenizers changes: use spacy when possible, added missing languages from spacy and stanza

* add all available tokenizers and all iso-639-1 languages

* fix tokenizer issues

* add todo

* changed indic langs tokenizers to indicnlp

* fix tests

* added khmer, tibetan and lao

* using new tokenizer assignment and language definition system

* fix default script

* add assignments file

* fixes for japanese and tibetan

* fixed khmer tokenizer not being active

* fix for nan

* added south azerbaijani proxy tokenizer

* more tibetan workarounds

* georgian tokenizer and korean fix

* added georgian in tokenizer assignments

* fix for korean tokenizer: remove very large numbers (see the number-normalization sketch after this list)

* add additional punctuation and improve number normalization for other scripts

* fix number pattern

* added saving cluster_sizes in minhash and bugfixed saving cluster ids

* added fallback whitespace tokenizer and fixed tokenizer assignment for iso1 codes (a sketch of the fallback idea also follows this list)

* fix memory leaks in word/sent tokenization

* ignore ruff

* add memory zone to spans

* add regex to reqs, update hf tests to reflect the new datasets version, fix word tokenizers global vars

* empty commit

* empty commit

* fmt

* unlock tensorflow version

* bump flask

* fix flaky test

* add comment about flask

* actually fix the flaky test

* japanese tok bugfix

* more generous split for japanese to overcome whatever weird normalization they do

* allow restarting from "sorting buckets" part when ooming in minhash

* small refactor

* bugfix

* bugfix

* jpn word_tokenize

* added sparse arrays option

* add tqdm

* add log msg

* rust

* rust

* rust

* rust

* rust

* rust

* rust

* messages

* messages

* messages

* messages

* sort list of files

* no async sanity test

* fixes

* updates

* bunch of changes

* fix def value

* added check

* added check

* added check

* added check

* remove useless lock

* some improvements

* 1 sec

* GIVE ME MY PROGRESS BARS GOD DAMN IT

* GIVE ME MY PROGRESS BARS GOD DAMN IT

* GIVE ME MY PROGRESS BARS GOD DAMN IT

* revert

* stupid logspath

* giving up. just printing now

* giving up. just printing now

* network limiting

* network limiting

* updated word_tokenizer assignments and added burmese

* add dependency

* add local version

* remove progress message

* fix for no .remove file

* fix missing language tokenizer

* fixes for empty folders

* reuse word tokenizations between blocks

* remove dumb print

* updated url filter blocklists

* updated SymbolLinesFormatter

* moved rust tool

* add rust tool readme

* fix terminal punctuation in fineweb quality filter
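
Two of the fixes above lend themselves to short illustrations. First, the idea behind "remove very large numbers" in the Korean-tokenizer fix, as a minimal Python sketch — the pattern, the 10-digit threshold, and the replacement token are assumptions, not datatrove's actual code:

import re

# Sketch only: collapse very long digit runs before word tokenization so a
# pathological numeric blob cannot stall or crash the tokenizer. The 10-digit
# threshold and the "0" replacement are illustrative assumptions.
NUMBER_RE = re.compile(r"\d{10,}")

def normalize_numbers(text: str) -> str:
    return NUMBER_RE.sub("0", text)

print(normalize_numbers("id 123456789012345 ok"))  # -> 'id 0 ok'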
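
Second, a minimal sketch of the assignment-plus-fallback idea behind the whitespace-fallback commit — the registry name and layout here are hypothetical; the real mapping ships in src/datatrove/assets/tokenizer_assignment.csv:

from typing import Callable

# Hypothetical registry mapping a language code to a word-tokenizer callable;
# dedicated tokenizers (kiwipiepy for Korean, khmer-nltk for Khmer, ...) would
# be registered here in a real setup.
TOKENIZERS: dict[str, Callable[[str], list[str]]] = {}

def word_tokenize(text: str, lang: str) -> list[str]:
    # Fall back to plain whitespace splitting when neither the full code nor
    # the bare ISO 639-1 prefix has a dedicated tokenizer assigned.
    tokenizer = TOKENIZERS.get(lang) or TOKENIZERS.get(lang.split("_")[0]) or str.split
    return tokenizer(text)

print(word_tokenize("hello fallback world", "xx"))  # -> ['hello', 'fallback', 'world']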

---------

Co-authored-by: Hynek Kydlicek <[email protected]>
Co-authored-by: Hynek Kydlicek <[email protected]>
3 people authored Dec 6, 2024
1 parent fe81883 commit 8427759
Showing 29 changed files with 10,839 additions and 396 deletions.
35 changes: 24 additions & 11 deletions pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
"humanize",
"loguru>=0.7.0",
"multiprocess",
"numpy>=1.25.0,<2.0.0",
"numpy>=2.0.0",
"tqdm",
]

@@ -41,15 +41,15 @@ io = [
"pyarrow",
"python-magic",
"warcio",
"datasets>=2.18.0",
"datasets>=3.1.0",
"orjson",
"zstandard"
]
s3 = [
"s3fs>=2023.12.2",
]
processing = [
"fasttext-wheel",
"fasttext-numpy2-wheel",
"nltk",
"inscriptis",
# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
@@ -58,6 +58,7 @@ processing = [
"tokenizers",
"ftfy",
"fasteners",
"regex",
"xxhash",
"kenlm",
"pyahocorasick"
@@ -66,13 +67,20 @@ decont = [
"lighteval>=0.3.0"
]
multilingual = [
"spacy",
"spacy[ja]>=3.8",
"stanza",
"pyvi",
"pythainlp",
"jieba",
"indic-nlp-library",
"kiwipiepy",
"pyvi", # vietnamese
"pythainlp", # thai
"jieba", # chinese
"indic-nlp-library", # indic languages
"kiwipiepy", # korean
# urduhack has keras and tensorflow as dependencies and requires a specific version to work...
"urduhack",
"tensorflow>=2.16",
"khmer-nltk", # khmer
"laonlp", # lao
"botok", # tibetan languages,
"pyidaungsu-numpy2", # burmese
]
quality = [
"ruff>=0.1.5"
@@ -83,11 +91,15 @@ testing = [
"datatrove[processing]",
"datatrove[multilingual]",
"datatrove[s3]",
"datatrove[decont]",
# Lighteval doesn't support numpy>=2.0.0
# "datatrove[decont]",
# Flask doesn't have correct dependencies on werkzeux, causing issues, thus we pin flask 3.1 (which currently works) to avoid it
"flask>=3.1.0",
"pytest",
"pytest-timeout",
"pytest-xdist",
"moto[s3,server]",
"spacy[ja]"
]
all = [
"datatrove[quality]",
@@ -128,7 +140,8 @@ lint.select = [
"E",
"F",
"I",
"W"
"W",
"NPY201", # numpy 2.0.0
]
line-length = 119

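
For context on the NPY201 rule enabled above: it flags NumPy 1.x aliases that were removed in NumPy 2.0, which matters now that the project requires numpy>=2.0.0. A tiny before/after illustration (an example for this write-up, not from the repo):

import numpy as np

# np.NaN and np.float_ were removed in NumPy 2.0; ruff's NPY201 rule flags
# them so code keeps working against numpy>=2.0.0.
# Pre-2.0 spelling:  arr = np.array([np.NaN], dtype=np.float_)
arr = np.array([np.nan], dtype=np.float64)  # NumPy-2.0-safe spellings
print(arr)  # -> [nan]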
2,178 changes: 2,178 additions & 0 deletions src/datatrove/assets/tokenizer_assignment.csv

Large diffs are not rendered by default.

Binary file not shown.
7 changes: 5 additions & 2 deletions src/datatrove/io.py
@@ -162,7 +162,7 @@ def list_files(
]
)

- def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]:
+ def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str] | None:
"""Fetch a shard (set of files) for a given rank, assuming there are a total of `world_size` shards.
This should be deterministic to not have any overlap among different ranks.
Will return files [rank, rank+world_size, rank+2*world_size, ...]
@@ -175,7 +175,10 @@ def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]:
Returns: a list of file paths
"""
- return self.list_files(**kwargs)[rank::world_size]
+ all_files = self.list_files(**kwargs)
+ if len(all_files) == 0:
+     return None
+ return all_files[rank::world_size]

def resolve_paths(self, paths) -> list[str] | str:
"""
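
To make the striding in get_shard concrete, a standalone illustration (plain Python, independent of the DataFolder class):

# Each rank takes every world_size-th file starting at its own index, so the
# shards are deterministic and non-overlapping across ranks.
files = ["a", "b", "c", "d", "e"]
world_size = 2
print([files[rank::world_size] for rank in range(world_size)])
# -> [['a', 'c', 'e'], ['b', 'd']]
# After this commit, an empty listing returns None instead of giving every
# rank an empty shard, so callers can detect "no input files" explicitly.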
