FineWeb-2: multilingual, numpy 2.0, minhash improvements (#285)
* change fw quality to strict inequality

* bugfix for empty lines (breaking chinese samples)

* word tokenizers changes: use spacy when possible, added missing languages from spacy and stanza

* add all available tokenizers and all iso-639-1 languages

* fix tokenizer issues

* add todo

* changed indic langs tokenizers to indicnlp

* fix tests

* added khmer, tibetan and lao

* using new tokenizer assignment and language definition system

* fix default script

* add assignments file

* fixes for japanese and tibetan

* fixed khmer tokenizer not being active

* fix for nan

* added south azerbaijani proxy tokenizer

* more tibetan workarounds

* georgian tokenizer and korean fix

* added georgian in tokenizer assignments

* fix for korean tokenizer: remove very large numbers (see the number-normalization sketch after this list)

* add additional punctuation and improve number normalization for other scripts

* fix number pattern

* added saving cluster_sizes in minhash and bugfixed saving cluster ids

* added fallback whitespace tokenizer and fixed tokenizer assignment for iso1 codes (a sketch of the fallback idea also follows this list)

* fix memory leaks in word/sent tokenization

* ignore ruff

* add memory zone to spans

* add regex to reqs, update hf tests to reflect the new datasets version, fix word tokenizers global vars

* empty commit

* empty commit

* fmt

* unlock tensorflow version

* bump flask

* fix flaky test

* add comment about flask

* actually fix the flaky test

* japanese tok bugfix

* more generous split for japanese to overcome whatever weird normalization they do

* allow restarting from "sorting buckets" part when ooming in minhash

* small refactor

* bugfix

* bugfix

* jpn word_tokenize

* added sparse arrays option

* add tqdm

* add log msg

* rust

* rust

* rust

* rust

* rust

* rust

* rust

* messages

* messages

* messages

* messages

* sort list of files

* no async sanity test

* fixes

* updates

* bunch of changes

* fix def value

* added check

* added check

* added check

* added check

* remove useless lock

* some improvements

* 1 sec

* GIVE ME MY PROGRESS BARS GOD DAMN IT

* GIVE ME MY PROGRESS BARS GOD DAMN IT

* GIVE ME MY PROGRESS BARS GOD DAMN IT

* revert

* stupid logspath

* giving up. just printing now

* giving up. just printing now

* network limiting

* network limiting

* updated word_tokenizer assignments and added burmese

* add dependency

* add local version

* remove progress message

* fix for no .remove file

* fix missing language tokenizer

* fixes for empty folders

* reuse word tokenizations between blocks

* remove dumb print

* updated url filter blocklists

* updated SymbolLinesFormatter

* moved rust tool

* add rust tool readme

* fix terminal punctuation in fineweb quality filter
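
Two of the fixes above lend themselves to short illustrations. First, the idea behind "remove very large numbers" in the Korean-tokenizer fix, as a minimal Python sketch — the pattern, the 10-digit threshold, and the replacement token are assumptions, not datatrove's actual code:

import re

# Sketch only: collapse very long digit runs before word tokenization so a
# pathological numeric blob cannot stall or crash the tokenizer. The 10-digit
# threshold and the "0" replacement are illustrative assumptions.
NUMBER_RE = re.compile(r"\d{10,}")

def normalize_numbers(text: str) -> str:
    return NUMBER_RE.sub("0", text)

print(normalize_numbers("id 123456789012345 ok"))  # -> 'id 0 ok'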
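
Second, a minimal sketch of the assignment-plus-fallback idea behind the whitespace-fallback commit — the registry name and layout here are hypothetical; the real mapping ships in src/datatrove/assets/tokenizer_assignment.csv:

from typing import Callable

# Hypothetical registry mapping a language code to a word-tokenizer callable;
# dedicated tokenizers (kiwipiepy for Korean, khmer-nltk for Khmer, ...) would
# be registered here in a real setup.
TOKENIZERS: dict[str, Callable[[str], list[str]]] = {}

def word_tokenize(text: str, lang: str) -> list[str]:
    # Fall back to plain whitespace splitting when neither the full code nor
    # the bare ISO 639-1 prefix has a dedicated tokenizer assigned.
    tokenizer = TOKENIZERS.get(lang) or TOKENIZERS.get(lang.split("_")[0]) or str.split
    return tokenizer(text)

print(word_tokenize("hello fallback world", "xx"))  # -> ['hello', 'fallback', 'world']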

---------

Co-authored-by: Hynek Kydlicek <[email protected]>
Co-authored-by: Hynek Kydlicek <[email protected]>
3 people authored Dec 6, 2024
1 parent fe81883 commit 8427759
Showing 29 changed files with 10,839 additions and 396 deletions.
35 changes: 24 additions & 11 deletions pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
"humanize",
"loguru>=0.7.0",
"multiprocess",
"numpy>=1.25.0,<2.0.0",
"numpy>=2.0.0",
"tqdm",
]

@@ -41,15 +41,15 @@ io = [
"pyarrow",
"python-magic",
"warcio",
"datasets>=2.18.0",
"datasets>=3.1.0",
"orjson",
"zstandard"
]
s3 = [
"s3fs>=2023.12.2",
]
processing = [
"fasttext-wheel",
"fasttext-numpy2-wheel",
"nltk",
"inscriptis",
# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
@@ -58,6 +58,7 @@ processing = [
"tokenizers",
"ftfy",
"fasteners",
"regex",
"xxhash",
"kenlm",
"pyahocorasick"
@@ -66,13 +67,20 @@ decont = [
"lighteval>=0.3.0"
]
multilingual = [
"spacy",
"spacy[ja]>=3.8",
"stanza",
"pyvi",
"pythainlp",
"jieba",
"indic-nlp-library",
"kiwipiepy",
"pyvi", # vietnamese
"pythainlp", # thai
"jieba", # chinese
"indic-nlp-library", # indic languages
"kiwipiepy", # korean
# urduhack has keras and tensorflow as dependencies and requires a specific version to work...
"urduhack",
"tensorflow>=2.16",
"khmer-nltk", # khmer
"laonlp", # lao
"botok", # tibetan languages,
"pyidaungsu-numpy2", # burmese
]
quality = [
"ruff>=0.1.5"
@@ -83,11 +91,15 @@ testing = [
"datatrove[processing]",
"datatrove[multilingual]",
"datatrove[s3]",
"datatrove[decont]",
# Lighteval doesn't support numpy>=2.0.0
# "datatrove[decont]",
# Flask doesn't have correct dependencies on werkzeux, causing issues, thus we pin flask 3.1 (which currently works) to avoid it
"flask>=3.1.0",
"pytest",
"pytest-timeout",
"pytest-xdist",
"moto[s3,server]",
"spacy[ja]"
]
all = [
"datatrove[quality]",
@@ -128,7 +140,8 @@ lint.select = [
"E",
"F",
"I",
"W"
"W",
"NPY201", # numpy 2.0.0
]
line-length = 119

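
For context on the NPY201 rule enabled above: it flags NumPy 1.x aliases that were removed in NumPy 2.0, which matters now that the project requires numpy>=2.0.0. A tiny before/after illustration (an example for this write-up, not from the repo):

import numpy as np

# np.NaN and np.float_ were removed in NumPy 2.0; ruff's NPY201 rule flags
# them so code keeps working against numpy>=2.0.0.
# Pre-2.0 spelling:  arr = np.array([np.NaN], dtype=np.float_)
arr = np.array([np.nan], dtype=np.float64)  # NumPy-2.0-safe spellings
print(arr)  # -> [nan]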
2,178 changes: 2,178 additions & 0 deletions src/datatrove/assets/tokenizer_assignment.csv

Large diffs are not rendered by default.

Binary file not shown.
7 changes: 5 additions & 2 deletions src/datatrove/io.py
@@ -162,7 +162,7 @@ def list_files(
]
)

- def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]:
+ def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str] | None:
"""Fetch a shard (set of files) for a given rank, assuming there are a total of `world_size` shards.
This should be deterministic to not have any overlap among different ranks.
Will return files [rank, rank+world_size, rank+2*world_size, ...]
@@ -175,7 +175,10 @@ def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]:
Returns: a list of file paths
"""
- return self.list_files(**kwargs)[rank::world_size]
+ all_files = self.list_files(**kwargs)
+ if len(all_files) == 0:
+     return None
+ return all_files[rank::world_size]

def resolve_paths(self, paths) -> list[str] | str:
"""
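
To make the striding in get_shard concrete, a standalone illustration (plain Python, independent of the DataFolder class):

# Each rank takes every world_size-th file starting at its own index, so the
# shards are deterministic and non-overlapping across ranks.
files = ["a", "b", "c", "d", "e"]
world_size = 2
print([files[rank::world_size] for rank in range(world_size)])
# -> [['a', 'c', 'e'], ['b', 'd']]
# After this commit, an empty listing returns None instead of giving every
# rank an empty shard, so callers can detect "no input files" explicitly.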
