-
Notifications
You must be signed in to change notification settings - Fork 167
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
build: replace setup.py with pyproject.toml (#59)
* build: replace setup.py with pyproject.toml * build: swap comments * Fix CI * Minor improvement * Fix CI again --------- Co-authored-by: mariosasko <[email protected]>
- Loading branch information
1 parent
9bef798
commit d872e4f
Showing
4 changed files
with
119 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,123 @@ | ||
# Core package metadata (PEP 621 `[project]` table). | ||
[project] | ||
name = "datatrove" | ||
version = "0.0.1.dev0" # expected format: x.y.z.dev0, x.y.z.rc1, or x.y.z (separate parts with dots, never dashes) | ||
description = "HuggingFace library to process and filter large amounts of webdata" | ||
readme = "README.md" | ||
authors = [ | ||
{name = "HuggingFace Inc.", email = "[email protected]"} | ||
] | ||
license = {text = "Apache-2.0"} | ||
# Trove classifiers; the Python version entries mirror `requires-python` below. | ||
classifiers = [ | ||
"Intended Audience :: Developers", | ||
"Intended Audience :: Education", | ||
"Intended Audience :: Science/Research", | ||
"License :: OSI Approved :: Apache Software License", | ||
"Operating System :: OS Independent", | ||
"Programming Language :: Python :: 3", | ||
"Programming Language :: Python :: 3.10", | ||
"Programming Language :: Python :: 3.11", | ||
"Programming Language :: Python :: 3.12", | ||
"Topic :: Scientific/Engineering :: Artificial Intelligence", | ||
] | ||
keywords = ["data", "machine", "learning", "processing"] | ||
# Minimum supported interpreter version. | ||
requires-python = ">=3.10.0" | ||
# Dependencies required by every install; optional groups are declared | ||
# separately under [project.optional-dependencies]. | ||
dependencies = [ | ||
"dill>=0.3.0", | ||
"fsspec>=2023.6.0", | ||
"huggingface-hub>=0.17.0", | ||
"humanize", | ||
"loguru>=0.7.0", | ||
"multiprocess", | ||
"numpy>=1.25.0", | ||
"tqdm", | ||
] | ||
|
||
# Optional dependency groups (extras), installable as `datatrove[<name>]`. | ||
[project.optional-dependencies] | ||
cli = [ | ||
"rich", | ||
] | ||
io = [ | ||
"faust-cchardet", | ||
"pyarrow", | ||
"python-magic", | ||
"warcio", | ||
"datasets" | ||
] | ||
s3 = [ | ||
"s3fs>=2023.12.2", | ||
] | ||
processing = [ | ||
"fasttext-wheel", | ||
"nltk", | ||
"inscriptis", | ||
# NOTE(review): direct git reference to the `speedup` ref of | ||
# huggingface/python-readability. Package indexes such as PyPI reject | ||
# uploads whose metadata contains direct references; confirm before publishing. | ||
"readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", | ||
"tldextract", | ||
"trafilatura", | ||
"tokenizers", | ||
] | ||
# Linting tooling only. | ||
quality = [ | ||
"ruff>=0.1.5" | ||
] | ||
# Pulls in the cli, io, processing and s3 extras through self-references, | ||
# plus the pytest tooling and moto. | ||
testing = [ | ||
"datatrove[cli]", | ||
"datatrove[io]", | ||
"datatrove[processing]", | ||
"datatrove[s3]", | ||
"pytest", | ||
"pytest-timeout", | ||
"pytest-xdist", | ||
"moto[s3,server]", | ||
] | ||
# quality + testing, and therefore (transitively) every extra listed above. | ||
all = [ | ||
"datatrove[quality]", | ||
"datatrove[testing]", | ||
] | ||
# Alias for `all`. | ||
dev = [ | ||
"datatrove[all]" | ||
] | ||
|
||
# Project links published with the package metadata. | ||
[project.urls] | ||
Repository = "https://github.com/huggingface/datatrove" | ||
|
||
# Console entry points: each key is the installed command name and each | ||
# value is the `module:function` it invokes. | ||
[project.scripts] | ||
check_dataset = "datatrove.tools.check_dataset:main" | ||
merge_stats = "datatrove.tools.merge_stats:main" | ||
launch_pickled_pipeline = "datatrove.tools.launch_pickled_pipeline:main" | ||
failed_logs = "datatrove.tools.failed_logs:main" | ||
inspect_data = "datatrove.tools.inspect_data:main" | ||
|
||
[build-system] | ||
# setuptools reads PEP 621 `[project]` metadata from pyproject.toml only from | ||
# release 61.0 onwards, so require at least that version in the build environment. | ||
requires = ["setuptools>=61.0"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
# Package discovery: look for importable packages under the `src` directory. | ||
[tool.setuptools.packages.find] | ||
where = ["src"] | ||
|
||
# Non-Python files to include in the `datatrove` package; the glob is | ||
# relative to the package directory. | ||
[tool.setuptools.package-data] | ||
datatrove = ["assets/*"] | ||
|
||
# Ruff linter configuration. | ||
[tool.ruff] | ||
# Rule codes excluded from the selection below. | ||
ignore = [ | ||
"C901", # `function_name` is too complex | ||
"E501", # line length violation | ||
] | ||
# Enabled rule prefixes: C (comprehensions / mccabe complexity), | ||
# E and W (pycodestyle), F (Pyflakes), I (isort). | ||
select = [ | ||
"C", | ||
"E", | ||
"F", | ||
"I", | ||
"W" | ||
] | ||
line-length = 119 | ||
|
||
# Rules suppressed per file pattern. | ||
[tool.ruff.per-file-ignores] | ||
# Presumably `__init__.py` files import names only to re-export them. | ||
"__init__.py" = [ | ||
"F401" # module imported but unused | ||
] | ||
|
||
# Import-sorting options for Ruff's isort rules. | ||
[tool.ruff.isort] | ||
# Number of blank lines to leave after the import block. | ||
lines-after-imports = 2 | ||
# Modules to classify as first-party when grouping imports. | ||
known-first-party = [ | ||
"datatrove" | ||
] | ||
|