Skip to content

Commit

Permalink
build: replace setup.py with pyproject.toml (#59)
Browse files Browse the repository at this point in the history
* build: replace setup.py with pyproject.toml

* build: swap comments

* Fix CI

* Minor improvement

* Fix CI again

---------

Co-authored-by: mariosasko <[email protected]>
  • Loading branch information
baggiponte and mariosasko authored Jan 23, 2024
1 parent 9bef798 commit d872e4f
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 109 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ jobs:
pip install .[quality]
- name: Check quality
run: |
ruff check tests src setup.py # linter
ruff format --check tests src setup.py # formatter
ruff check tests src # linter
ruff format --check tests src # formatter
test:
runs-on: ubuntu-latest
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.PHONY: quality style test

check_dirs := src tests examples setup.py
check_dirs := src tests examples

quality:
ruff check $(check_dirs) # linter
Expand Down
125 changes: 116 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,123 @@
# Core package metadata for the `datatrove` distribution.
[project]
name = "datatrove"
# Expected format is one of x.y.z.dev0, x.y.z.rc1 or x.y.z (dots, never dashes).
version = "0.0.1.dev0"
description = "HuggingFace library to process and filter large amounts of webdata"
readme = "README.md"
# NOTE(review): the email value below looks like a redaction placeholder
# rather than a real address; confirm before publishing.
authors = [
    { name = "HuggingFace Inc.", email = "[email protected]" },
]
license = { text = "Apache-2.0" }
classifiers = [
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
keywords = ["data", "machine", "learning", "processing"]
requires-python = ">=3.10.0"
# Requirements of the base install; optional feature sets are declared
# separately as extras.
dependencies = [
    "dill>=0.3.0",
    "fsspec>=2023.6.0",
    "huggingface-hub>=0.17.0",
    "humanize",
    "loguru>=0.7.0",
    "multiprocess",
    "numpy>=1.25.0",
    "tqdm",
]

# Optional feature sets ("extras"). Several extras are built by referring
# back to this package: `testing` pulls in cli/io/processing/s3, `all`
# combines `quality` and `testing`, and `dev` is an alias for `all`.
[project.optional-dependencies]
cli = [
    "rich",
]
io = [
    "faust-cchardet",
    "pyarrow",
    "python-magic",
    "warcio",
    "datasets",
]
s3 = [
    "s3fs>=2023.12.2",
]
processing = [
    "fasttext-wheel",
    "nltk",
    "inscriptis",
    # Direct reference to the `speedup` ref of the huggingface/python-readability repository.
    "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
    "tldextract",
    "trafilatura",
    "tokenizers",
]
quality = [
    "ruff>=0.1.5",
]
testing = [
    "datatrove[cli]",
    "datatrove[io]",
    "datatrove[processing]",
    "datatrove[s3]",
    "pytest",
    "pytest-timeout",
    "pytest-xdist",
    "moto[s3,server]",
]
all = [
    "datatrove[quality]",
    "datatrove[testing]",
]
dev = [
    "datatrove[all]",
]

# Project links published alongside the package metadata.
[project.urls]
Repository = "https://github.com/huggingface/datatrove"

# Command-line entry points: each command name on the left maps to the
# `main` callable of the same-named module under `datatrove.tools`.
[project.scripts]
check_dataset = "datatrove.tools.check_dataset:main"
merge_stats = "datatrove.tools.merge_stats:main"
launch_pickled_pipeline = "datatrove.tools.launch_pickled_pipeline:main"
failed_logs = "datatrove.tools.failed_logs:main"
inspect_data = "datatrove.tools.inspect_data:main"

# Build with setuptools via its `setuptools.build_meta` backend.
# No minimum setuptools version is pinned here.
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

# Package discovery searches the `src/` directory.
[tool.setuptools.packages.find]
where = ["src"]

# Non-Python files to include for the `datatrove` package: anything
# matching the `assets/*` pattern.
[tool.setuptools.package-data]
datatrove = ["assets/*"]

# NOTE(review): this span is taken from a rendered diff, so `ignore` and
# `select` each appear twice below (a one-line form and a multi-line form
# with the same values). TOML does not allow a key to be defined twice in
# one table; keep only one form of each when extracting this into a real
# pyproject.toml.
[tool.ruff]
# Ignored rules:
# "E501" -> line length violation
# "C901" -> `function_name` is too complex
ignore = ["C901", "E501"]
select = ["C", "E", "F", "I", "W"]
ignore = [
"C901", # `function_name` is too complex
"E501", # line length violation
]
select = [
"C",
"E",
"F",
"I",
"W"
]
line-length = 119

# NOTE(review): this span is taken from a rendered diff, so the
# `"__init__.py"` key appears twice below with the same value (F401).
# TOML does not allow a key to be defined twice in one table; keep only
# one form when extracting this into a real pyproject.toml.
[tool.ruff.per-file-ignores]
# Ignored rules:
# "F401" -> module imported but unused
"__init__.py" = ["F401"]
"__init__.py" = [
"F401" # module imported but unused
]

# NOTE(review): this span is taken from a rendered diff, so
# `known-first-party` appears twice below with the same value. TOML does
# not allow a key to be defined twice in one table; keep only one form
# when extracting this into a real pyproject.toml.
[tool.ruff.isort]
lines-after-imports = 2
known-first-party = ["datatrove"]
known-first-party = [
"datatrove"
]

97 changes: 0 additions & 97 deletions setup.py

This file was deleted.

0 comments on commit d872e4f

Please sign in to comment.