Migrate packaging metadata from setup.py to pyproject.toml (PEP 621).

Summary of the change set below:
  1. CI and the Makefile stop passing setup.py to ruff, since the file is removed.
  2. pyproject.toml gains [project], [project.optional-dependencies],
     [project.urls], [project.scripts], [build-system] and [tool.setuptools.*]
     tables carrying everything setup.py used to declare; the ruff settings
     are re-wrapped with inline comments.
  3. setup.py is deleted.

Review notes:
  The build backend is pinned to setuptools>=61.0 because older setuptools
  releases do not read the [project] table. This line was edited by hand, so
  the post-image blob hash on the pyproject.toml "index" line is stale.
  Line structure and indentation were restored from a whitespace-collapsed
  copy of this patch; if context lines fail to match, apply with
  --ignore-whitespace.

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a39cd89d..675e7897 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,8 +23,8 @@ jobs:
           pip install .[quality]
       - name: Check quality
         run: |
-          ruff check tests src setup.py  # linter
-          ruff format --check tests src setup.py  # formatter
+          ruff check tests src  # linter
+          ruff format --check tests src  # formatter
 
   test:
     runs-on: ubuntu-latest
diff --git a/Makefile b/Makefile
index 8e325385..a967892a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 .PHONY: quality style test
 
-check_dirs := src tests examples setup.py
+check_dirs := src tests examples
 
 quality:
 	ruff check $(check_dirs)  # linter
diff --git a/pyproject.toml b/pyproject.toml
index 7f72ee25..93c5e398 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,16 +1,123 @@
+[project]
+name = "datatrove"
+version = "0.0.1.dev0"  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+description = "HuggingFace library to process and filter large amounts of webdata"
+readme = "README.md"
+authors = [
+    {name = "HuggingFace Inc.", email = "guilherme@huggingface.co"}
+]
+license = {text = "Apache-2.0"}
+classifiers = [
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+keywords = ["data", "machine", "learning", "processing"]
+requires-python = ">=3.10.0"
+dependencies = [
+    "dill>=0.3.0",
+    "fsspec>=2023.6.0",
+    "huggingface-hub>=0.17.0",
+    "humanize",
+    "loguru>=0.7.0",
+    "multiprocess",
+    "numpy>=1.25.0",
+    "tqdm",
+]
+
+[project.optional-dependencies]
+cli = [
+    "rich",
+]
+io = [
+    "faust-cchardet",
+    "pyarrow",
+    "python-magic",
+    "warcio",
+    "datasets"
+]
+s3 = [
+    "s3fs>=2023.12.2",
+]
+processing = [
+    "fasttext-wheel",
+    "nltk",
+    "inscriptis",
+    "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
+    "tldextract",
+    "trafilatura",
+    "tokenizers",
+]
+quality = [
+    "ruff>=0.1.5"
+]
+testing = [
+    "datatrove[cli]",
+    "datatrove[io]",
+    "datatrove[processing]",
+    "datatrove[s3]",
+    "pytest",
+    "pytest-timeout",
+    "pytest-xdist",
+    "moto[s3,server]",
+]
+all = [
+    "datatrove[quality]",
+    "datatrove[testing]",
+]
+dev = [
+    "datatrove[all]"
+]
+
+[project.urls]
+Repository = "https://github.com/huggingface/datatrove"
+
+[project.scripts]
+check_dataset = "datatrove.tools.check_dataset:main"
+merge_stats = "datatrove.tools.merge_stats:main"
+launch_pickled_pipeline = "datatrove.tools.launch_pickled_pipeline:main"
+failed_logs = "datatrove.tools.failed_logs:main"
+inspect_data = "datatrove.tools.inspect_data:main"
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+datatrove = ["assets/*"]
+
 [tool.ruff]
-# Ignored rules:
-# "E501" -> line length violation
-# "C901" -> `function_name` is too complex
-ignore = ["C901", "E501"]
-select = ["C", "E", "F", "I", "W"]
+ignore = [
+    "C901",  # `function_name` is too complex
+    "E501",  # line length violation
+]
+select = [
+    "C",
+    "E",
+    "F",
+    "I",
+    "W"
+]
 line-length = 119
 
 [tool.ruff.per-file-ignores]
-# Ignored rules:
-# "F401" -> module imported but unused
-"__init__.py" = ["F401"]
+"__init__.py" = [
+    "F401"  # module imported but unused
+]
 
 [tool.ruff.isort]
 lines-after-imports = 2
-known-first-party = ["datatrove"]
+known-first-party = [
+    "datatrove"
+]
+
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 60b0024c..00000000
--- a/setup.py
+++ /dev/null
@@ -1,97 +0,0 @@
-from setuptools import find_packages, setup
-
-
-install_requires = [
-    "dill>=0.3.0",
-    "fsspec>=2023.6.0",
-    "huggingface-hub>=0.17.0",
-    "humanize",
-    "loguru>=0.7.0",
-    "multiprocess",
-    "numpy>=1.25.0",
-    "tqdm",
-]
-
-extras = {}
-
-extras["cli"] = [
-    "rich",
-]
-
-extras["io"] = ["faust-cchardet", "pyarrow", "python-magic", "warcio", "datasets"]
-
-extras["s3"] = [
-    "s3fs>=2023.12.2",
-]
-
-extras["processing"] = [
-    "fasttext-wheel",
-    "nltk",
-    "inscriptis",
-    "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
-    "tldextract",
-    "trafilatura",
-    "tokenizers",
-]
-
-extras["quality"] = [
-    "ruff>=0.1.5",
-]
-
-extras["testing"] = (
-    extras["cli"]
-    + extras["io"]
-    + extras["processing"]
-    + extras["s3"]
-    + [
-        "pytest",
-        "pytest-timeout",
-        "pytest-xdist",
-        "moto[s3,server]",
-    ]
-)
-
-extras["all"] = extras["quality"] + extras["testing"]
-
-extras["dev"] = extras["all"]
-
-setup(
-    name="datatrove",
-    version="0.0.1.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
-    description="HuggingFace library to process and filter large amounts of webdata",
-    long_description=open("README.md", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    author="HuggingFace Inc.",
-    author_email="guilherme@huggingface.co",
-    url="https://github.com/huggingface/datatrove",
-    license="Apache 2.0",
-    packages=find_packages("src"),
-    package_dir={"": "src"},
-    package_data={"": ["assets/*"]},
-    include_package_data=True,
-    python_requires=">=3.10.0",
-    install_requires=install_requires,
-    extras_require=extras,
-    classifiers=[
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.10",
-        "Programming Language :: Python :: 3.11",
-        "Programming Language :: Python :: 3.12",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    ],
-    keywords="data machine learning processing",
-    entry_points={
-        "console_scripts": [
-            "check_dataset=datatrove.tools.check_dataset:main",
-            "merge_stats=datatrove.tools.merge_stats:main",
-            "launch_pickled_pipeline=datatrove.tools.launch_pickled_pipeline:main",
-            "failed_logs=datatrove.tools.failed_logs:main",
-            "inspect_data=datatrove.tools.inspect_data:main",
-        ]
-    },
-)