
style: 💄 add flake8 to the quality checks + fix style issues
severo committed Aug 30, 2021
1 parent 5795ec5 commit c448056
Showing 17 changed files with 105 additions and 78 deletions.
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
# Recommend matching the black line length (119),
# rather than using the flake8 default of 79:
max-line-length = 119
extend-ignore = "E203"
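
For context (not part of this commit): E203 is typically excluded because Black, which this repository already uses, formats "complex" slices with spaces around the colon, and pycodestyle reports that as E203 ("whitespace before ':'"). A minimal Python illustration, with made-up values:

ham = list(range(100))
lower, upper, offset = 10, 20, 5

# Black leaves a simple slice untouched ...
simple = ham[lower:upper]

# ... but adds spaces around the colon when the bounds are expressions,
# which flake8 would flag as E203 without the extend-ignore above.
shifted = ham[lower + offset : upper + offset]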
2 changes: 2 additions & 0 deletions .github/workflows/quality.yml
@@ -28,5 +28,7 @@ jobs:
run: poetry run black --check tests src benchmark
- name: Run isort
run: poetry run isort --check-only tests src benchmark
- name: Run flake8
run: poetry run flake8 tests src benchmark
- name: Run safety
run: poetry run safety check
1 change: 1 addition & 0 deletions Makefile
@@ -20,6 +20,7 @@ test:
quality:
poetry run black --check tests src benchmark
poetry run isort --check-only tests src benchmark
poetry run flake8 tests src benchmark
poetry run safety check

# Format source code automatically
8 changes: 2 additions & 6 deletions benchmark/deprecated/test_datasets.py
@@ -153,14 +153,10 @@ def export_all_datasets_exceptions():
datasets_iterator = [{"dataset": dataset} for dataset in datasets]

# print("Get info for all the datasets")
info_reports = process_map(
get_info_report, datasets_iterator, max_workers=max_workers
)
info_reports = process_map(get_info_report, datasets_iterator, max_workers=max_workers)

print("Get config names for all the datasets")
configs_reports = process_map(
get_configs_report, datasets_iterator, max_workers=max_workers
)
configs_reports = process_map(get_configs_report, datasets_iterator, max_workers=max_workers)

print("Get split names for all the pairs (dataset, config)")
configs_iterator = []
4 changes: 1 addition & 3 deletions benchmark/scripts/get_serialized_dataset_names.py
@@ -9,9 +9,7 @@
def main(filename: str):
dataset_names = list_datasets(with_community_datasets=True)
# replace '/' in namespaced dataset names
serialized_dataset_names = [
serialize_dataset_name(dataset_name) for dataset_name in dataset_names
]
serialized_dataset_names = [serialize_dataset_name(dataset_name) for dataset_name in dataset_names]
# # current subdirectories
# dir_list = next(os.walk(path))[1]
# # to add
8 changes: 2 additions & 6 deletions benchmark/scripts/serialize.py
@@ -18,17 +18,13 @@ def deserialize_dataset_name(serialized_dataset: str) -> str:

def serialize_config_name(dataset: str, config: str) -> str:
# due to config named "(China)", "bbc hindi nli"
safe_config = (
config.replace("(", PAR_OPEN).replace(")", PAR_CLOSE).replace(" ", SPACE)
)
safe_config = config.replace("(", PAR_OPEN).replace(")", PAR_CLOSE).replace(" ", SPACE)
return serialize_dataset_name(dataset) + CONFIG_SEPARATOR + safe_config


def deserialize_config_name(serialized_config: str) -> Tuple[str, str]:
serialized_dataset, _, safe_config = serialized_config.partition(CONFIG_SEPARATOR)
config = (
safe_config.replace(PAR_OPEN, "(").replace(PAR_CLOSE, ")").replace(SPACE, " ")
)
config = safe_config.replace(PAR_OPEN, "(").replace(PAR_CLOSE, ")").replace(SPACE, " ")
dataset = deserialize_dataset_name(serialized_dataset)
return dataset, config

55 changes: 54 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ aiohttp = "^3.7.4"
datasets = {extras = ["streaming"], git = "https://github.com/huggingface/datasets.git", rev = "b9fb8b2567aecfb14ad0bc31b59329f573eb35df"}
typer = "^0.3.2"
safety = "^1.10.3"
flake8 = "^3.9.2"

[tool.poetry.dev-dependencies]
pytest = "^6.2.4"
@@ -44,3 +45,7 @@ filterwarnings = [

[tool.isort]
profile = "black"

[tool.black]
line-length = 119
experimental-string-processing = true
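
For context (not part of this commit): experimental-string-processing lets Black split string literals that exceed the 119-character line length into implicitly concatenated parts, which is what produces the two-part error messages in the rows.py hunk further down. A rough sketch of the effect, with a made-up value:

extension = ".rar"  # illustration only, not a value from the repository

# A single literal longer than 119 characters gets rewritten by Black as
# implicit string concatenation across two lines:
message = (
    "The rows could not be extracted from the split of the dataset config because extension"
    f" {extension} is not supported."
)
print(message)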
4 changes: 1 addition & 3 deletions src/datasets_preview_backend/config.py
@@ -4,6 +4,4 @@
from datasets_preview_backend.utils import get_int_value

PORT = get_int_value(d=os.environ, key="PORT", default=DEFAULT_PORT)
EXTRACT_ROWS_LIMIT = get_int_value(
d=os.environ, key="EXTRACT_ROWS_LIMIT", default=DEFAULT_EXTRACT_ROWS_LIMIT
)
EXTRACT_ROWS_LIMIT = get_int_value(d=os.environ, key="EXTRACT_ROWS_LIMIT", default=DEFAULT_EXTRACT_ROWS_LIMIT)
4 changes: 1 addition & 3 deletions src/datasets_preview_backend/queries/configs.py
@@ -17,9 +17,7 @@ def get_configs(dataset: str):
except FileNotFoundError as err:
raise Status404Error("The dataset could not be found.") from err
except Exception as err:
raise Status400Error(
"The config names could not be parsed from the dataset."
) from err
raise Status400Error("The config names could not be parsed from the dataset.") from err

configs = [c.name for c in builder_cls.BUILDER_CONFIGS] or [DEFAULT_CONFIG_NAME]
logging.debug(f"The dataset builder has {len(configs)} configs: {configs}")
9 changes: 2 additions & 7 deletions src/datasets_preview_backend/queries/info.py
@@ -14,15 +14,10 @@ def get_info(dataset: str):
module_path, *_ = prepare_module(dataset, dataset=True)
builder_cls = import_main_class(module_path, dataset=True)
total_dataset_infos = builder_cls.get_all_exported_dataset_infos()
info = {
config_name: asdict(config_info)
for config_name, config_info in total_dataset_infos.items()
}
info = {config_name: asdict(config_info) for config_name, config_info in total_dataset_infos.items()}
except FileNotFoundError as err:
raise Status404Error("The dataset info could not be found.") from err
except Exception as err:
raise Status400Error(
"The dataset info could not be parsed from the dataset."
) from err
raise Status400Error("The dataset info could not be parsed from the dataset.") from err

return {"dataset": dataset, "info": info}
40 changes: 14 additions & 26 deletions src/datasets_preview_backend/queries/rows.py
@@ -23,53 +23,41 @@ def extract_rows(dataset: str, config: Union[str, None], split: str, num_rows: i
if not isinstance(num_rows, int):
raise TypeError("num_rows argument should be an int")

logging.debug(
f"asked for {num_rows} first rows of dataset {dataset} - {config} - {split}"
)
logging.debug(f"asked for {num_rows} first rows of dataset {dataset} - {config} - {split}")

try:
iterable_dataset: IterableDataset = load_dataset(
dataset, name=config, split=split, streaming=True
)
iterable_dataset: IterableDataset = load_dataset(dataset, name=config, split=split, streaming=True)
rows = list(iterable_dataset.take(num_rows))
except FileNotFoundError as err:
raise Status404Error(
"The split for the dataset config could not be found."
) from err
raise Status404Error("The split for the dataset config could not be found.") from err
except NotImplementedError as err:
# TODO: check what has changed once https://github.com/huggingface/datasets/pull/2662 is merged
try:
regex = re.compile(
r"Extraction protocol for file at .*?((\.\w+)?\.\w+)* is not implemented yet"
)
regex = re.compile(r"Extraction protocol for file at .*?((\.\w+)?\.\w+)* is not implemented yet")
extension = regex.match(str(err)).group(1)
except:
raise Status400Error(
"The rows could not be extracted from the split of the dataset config."
) from err
except Exception:
raise Status400Error("The rows could not be extracted from the split of the dataset config.") from err
else:
raise Status400Error(
f"The rows could not be extracted from the split of the dataset config because extension {extension} is not supported."
"The rows could not be extracted from the split of the dataset config because extension"
f" {extension} is not supported."
) from err
except ValueError as err:
if (
str(err).startswith(f"BuilderConfig {config} not found.")
or str(err).startswith(f"Config name is missing.")
or str(err).startswith(f"Bad split")
or str(err).startswith("Config name is missing.")
or str(err).startswith("Bad split")
):
raise Status404Error("The dataset config could not be found.") from err
else:
raise Status400Error(
"The rows could not be extracted from the split of the dataset config."
) from err
raise Status400Error("The rows could not be extracted from the split of the dataset config.") from err
except Exception as err:
raise Status400Error(
"The rows could not be extracted from the split of the dataset config."
) from err
raise Status400Error("The rows could not be extracted from the split of the dataset config.") from err

if len(rows) != num_rows:
logging.warning(
f"could not read all the required rows ({len(rows)} / {num_rows}) from dataset {dataset} - {config} - {split}"
f"could not read all the required rows ({len(rows)} / {num_rows}) from dataset {dataset} - {config} -"
f" {split}"
)

return {
18 changes: 5 additions & 13 deletions src/datasets_preview_backend/queries/splits.py
@@ -1,4 +1,4 @@
from typing import List, Union
from typing import Union

from datasets import load_dataset_builder
from datasets.utils.streaming_download_manager import StreamingDownloadManager
@@ -24,28 +24,20 @@ def get_splits(dataset: str, config: Union[str, None]):
if str(err).startswith(f"BuilderConfig {config} not found."):
raise Status404Error("The dataset config could not be found.") from err
else:
raise Status400Error(
"The split names could not be parsed from the dataset config."
) from err
raise Status400Error("The split names could not be parsed from the dataset config.") from err
except Exception as err:
raise Status400Error(
"The split names could not be parsed from the dataset config."
) from err
raise Status400Error("The split names could not be parsed from the dataset config.") from err

if builder.info.splits is None:
# try to get them from _split_generators
# should not be necessary once https://github.com/huggingface/datasets/issues/2743 is fixed
try:
splits = [
split_generator.name
for split_generator in builder._split_generators(
StreamingDownloadManager(base_path=builder.base_path)
)
for split_generator in builder._split_generators(StreamingDownloadManager(base_path=builder.base_path))
]
except Exception as err:
raise Status400Error(
"The split names could not be parsed from the dataset config."
) from err
raise Status400Error("The split names could not be parsed from the dataset config.") from err
else:
splits = list(builder.info.splits.keys())
return {"dataset": dataset, "config": config, "splits": splits}
4 changes: 1 addition & 3 deletions src/datasets_preview_backend/routes.py
@@ -65,9 +65,7 @@ async def rows(request: Request):
dataset: str = request.query_params.get("dataset")
config: Union[str, None] = request.query_params.get("config")
split: str = request.query_params.get("split")
num_rows = get_int_value(
d=request.query_params, key="rows", default=EXTRACT_ROWS_LIMIT
)
num_rows = get_int_value(d=request.query_params, key="rows", default=EXTRACT_ROWS_LIMIT)

try:
return JSONResponse(extract_rows(dataset, config, split, num_rows))
6 changes: 4 additions & 2 deletions tests/queries/test_configs.py
@@ -42,7 +42,8 @@ def test_import_nltk():


def test_script_error():
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status400Error):
get_configs("piEsposito/br-quad-2.0")

@@ -57,7 +58,8 @@ def test_no_dataset_no_script():
# the dataset does not contain a script
with pytest.raises(Status404Error):
get_configs("AConsApart/anime_subtitles_DialoGPT")
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status404Error):
get_configs("TimTreasure4/Test")

6 changes: 4 additions & 2 deletions tests/queries/test_info.py
@@ -39,7 +39,8 @@ def test_bad_type_argument():


def test_script_error():
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status400Error):
get_info("piEsposito/br-quad-2.0")

@@ -54,7 +55,8 @@ def test_no_dataset_no_script():
# the dataset does not contain a script
with pytest.raises(Status404Error):
get_info("AConsApart/anime_subtitles_DialoGPT")
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status404Error):
get_info("TimTreasure4/Test")
