
style: 💄 add flake8 to the quality checks + fix style issues
severo committed Aug 30, 2021
1 parent 5795ec5 commit c448056
Showing 17 changed files with 105 additions and 78 deletions.
5 changes: 5 additions & 0 deletions .flake8
@@ -0,0 +1,5 @@
[flake8]
# Recommend matching the black line length (119),
# rather than using the flake8 default of 79:
max-line-length = 119
extend-ignore = "E203"
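
For context (not part of this commit): E203 is typically excluded because Black, which this repository already uses, formats "complex" slices with spaces around the colon, and pycodestyle reports that as E203 ("whitespace before ':'"). A minimal Python illustration, with made-up values:

ham = list(range(100))
lower, upper, offset = 10, 20, 5

# Black leaves a simple slice untouched ...
simple = ham[lower:upper]

# ... but adds spaces around the colon when the bounds are expressions,
# which flake8 would flag as E203 without the extend-ignore above.
shifted = ham[lower + offset : upper + offset]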
2 changes: 2 additions & 0 deletions .github/workflows/quality.yml
@@ -28,5 +28,7 @@ jobs:
run: poetry run black --check tests src benchmark
- name: Run isort
run: poetry run isort --check-only tests src benchmark
- name: Run flake8
run: poetry run flake8 tests src benchmark
- name: Run safety
run: poetry run safety check
1 change: 1 addition & 0 deletions Makefile
@@ -20,6 +20,7 @@ test:
quality:
poetry run black --check tests src benchmark
poetry run isort --check-only tests src benchmark
poetry run flake8 tests src benchmark
poetry run safety check

# Format source code automatically
8 changes: 2 additions & 6 deletions benchmark/deprecated/test_datasets.py
@@ -153,14 +153,10 @@ def export_all_datasets_exceptions():
datasets_iterator = [{"dataset": dataset} for dataset in datasets]

# print("Get info for all the datasets")
info_reports = process_map(
get_info_report, datasets_iterator, max_workers=max_workers
)
info_reports = process_map(get_info_report, datasets_iterator, max_workers=max_workers)

print("Get config names for all the datasets")
configs_reports = process_map(
get_configs_report, datasets_iterator, max_workers=max_workers
)
configs_reports = process_map(get_configs_report, datasets_iterator, max_workers=max_workers)

print("Get split names for all the pairs (dataset, config)")
configs_iterator = []
4 changes: 1 addition & 3 deletions benchmark/scripts/get_serialized_dataset_names.py
@@ -9,9 +9,7 @@
def main(filename: str):
dataset_names = list_datasets(with_community_datasets=True)
# replace '/' in namespaced dataset names
serialized_dataset_names = [
serialize_dataset_name(dataset_name) for dataset_name in dataset_names
]
serialized_dataset_names = [serialize_dataset_name(dataset_name) for dataset_name in dataset_names]
# # current subdirectories
# dir_list = next(os.walk(path))[1]
# # to add
8 changes: 2 additions & 6 deletions benchmark/scripts/serialize.py
@@ -18,17 +18,13 @@ def deserialize_dataset_name(serialized_dataset: str) -> str:

def serialize_config_name(dataset: str, config: str) -> str:
# due to config named "(China)", "bbc hindi nli"
safe_config = (
config.replace("(", PAR_OPEN).replace(")", PAR_CLOSE).replace(" ", SPACE)
)
safe_config = config.replace("(", PAR_OPEN).replace(")", PAR_CLOSE).replace(" ", SPACE)
return serialize_dataset_name(dataset) + CONFIG_SEPARATOR + safe_config


def deserialize_config_name(serialized_config: str) -> Tuple[str, str]:
serialized_dataset, _, safe_config = serialized_config.partition(CONFIG_SEPARATOR)
config = (
safe_config.replace(PAR_OPEN, "(").replace(PAR_CLOSE, ")").replace(SPACE, " ")
)
config = safe_config.replace(PAR_OPEN, "(").replace(PAR_CLOSE, ")").replace(SPACE, " ")
dataset = deserialize_dataset_name(serialized_dataset)
return dataset, config

55 changes: 54 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

5 changes: 5 additions & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ aiohttp = "^3.7.4"
datasets = {extras = ["streaming"], git = "https://github.com/huggingface/datasets.git", rev = "b9fb8b2567aecfb14ad0bc31b59329f573eb35df"}
typer = "^0.3.2"
safety = "^1.10.3"
flake8 = "^3.9.2"

[tool.poetry.dev-dependencies]
pytest = "^6.2.4"
@@ -44,3 +45,7 @@ filterwarnings = [

[tool.isort]
profile = "black"

[tool.black]
line-length = 119
experimental-string-processing = true
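
For context (not part of this commit): experimental-string-processing lets Black split string literals that exceed the 119-character line length into implicitly concatenated parts, which is what produces the two-part error messages in the rows.py hunk further down. A rough sketch of the effect, with a made-up value:

extension = ".rar"  # illustration only, not a value from the repository

# A single literal longer than 119 characters gets rewritten by Black as
# implicit string concatenation across two lines:
message = (
    "The rows could not be extracted from the split of the dataset config because extension"
    f" {extension} is not supported."
)
print(message)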
4 changes: 1 addition & 3 deletions src/datasets_preview_backend/config.py
@@ -4,6 +4,4 @@
from datasets_preview_backend.utils import get_int_value

PORT = get_int_value(d=os.environ, key="PORT", default=DEFAULT_PORT)
EXTRACT_ROWS_LIMIT = get_int_value(
d=os.environ, key="EXTRACT_ROWS_LIMIT", default=DEFAULT_EXTRACT_ROWS_LIMIT
)
EXTRACT_ROWS_LIMIT = get_int_value(d=os.environ, key="EXTRACT_ROWS_LIMIT", default=DEFAULT_EXTRACT_ROWS_LIMIT)
4 changes: 1 addition & 3 deletions src/datasets_preview_backend/queries/configs.py
@@ -17,9 +17,7 @@ def get_configs(dataset: str):
except FileNotFoundError as err:
raise Status404Error("The dataset could not be found.") from err
except Exception as err:
raise Status400Error(
"The config names could not be parsed from the dataset."
) from err
raise Status400Error("The config names could not be parsed from the dataset.") from err

configs = [c.name for c in builder_cls.BUILDER_CONFIGS] or [DEFAULT_CONFIG_NAME]
logging.debug(f"The dataset builder has {len(configs)} configs: {configs}")
9 changes: 2 additions & 7 deletions src/datasets_preview_backend/queries/info.py
@@ -14,15 +14,10 @@ def get_info(dataset: str):
module_path, *_ = prepare_module(dataset, dataset=True)
builder_cls = import_main_class(module_path, dataset=True)
total_dataset_infos = builder_cls.get_all_exported_dataset_infos()
info = {
config_name: asdict(config_info)
for config_name, config_info in total_dataset_infos.items()
}
info = {config_name: asdict(config_info) for config_name, config_info in total_dataset_infos.items()}
except FileNotFoundError as err:
raise Status404Error("The dataset info could not be found.") from err
except Exception as err:
raise Status400Error(
"The dataset info could not be parsed from the dataset."
) from err
raise Status400Error("The dataset info could not be parsed from the dataset.") from err

return {"dataset": dataset, "info": info}
40 changes: 14 additions & 26 deletions src/datasets_preview_backend/queries/rows.py
@@ -23,53 +23,41 @@ def extract_rows(dataset: str, config: Union[str, None], split: str, num_rows: i
if not isinstance(num_rows, int):
raise TypeError("num_rows argument should be an int")

logging.debug(
f"asked for {num_rows} first rows of dataset {dataset} - {config} - {split}"
)
logging.debug(f"asked for {num_rows} first rows of dataset {dataset} - {config} - {split}")

try:
iterable_dataset: IterableDataset = load_dataset(
dataset, name=config, split=split, streaming=True
)
iterable_dataset: IterableDataset = load_dataset(dataset, name=config, split=split, streaming=True)
rows = list(iterable_dataset.take(num_rows))
except FileNotFoundError as err:
raise Status404Error(
"The split for the dataset config could not be found."
) from err
raise Status404Error("The split for the dataset config could not be found.") from err
except NotImplementedError as err:
# TODO: check what has changed once https://github.com/huggingface/datasets/pull/2662 is merged
try:
regex = re.compile(
r"Extraction protocol for file at .*?((\.\w+)?\.\w+)* is not implemented yet"
)
regex = re.compile(r"Extraction protocol for file at .*?((\.\w+)?\.\w+)* is not implemented yet")
extension = regex.match(str(err)).group(1)
except:
raise Status400Error(
"The rows could not be extracted from the split of the dataset config."
) from err
except Exception:
raise Status400Error("The rows could not be extracted from the split of the dataset config.") from err
else:
raise Status400Error(
f"The rows could not be extracted from the split of the dataset config because extension {extension} is not supported."
"The rows could not be extracted from the split of the dataset config because extension"
f" {extension} is not supported."
) from err
except ValueError as err:
if (
str(err).startswith(f"BuilderConfig {config} not found.")
or str(err).startswith(f"Config name is missing.")
or str(err).startswith(f"Bad split")
or str(err).startswith("Config name is missing.")
or str(err).startswith("Bad split")
):
raise Status404Error("The dataset config could not be found.") from err
else:
raise Status400Error(
"The rows could not be extracted from the split of the dataset config."
) from err
raise Status400Error("The rows could not be extracted from the split of the dataset config.") from err
except Exception as err:
raise Status400Error(
"The rows could not be extracted from the split of the dataset config."
) from err
raise Status400Error("The rows could not be extracted from the split of the dataset config.") from err

if len(rows) != num_rows:
logging.warning(
f"could not read all the required rows ({len(rows)} / {num_rows}) from dataset {dataset} - {config} - {split}"
f"could not read all the required rows ({len(rows)} / {num_rows}) from dataset {dataset} - {config} -"
f" {split}"
)

return {
18 changes: 5 additions & 13 deletions src/datasets_preview_backend/queries/splits.py
@@ -1,4 +1,4 @@
from typing import List, Union
from typing import Union

from datasets import load_dataset_builder
from datasets.utils.streaming_download_manager import StreamingDownloadManager
@@ -24,28 +24,20 @@ def get_splits(dataset: str, config: Union[str, None]):
if str(err).startswith(f"BuilderConfig {config} not found."):
raise Status404Error("The dataset config could not be found.") from err
else:
raise Status400Error(
"The split names could not be parsed from the dataset config."
) from err
raise Status400Error("The split names could not be parsed from the dataset config.") from err
except Exception as err:
raise Status400Error(
"The split names could not be parsed from the dataset config."
) from err
raise Status400Error("The split names could not be parsed from the dataset config.") from err

if builder.info.splits is None:
# try to get them from _split_generators
# should not be necessary once https://github.com/huggingface/datasets/issues/2743 is fixed
try:
splits = [
split_generator.name
for split_generator in builder._split_generators(
StreamingDownloadManager(base_path=builder.base_path)
)
for split_generator in builder._split_generators(StreamingDownloadManager(base_path=builder.base_path))
]
except Exception as err:
raise Status400Error(
"The split names could not be parsed from the dataset config."
) from err
raise Status400Error("The split names could not be parsed from the dataset config.") from err
else:
splits = list(builder.info.splits.keys())
return {"dataset": dataset, "config": config, "splits": splits}
4 changes: 1 addition & 3 deletions src/datasets_preview_backend/routes.py
@@ -65,9 +65,7 @@ async def rows(request: Request):
dataset: str = request.query_params.get("dataset")
config: Union[str, None] = request.query_params.get("config")
split: str = request.query_params.get("split")
num_rows = get_int_value(
d=request.query_params, key="rows", default=EXTRACT_ROWS_LIMIT
)
num_rows = get_int_value(d=request.query_params, key="rows", default=EXTRACT_ROWS_LIMIT)

try:
return JSONResponse(extract_rows(dataset, config, split, num_rows))
6 changes: 4 additions & 2 deletions tests/queries/test_configs.py
@@ -42,7 +42,8 @@ def test_import_nltk():


def test_script_error():
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status400Error):
get_configs("piEsposito/br-quad-2.0")

@@ -57,7 +58,8 @@ def test_no_dataset_no_script():
# the dataset does not contain a script
with pytest.raises(Status404Error):
get_configs("AConsApart/anime_subtitles_DialoGPT")
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status404Error):
get_configs("TimTreasure4/Test")

6 changes: 4 additions & 2 deletions tests/queries/test_info.py
@@ -39,7 +39,8 @@ def test_bad_type_argument():


def test_script_error():
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.br-quad-2'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status400Error):
get_info("piEsposito/br-quad-2.0")

@@ -54,7 +55,8 @@ def test_no_dataset_no_script():
# the dataset does not contain a script
with pytest.raises(Status404Error):
get_info("AConsApart/anime_subtitles_DialoGPT")
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'", which should be caught and raised as DatasetBuilderScriptError
# raises "ModuleNotFoundError: No module named 'datasets_modules.datasets.Test'"
# which should be caught and raised as DatasetBuilderScriptError
with pytest.raises(Status404Error):
get_info("TimTreasure4/Test")
