From 65b8761db922513dada0320b860fabb1b4f01dc3 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Wed, 20 Dec 2023 19:14:46 +0500 Subject: [PATCH] Switch Linting to `ruff` (#1166) * add ruff and isort. remove black and flake8 * remove unnecessary dependencies * remove dependency from table * change order * ran ruff * check 3.9 * exclude evaluator * update CI workflow * use ruff config in pyproject.toml * test * add isort rules to ruff * sort imports * import `make_table` * try stages for no-commit-to-branch * turn on mypy for pre-commit * test * test * test * change no-commit-to-branch to default * nits * fixed dependency --- .github/workflows/new_tasks.yml | 2 +- .github/workflows/unit_tests.yml | 29 +++---- .pre-commit-config.yaml | 16 ++-- README.md | 5 +- lm_eval/__main__.py | 19 ++-- lm_eval/api/filter.py | 4 +- lm_eval/api/metrics.py | 9 +- lm_eval/api/model.py | 10 +-- lm_eval/api/registry.py | 10 +-- lm_eval/api/samplers.py | 10 +-- lm_eval/api/task.py | 86 ++++++++----------- lm_eval/decontamination/archiver.py | 13 +-- lm_eval/decontamination/decontaminate.py | 20 ++--- lm_eval/decontamination/janitor.py | 6 +- lm_eval/evaluator.py | 14 ++- lm_eval/filters/__init__.py | 2 +- lm_eval/models/anthropic_llms.py | 11 ++- lm_eval/models/dummy.py | 1 + lm_eval/models/gguf.py | 7 +- lm_eval/models/huggingface.py | 50 ++++++----- lm_eval/models/openai_completions.py | 38 ++++---- lm_eval/models/textsynth.py | 6 +- lm_eval/models/vllm_causallms.py | 28 +++--- lm_eval/prompts/__init__.py | 2 - lm_eval/tasks/__init__.py | 1 - lm_eval/tasks/bbh/_generate_configs.py | 2 - lm_eval/tasks/belebele/_generate_configs.py | 1 - lm_eval/tasks/bigbench/generate_tasks.py | 1 - lm_eval/tasks/blimp/generate_configs.py | 1 - lm_eval/tasks/ceval/_generate_configs.py | 5 +- lm_eval/tasks/cmmlu/_generate_configs.py | 5 +- lm_eval/tasks/code_x_glue/code-text/bleu.py | 11 +-- lm_eval/tasks/code_x_glue/code-text/utils.py | 2 - lm_eval/tasks/coqa/utils.py | 4 +- lm_eval/tasks/csatqa/_generate_configs.py | 2 - lm_eval/tasks/drop/utils.py | 1 - lm_eval/tasks/ifeval/instructions_registry.py | 27 ++---- lm_eval/tasks/ifeval/instructions_util.py | 1 - lm_eval/tasks/mgsm/utils.py | 1 - lm_eval/tasks/mmlu/_generate_configs.py | 3 - .../advanced_ai_risk/_generate_configs.py | 2 - .../persona/_generate_configs.py | 2 - lm_eval/tasks/paws-x/_generate_config.py | 1 - lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py | 3 +- lm_eval/tasks/qasper/utils.py | 1 - lm_eval/tasks/scrolls/task.py | 1 - lm_eval/tasks/squadv2/task.py | 5 +- lm_eval/tasks/super_glue/cb/t5_utils.py | 2 - lm_eval/tasks/super_glue/multirc/t5_utils.py | 3 - lm_eval/tasks/super_glue/record/t5_utils.py | 5 +- lm_eval/tasks/super_glue/wsc/t5_utils.py | 4 - lm_eval/tasks/translation/utils.py | 2 - lm_eval/tasks/truthfulqa/utils.py | 3 - lm_eval/tasks/xnli/utils.py | 1 - lm_eval/utils.py | 36 ++++---- mypy.ini | 34 ++++---- pyproject.toml | 23 +++-- scripts/build_benchmark.py | 10 +-- .../compress_and_package.py | 10 +-- .../clean_training_data/generate_13_grams.py | 19 ++-- .../clean_training_data/investigate_pile.py | 9 +- .../process_sorted_buckets.py | 12 ++- .../sort_13_gram_buckets.py | 11 ++- scripts/cost_estimate.py | 4 +- scripts/get_prompts.py | 4 +- scripts/make_gpt2_test_cases.py | 5 +- scripts/make_table_results.py | 5 +- scripts/make_table_tasks.py | 4 +- scripts/model_comparator.py | 14 +-- scripts/regression.py | 10 ++- scripts/write_out.py | 10 ++- setup.py | 1 + tests/models/test_gguf.py | 7 +- 
tests/models/test_huggingface.py | 20 +++-- tests/models/test_vllm.py | 9 +- tests/test_evaluator.py | 12 ++- tests/test_janitor.py | 12 +-- tests/test_misc.py | 4 +- tests/test_tasks.py | 6 +- tests/tests_master/test_description.py | 4 +- tests/tests_master/test_generate_13_grams.py | 10 +-- tests/tests_master/test_models.py | 5 +- tests/tests_master/test_version_stable.py | 16 ++-- tests/utils.py | 8 +- 84 files changed, 389 insertions(+), 446 deletions(-) diff --git a/.github/workflows/new_tasks.yml b/.github/workflows/new_tasks.yml index 76ab1be15b..0c4490f53a 100644 --- a/.github/workflows/new_tasks.yml +++ b/.github/workflows/new_tasks.yml @@ -56,7 +56,7 @@ jobs: if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | python -m pip install --upgrade pip - pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 07a85864b3..f981798fdf 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -17,29 +17,22 @@ jobs: linter: name: Linters runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 5 steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 cache: pip - cache-dependency-path: setup.py - - name: Install dependencies - run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu ; export SKIP=no-commit-to-branch # env var deactivates --no-commit-to-branch + cache-dependency-path: pyproject.toml - name: Pre-Commit + env: + SKIP: "no-commit-to-branch,mypy" + uses: pre-commit/action@v3.0.0 - - name: Lint with pylint - run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics # # mypy turned off for now # - name: Lint with mypy # run: mypy . 
--ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable @@ -53,17 +46,17 @@ jobs: timeout-minutes: 30 steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: pip - cache-dependency-path: setup.py + cache-dependency-path: pyproject.toml - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b4ae822c1..b5386cfda3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,14 +27,16 @@ repos: args: [--remove] - id: mixed-line-ending args: [--fix=lf] - - repo: https://github.com/pycqa/flake8 - rev: 3.7.9 + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.8 hooks: - - id: flake8 - - repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black + # Run the linter. + - id: ruff + args: + - --fix + # Run the formatter. + - id: ruff-format - repo: https://github.com/codespell-project/codespell rev: v2.1.0 hooks: diff --git a/README.md b/README.md index ef3dae54d8..9dbee8fbb2 100644 --- a/README.md +++ b/README.md @@ -49,11 +49,10 @@ pip install -e . We also provide a number of optional dependencies for extended functionality. 
Extras can be installed via `pip install -e ".[NAME]"` | Name | Use | -| ------------- | ------------------------------------- | +|---------------|---------------------------------------| | anthropic | For using Anthropic's models | -| dev | You probably don't want to use this | | gptq | For loading models with GPTQ | -| testing | You probably don't want to use this | +| dev | You probably don't want to use this | | multilingual | For multilingual tokenizers | | openai | For using OpenAI's models | | promptsource | For using PromtSource prompts | diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index ebb1b6c4ab..7fbee0dc73 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -1,17 +1,18 @@ +import argparse +import json +import logging import os import re import sys -import json -import logging -import argparse -import numpy as np - from pathlib import Path from typing import Union +import numpy as np + from lm_eval import evaluator, utils -from lm_eval.tasks import initialize_tasks, include_path from lm_eval.api.registry import ALL_TASKS +from lm_eval.tasks import include_path, initialize_tasks +from lm_eval.utils import make_table def _handle_non_serializable(o): @@ -170,7 +171,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: task_names = ALL_TASKS elif args.tasks == "list": eval_logger.info( - "Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS))) + "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))) ) sys.exit() else: @@ -271,9 +272,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}" ) - print(evaluator.make_table(results)) + print(make_table(results)) if "groups" in results: - print(evaluator.make_table(results, "groups")) + print(make_table(results, "groups")) if __name__ == "__main__": diff --git a/lm_eval/api/filter.py b/lm_eval/api/filter.py index ac69aa8ffd..bc26a1a637 100644 --- a/lm_eval/api/filter.py +++ b/lm_eval/api/filter.py @@ -1,9 +1,10 @@ from dataclasses import dataclass from typing import List -from lm_eval.api.instance import Instance from datasets import Dataset +from lm_eval.api.instance import Instance + class Filter: """ @@ -42,7 +43,6 @@ class FilterEnsemble: filters: List[Filter] def apply(self, instances: List[Instance], docs: List[Dataset]) -> None: - resps = [ inst.resps for inst in instances ] # operate just on the model responses diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 4eb68585b6..85a944c888 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -1,18 +1,19 @@ +import logging import math +import random from collections.abc import Iterable +import evaluate import numpy as np import sacrebleu import sklearn.metrics -import random -import evaluate -from lm_eval.api.registry import register_metric, register_aggregation +from lm_eval.api.registry import register_aggregation, register_metric -import logging eval_logger = logging.getLogger("lm-eval") + # Register Aggregations First @register_aggregation("mean") def mean(arr): diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 0f67095879..df829af592 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -1,17 +1,15 @@ import abc +import hashlib +import json +import logging import os +from typing import List, Optional, Tuple, Type, TypeVar -import torch -from typing 
import Union, List, Tuple, Optional, Type, TypeVar from sqlitedict import SqliteDict -import json -import hashlib - from tqdm import tqdm from lm_eval import utils -import logging eval_logger = logging.getLogger("lm-eval") diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 7d73ae6c5f..5fb9c011fc 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -1,8 +1,9 @@ -import os +import logging + import evaluate + from lm_eval.api.model import LM -import logging eval_logger = logging.getLogger("lm-eval") @@ -91,7 +92,6 @@ def decorate(fn): def register_metric(**args): # TODO: do we want to enforce a certain interface to registered metrics? def decorate(fn): - assert "metric" in args name = args["metric"] @@ -100,7 +100,6 @@ def decorate(fn): ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), ("aggregation", METRIC_AGGREGATION_REGISTRY), ]: - if key in args: value = args[key] assert ( @@ -120,7 +119,6 @@ def decorate(fn): def get_metric(name, hf_evaluate_metric=False): - if not hf_evaluate_metric: if name in METRIC_REGISTRY: return METRIC_REGISTRY[name] @@ -151,7 +149,6 @@ def decorate(fn): def get_aggregation(name): - try: return AGGREGATION_REGISTRY[name] except KeyError: @@ -161,7 +158,6 @@ def get_aggregation(name): def get_metric_aggregation(name): - try: return METRIC_AGGREGATION_REGISTRY[name] except KeyError: diff --git a/lm_eval/api/samplers.py b/lm_eval/api/samplers.py index 8a0d1e334d..57e3a6f1a4 100644 --- a/lm_eval/api/samplers.py +++ b/lm_eval/api/samplers.py @@ -40,18 +40,18 @@ def get_context(self, doc, num_fewshot): self.doc_to_text(doc) if ( self.config.doc_to_choice is None - or type(self.doc_to_text(doc)) is str + or isinstance(self.doc_to_text(doc), str) ) else self.doc_to_choice(doc)[self.doc_to_text(doc)] ) + self.target_delimiter + ( str(self.doc_to_target(doc)[0]) - if type(self.doc_to_target(doc)) is list + if isinstance(self.doc_to_target(doc), list) else self.doc_to_target(doc) if ( self.config.doc_to_choice is None - or type(self.doc_to_target(doc)) is str + or isinstance(self.doc_to_target(doc), str) ) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) ) @@ -77,8 +77,8 @@ def sample(self, n) -> None: Draw the first `n` samples in order from the specified split. Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. """ - assert n <= len( - self.docs + assert ( + n <= len(self.docs) ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." 
return self.docs[:n] diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 88ca412923..217349426c 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1,45 +1,35 @@ import abc -from dataclasses import dataclass, field, asdict - -import os -import re import ast -import yaml import logging -import evaluate +import os import random -import itertools -import functools -from tqdm import tqdm +import re +from collections.abc import Callable +from dataclasses import asdict, dataclass +from typing import Any, List, Literal, Tuple, Union import datasets import numpy as np -from typing import Union, List, Any, Tuple, Literal -from collections.abc import Callable - from lm_eval import utils from lm_eval.api import samplers from lm_eval.api.instance import Instance -from lm_eval.api.filter import FilterEnsemble - -from lm_eval.prompts import get_prompt -from lm_eval.filters import build_filter_ensemble from lm_eval.api.metrics import ( + bits_per_byte, mean, weighted_perplexity, - bits_per_byte, - metric_max_over_ground_truths, ) from lm_eval.api.registry import ( - get_metric, + AGGREGATION_REGISTRY, + DEFAULT_METRIC_REGISTRY, get_aggregation, + get_metric, get_metric_aggregation, is_higher_better, - DEFAULT_METRIC_REGISTRY, - OUTPUT_TYPE_REGISTRY, - AGGREGATION_REGISTRY, ) +from lm_eval.filters import build_filter_ensemble +from lm_eval.prompts import get_prompt + ALL_OUTPUT_TYPES = [ "loglikelihood", @@ -349,9 +339,7 @@ def build_all_requests(self, limit=None, rank=None, world_size=None) -> None: elif self.has_validation_docs(): docs = self.validation_docs() else: - assert ( - False - ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" eval_logger.info(f"Building contexts for task on rank {rank}...") @@ -603,9 +591,9 @@ def __init__( if "aggregation" in metric_config: agg_name = metric_config["aggregation"] - if type(agg_name) == str: + if isinstance(agg_name, str): self._aggregation_list[metric_name] = get_aggregation(agg_name) - elif callable(agg_name): + elif callable(agg_name): # noqa: E721 self._aggregation_list[metric_name] = metric_config[ "aggregation" ] @@ -672,9 +660,7 @@ def __init__( elif self.has_validation_docs(): self.task_docs = self.validation_docs() else: - assert ( - False - ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" 
# Test One Doc self.features = list(self.task_docs.features.keys()) @@ -686,20 +672,20 @@ def __init__( if self.config.doc_to_choice is not None: test_choice = self.doc_to_choice(test_doc) - if type(test_choice) is not list: + if not isinstance(test_choice, list): eval_logger.error("doc_to_choice must return list") else: num_choice = len(test_choice) - if type(test_text) is int: + if isinstance(test_text, int): self.multiple_input = num_choice else: test_choice = None - if type(test_target) is list: + if isinstance(test_target, list): self.multiple_target = len(test_target) else: - if (type(test_target) is int) and (test_choice is not None): + if (isinstance(test_target, int)) and (test_choice is not None): test_target = test_choice[test_target] else: test_target = str(test_target) @@ -808,11 +794,11 @@ def fewshot_context(self, doc, num_fewshot): ) example = self.doc_to_text(doc) - if type(example) == str: + if isinstance(example, str): return labeled_examples + example - elif type(example) == list: + elif isinstance(example, list): return [labeled_examples + ex for ex in example] - elif type(example) == int: + elif isinstance(example, int): if self.config.doc_to_choice is not None: choices = self.doc_to_choice(doc) return labeled_examples + choices[example] @@ -864,9 +850,9 @@ def doc_to_text(self, doc): else: doc_to_text = self.config.doc_to_text - if type(doc_to_text) == int: + if isinstance(doc_to_text, int): return doc_to_text - elif type(doc_to_text) == str: + elif isinstance(doc_to_text, str): if doc_to_text in self.features: # if self.config.doc_to_choice is not None: # return self.doc_to_choice(doc)[doc[doc_to_text]] @@ -898,9 +884,9 @@ def doc_to_target(self, doc: dict) -> Union[int, str, list]: else: doc_to_target = self.config.doc_to_target - if type(doc_to_target) == int: + if isinstance(doc_to_target, int): return doc_to_target - elif type(doc_to_target) == str: + elif isinstance(doc_to_target, str): if doc_to_target in self.features: # if self.config.doc_to_choice is not None: # return self.doc_to_choice(doc)[doc[doc_to_target]] @@ -921,7 +907,7 @@ def doc_to_target(self, doc: dict) -> Union[int, str, list]: return target_string else: return target_string - elif type(doc_to_target) == list: + elif isinstance(doc_to_target, list): return doc_to_target elif callable(doc_to_target): return doc_to_target(doc) @@ -944,14 +930,14 @@ def doc_to_choice(self, doc: Any) -> List[str]: else: doc_to_choice = self.config.doc_to_choice - if type(doc_to_choice) == str: + if isinstance(doc_to_choice, str): if doc_to_choice in self.features: return doc[doc_to_choice] else: return ast.literal_eval(utils.apply_template(doc_to_choice, doc)) - elif type(doc_to_choice) == list: + elif isinstance(doc_to_choice, list): return doc_to_choice - elif type(doc_to_choice) == dict: + elif isinstance(doc_to_choice, dict): return list(doc_to_choice.values()) elif callable(doc_to_choice): return doc_to_choice(doc) @@ -1078,14 +1064,14 @@ def process_results(self, doc, results): gold = self.doc_to_target(doc) gold_index_error = False - if type(gold) is list: + if isinstance(gold, list): gold = [i if i < len(choices) else -100 for i in gold] if -100 in gold: gold_index_error = True else: - if type(gold) is int: + if isinstance(gold, int): gold = gold if gold < len(choices) else -100 - elif type(gold) is str: + elif isinstance(gold, str): gold = choices.index(gold) if gold in choices else -100 if gold == -100: @@ -1175,9 +1161,7 @@ def process_results(self, doc, results): predictions=[result], 
**self._metric_fn_kwargs[metric], ) - except ( - TypeError - ): # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics + except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics result_score = self._metric_fn_list[metric]([gold, result]) if isinstance(result_score, dict): # TODO: this handles the case where HF evaluate returns a dict. diff --git a/lm_eval/decontamination/archiver.py b/lm_eval/decontamination/archiver.py index 3b5f09f525..e6bff33f0c 100644 --- a/lm_eval/decontamination/archiver.py +++ b/lm_eval/decontamination/archiver.py @@ -1,13 +1,14 @@ +import datetime +import io +import json +import mmap import os +from pathlib import Path from typing import Any -import zstandard -import json + import jsonlines -import io -import datetime -import mmap import tqdm -from pathlib import Path +import zstandard def json_serial(obj: Any) -> str: diff --git a/lm_eval/decontamination/decontaminate.py b/lm_eval/decontamination/decontaminate.py index 447eae52bf..f5b4157c67 100644 --- a/lm_eval/decontamination/decontaminate.py +++ b/lm_eval/decontamination/decontaminate.py @@ -1,13 +1,13 @@ -import time -import random -import pickle -import json +import collections import glob +import json import os -import collections +import pickle +import random +import time -from .janitor import Janitor, word_ngrams from .archiver import ZStdTextReader +from .janitor import Janitor, word_ngrams # Was used for testing the evaluator decoupled from the full logic below @@ -109,7 +109,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str: print(f"Merging lookups took {elapsed:0.5f} seconds.") print(f"{ngrams_n_size} grams files found in {ngrams_path}:") - files = glob.glob(os.path.join(ngrams_path, f"*.sorted.zst")) + files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst")) print(files) for file in files: @@ -135,11 +135,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str: matching_unique += 1 for task_name, task_set, doc_ids in merged_lookup[ngram]: task_doc_set = duplicates[(task_name, task_set)] - for ( - doc_id - ) in ( - doc_ids - ): # Record contamination across all relevant task/set combos + for doc_id in doc_ids: # Record contamination across all relevant task/set combos task_doc_set.add(doc_id) del merged_lookup[ngram] # No point matching again else: diff --git a/lm_eval/decontamination/janitor.py b/lm_eval/decontamination/janitor.py index 5ad84d13df..cedf8a5717 100644 --- a/lm_eval/decontamination/janitor.py +++ b/lm_eval/decontamination/janitor.py @@ -1,9 +1,9 @@ +import pickle import re import string -import pickle import traceback -from pprint import pprint -from typing import Iterator, Sequence, TypeVar, List, Tuple +from typing import Iterator, List, Sequence, Tuple, TypeVar + # This is a cpp module. 
Compile janitor_util.cpp with: # c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index cb5d0f53ae..5d277a6bf7 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -1,8 +1,6 @@ import random import itertools -import json import collections -import sys import torch @@ -17,8 +15,6 @@ from lm_eval.utils import ( positional_deprecated, run_task_tests, - make_table, - create_iterator, get_git_commit_hash, simple_parse_args_string, eval_logger, @@ -91,7 +87,7 @@ def simple_evaluate( if gen_kwargs is not None: gen_kwargs = simple_parse_args_string(gen_kwargs) eval_logger.warning( - f"generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks." + "generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks." ) if gen_kwargs == "": gen_kwargs = None @@ -118,7 +114,9 @@ def simple_evaluate( use_cache # each rank receives a different cache db. # necessary to avoid multiple writes to cache at once - + "_rank" + str(lm.rank) + ".db", + + "_rank" + + str(lm.rank) + + ".db", ) task_dict = lm_eval.tasks.get_task_dict(tasks) @@ -513,9 +511,7 @@ def evaluate( ) + total_size * current_size / ( (total_size + current_size) * (total_size + current_size - 1) - ) * ( - results[group][metric] - metric_score - ) ** 2 + ) * (results[group][metric] - metric_score) ** 2 else: results[group][metric] = metric_score results[group][stderr] = var_score diff --git a/lm_eval/filters/__init__.py b/lm_eval/filters/__init__.py index c74ac01593..76eb78467e 100644 --- a/lm_eval/filters/__init__.py +++ b/lm_eval/filters/__init__.py @@ -32,7 +32,7 @@ def build_filter_ensemble(filter_name, components): Create a filtering pipeline. 
""" filters = [] - for (function, kwargs) in components: + for function, kwargs in components: if kwargs is None: f = get_filter(function)() else: diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py index 18b1b70a38..6e5b437875 100644 --- a/lm_eval/models/anthropic_llms.py +++ b/lm_eval/models/anthropic_llms.py @@ -1,9 +1,12 @@ -from lm_eval.api.model import LM -from lm_eval.api.registry import register_model -from tqdm import tqdm import time +from typing import Any, List, Tuple + +from tqdm import tqdm + from lm_eval import utils -from typing import List, Any, Tuple +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model + eval_logger = utils.eval_logger diff --git a/lm_eval/models/dummy.py b/lm_eval/models/dummy.py index b13a3900f9..d28435f7ea 100644 --- a/lm_eval/models/dummy.py +++ b/lm_eval/models/dummy.py @@ -1,4 +1,5 @@ import random + from lm_eval.api.model import LM from lm_eval.api.registry import register_model diff --git a/lm_eval/models/gguf.py b/lm_eval/models/gguf.py index 5ae154f39a..8eebc2e04f 100644 --- a/lm_eval/models/gguf.py +++ b/lm_eval/models/gguf.py @@ -1,11 +1,14 @@ -import requests import logging import time -from tqdm import tqdm + +import requests from requests.exceptions import RequestException +from tqdm import tqdm + from lm_eval.api.model import LM from lm_eval.api.registry import register_model + logger = logging.getLogger(__name__) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index b32ffc34e3..dc243a1a5c 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -1,29 +1,28 @@ +import copy import os -from packaging import version +from collections import defaultdict +from pathlib import Path +from typing import List, Literal, Optional, Tuple, Union + import torch +import torch.nn.functional as F import transformers +from accelerate import Accelerator, DistributedType, find_executable_batch_size +from packaging import version +from peft import PeftModel +from peft import __version__ as PEFT_VERSION +from tqdm import tqdm from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, ) -from peft import __version__ as PEFT_VERSION, PeftModel - -import copy -from collections import defaultdict -from tqdm import tqdm -from pathlib import Path - -import torch.nn.functional as F from lm_eval import utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM from lm_eval.api.registry import register_model +from lm_eval.utils import stop_sequences_criteria -from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria - -from accelerate import Accelerator, find_executable_batch_size, DistributedType -from typing import List, Optional, Union, Tuple, Literal eval_logger = utils.eval_logger @@ -107,9 +106,7 @@ def __init__( eval_logger.warning( "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way." ) - assert ( - not parallelize - ), "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" + assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" self._model = pretrained self._device = self._model.device @@ -279,10 +276,13 @@ def __init__( "with 'accelerate launch *script*'. 
" f"Current run will proceed with {accelerator.num_processes} devices." ) - assert accelerator.distributed_type in [ - DistributedType.FSDP, - DistributedType.MULTI_GPU, - ], "Unsupported distributed type provided. Only DDP and FSDP are supported." + assert ( + accelerator.distributed_type + in [ + DistributedType.FSDP, + DistributedType.MULTI_GPU, + ] + ), "Unsupported distributed type provided. Only DDP and FSDP are supported." if accelerator.distributed_type == DistributedType.FSDP: self._model = accelerator.prepare(self.model) else: @@ -417,7 +417,6 @@ def _get_config( revision: str = "main", trust_remote_code: bool = False, ) -> None: - self._config = transformers.AutoConfig.from_pretrained( pretrained, revision=revision, @@ -751,8 +750,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) @@ -995,9 +995,7 @@ def _collate(x): greedy_tokens = logits.argmax(dim=-1) cont_toks = torch.tensor( cont_toks, dtype=torch.long, device=self.device - ).unsqueeze( - 0 - ) # [1, seq] + ).unsqueeze(0) # [1, seq] max_equal = (greedy_tokens == cont_toks).all() # Obtain log-probs at the corresponding continuation token indices diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index b0f6a8f170..d63f8ab12a 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -1,9 +1,10 @@ +import copy import os import time -from typing import List, Tuple, Optional - -import copy from collections import defaultdict +from importlib.util import find_spec +from typing import List, Optional, Tuple + from tqdm import tqdm from lm_eval import utils @@ -44,13 +45,13 @@ def oa_completion(**kwargs): Retry with back-off until they respond """ - try: - import openai, tiktoken # noqa: E401 - except ModuleNotFoundError: + if not find_spec("openai") or not find_spec("tiktoken"): raise Exception( - "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ -please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`", + "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. " + "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`" ) + else: + import openai backoff_time = 3 while True: @@ -88,7 +89,8 @@ def __init__( super().__init__() self.seed = seed try: - import openai, tiktoken # noqa: E401 + import openai # noqa: E401 + import tiktoken except ModuleNotFoundError: raise Exception( "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. 
\ @@ -154,8 +156,9 @@ def loglikelihood(self, requests) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) @@ -326,13 +329,13 @@ def oa_chat_completion(client, **kwargs): Retry with back-off until they respond """ - try: - import openai, tiktoken # noqa: E401 - except ModuleNotFoundError: + if not find_spec("openai") or not find_spec("tiktoken"): raise Exception( - "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ -please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`", + "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. " + "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`" ) + else: + import openai async def _get_completions(**kwargs): chat_completions = await client.chat.completions.create(**kwargs) @@ -364,7 +367,8 @@ def __init__( """ super().__init__() try: - import openai, tiktoken # noqa: E401 + import openai # noqa: E401 + import tiktoken except ModuleNotFoundError: raise Exception( "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ diff --git a/lm_eval/models/textsynth.py b/lm_eval/models/textsynth.py index 379f11b902..32917d692c 100644 --- a/lm_eval/models/textsynth.py +++ b/lm_eval/models/textsynth.py @@ -13,9 +13,11 @@ """ import logging import os -import requests as _requests import time + +import requests as _requests from tqdm import tqdm + from lm_eval.api.model import LM from lm_eval.api.registry import register_model @@ -149,7 +151,7 @@ def generate_until(self, requests): self.cache_hook.add_partial("generate_until", (inp, request_args), s) else: logger.error( - f"The following response does not contain generated `text`. " + "The following response does not contain generated `text`. " "Got:\n{resp}" ) assert False diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index edab369411..e6a75ceb21 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -1,16 +1,19 @@ +import copy from collections import defaultdict -from typing import List, Tuple, Optional, Literal, Union, Any -from transformers import AutoTokenizer +from importlib.util import find_spec +from typing import List, Literal, Optional, Tuple, Union + +from tqdm import tqdm + +from lm_eval import utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM -import copy -from tqdm import tqdm from lm_eval.api.registry import register_model -from lm_eval import utils + try: - from vllm import LLM, SamplingParams from ray.util.multiprocessing import Pool + from vllm import LLM, SamplingParams from vllm.transformers_utils.tokenizer import get_tokenizer except ModuleNotFoundError: pass @@ -54,12 +57,10 @@ def __init__( ): super().__init__() - try: - import vllm - except ModuleNotFoundError: + if not find_spec("vllm"): raise Exception( - "attempted to use 'vllm' LM type, but package `vllm` is not installed. \ -please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`", + "attempted to use 'vllm' LM type, but package `vllm` is not installed. 
" + "Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`" ) assert "cuda" in device or device is None, "vLLM only supports CUDA" @@ -193,8 +194,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index d058a48776..d8b62e7deb 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -69,7 +69,6 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None def load_prompt_list( use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs ): - category_name, prompt_name = use_prompt.split(":") if category_name == "promptsource": @@ -113,7 +112,6 @@ def __init__(self, prompt_string): self.prompt_string = prompt_string def apply(self, doc): - doc_to_text = self.prompt_string["doc_to_text"] doc_to_target = self.prompt_string["doc_to_target"] diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 28563de6bf..ed92bd9755 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -180,7 +180,6 @@ def include_path(task_dir): def initialize_tasks(verbosity="INFO"): - eval_logger.setLevel(getattr(logging, f"{verbosity}")) task_dir = os.path.dirname(os.path.abspath(__file__)) + "/" diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index d2a53cfb6c..18a55c705a 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -24,7 +24,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -37,7 +36,6 @@ def parse_args(): dataset_path = "lukaemon/bbh" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): - resp = requests.get( f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt" ).content.decode("utf-8") diff --git a/lm_eval/tasks/belebele/_generate_configs.py b/lm_eval/tasks/belebele/_generate_configs.py index 9df56f5feb..052d55bea2 100644 --- a/lm_eval/tasks/belebele/_generate_configs.py +++ b/lm_eval/tasks/belebele/_generate_configs.py @@ -23,7 +23,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py index fa8619f40c..3fd5cd6c2b 100644 --- a/lm_eval/tasks/bigbench/generate_tasks.py +++ b/lm_eval/tasks/bigbench/generate_tasks.py @@ -173,7 +173,6 @@ def main() -> None: - for path, task_type in zip( ["multiple_choice", "generate_until"], ["multiple_choice_template_yaml", "generate_until_template_yaml"], diff --git a/lm_eval/tasks/blimp/generate_configs.py b/lm_eval/tasks/blimp/generate_configs.py index 4fa45db4d2..dfc4b4dc95 100644 --- a/lm_eval/tasks/blimp/generate_configs.py +++ b/lm_eval/tasks/blimp/generate_configs.py @@ -73,7 +73,6 @@ def main() -> None: for task in all_subtasks: - file_name = f"{task}.yaml" try: with open(f"{file_name}", "w") as f: diff --git a/lm_eval/tasks/ceval/_generate_configs.py b/lm_eval/tasks/ceval/_generate_configs.py index deaa0372c8..2b96e00713 100644 --- a/lm_eval/tasks/ceval/_generate_configs.py +++ b/lm_eval/tasks/ceval/_generate_configs.py @@ -75,7 +75,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -93,7 +92,9 @@ def parse_args(): if args.cot_prompt_path is not None: description = cot_file[subject_eng] else: - description = f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" + description = ( + f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" + ) yaml_dict = { "include": base_yaml_name, diff --git a/lm_eval/tasks/cmmlu/_generate_configs.py b/lm_eval/tasks/cmmlu/_generate_configs.py index 4b3dba75b1..07553bb1ea 100644 --- a/lm_eval/tasks/cmmlu/_generate_configs.py +++ b/lm_eval/tasks/cmmlu/_generate_configs.py @@ -90,7 +90,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
@@ -108,7 +107,9 @@ def parse_args(): if args.cot_prompt_path is not None: description = cot_file[subject_eng] else: - description = f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n" + description = ( + f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n" + ) yaml_dict = { "include": base_yaml_name, diff --git a/lm_eval/tasks/code_x_glue/code-text/bleu.py b/lm_eval/tasks/code_x_glue/code-text/bleu.py index 310c626c73..a90fc46b17 100644 --- a/lm_eval/tasks/code_x_glue/code-text/bleu.py +++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py @@ -1,9 +1,7 @@ #!/usr/bin/python -import os import re import sys import math -import subprocess import xml.sax.saxutils from typing import List, Pattern, Tuple, Union, Dict, Any, Optional @@ -65,14 +63,14 @@ def normalize(s): if type(s) is not str: s = " ".join(s) # language-independent part: - for (pattern, replace) in normalize1: + for pattern, replace in normalize1: s = re.sub(pattern, replace, s) s = xml.sax.saxutils.unescape(s, {"&quot;": '"'}) # language-dependent part (assuming Western languages): s = " %s " % s if not preserve_case: s = s.lower() # this might not be identical to the original - for (pattern, replace) in normalize2: + for pattern, replace in normalize2: s = re.sub(pattern, replace, s) return s.split() @@ -95,7 +93,7 @@ def cook_refs(refs, n=4): maxcounts: Dict[Tuple[str], int] = {} for ref in refs: counts = count_ngrams(ref, n) - for (ngram, count) in counts.items(): + for ngram, count in counts.items(): maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) return ([len(ref) for ref in refs], maxcounts) @@ -125,7 +123,7 @@ def cook_test(test, item, n=4): result["correct"] = [0] * n counts = count_ngrams(test, n) - for (ngram, count) in counts.items(): + for ngram, count in counts.items(): result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) return result @@ -222,7 +220,6 @@ def bleuFromMaps(m1, m2): def smoothed_bleu_4(references, predictions, **kwargs): - predictionMap = {} goldMap = {} diff --git a/lm_eval/tasks/code_x_glue/code-text/utils.py b/lm_eval/tasks/code_x_glue/code-text/utils.py index 981a00b912..6975684259 100644 --- a/lm_eval/tasks/code_x_glue/code-text/utils.py +++ b/lm_eval/tasks/code_x_glue/code-text/utils.py @@ -1,5 +1,4 @@ def doc_to_text(doc): - inputs = " ".join(doc["code_tokens"]).replace("\n", " ") inputs = " ".join(inputs.strip().split()) @@ -7,7 +6,6 @@ def doc_to_text(doc): def doc_to_target(doc): - targets = " ".join(doc["docstring_tokens"]).replace("\n", "") targets = " ".join(targets.strip().split()) diff --git a/lm_eval/tasks/coqa/utils.py b/lm_eval/tasks/coqa/utils.py index 4fed8ff8c2..29911cfec5 100644 --- a/lm_eval/tasks/coqa/utils.py +++ b/lm_eval/tasks/coqa/utils.py @@ -7,7 +7,7 @@ def doc_to_text(doc): # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1} # and a question qi, the task is to predict the answer ai doc_text = doc["story"] + "\n\n" - for (q, a) in zip_longest( + for q, a in zip_longest( doc["questions"]["input_text"], doc["answers"]["input_text"][:-1] ): # omit target answer ai question = f"Q: {q}\n\n" @@ -17,7 +17,6 @@ def doc_to_text(doc): def doc_to_target(doc): - turn_id = len(doc["questions"]["input_text"]) # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
answers = [] @@ -71,7 +70,6 @@ def compute_scores(gold_list, pred): def process_results(doc, results): - gold_list = doc_to_target(doc) pred = results[0].strip().split("\n")[0] diff --git a/lm_eval/tasks/csatqa/_generate_configs.py b/lm_eval/tasks/csatqa/_generate_configs.py index ca2bfc436e..56fe825a90 100644 --- a/lm_eval/tasks/csatqa/_generate_configs.py +++ b/lm_eval/tasks/csatqa/_generate_configs.py @@ -21,7 +21,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -30,7 +29,6 @@ def parse_args(): base_yaml = yaml.full_load(f) for name in tqdm(SUBSETS): - yaml_dict = { "include": base_yaml_name, "task": f"csatqa_{args.task_prefix}_{name}" diff --git a/lm_eval/tasks/drop/utils.py b/lm_eval/tasks/drop/utils.py index 1e2888ce3e..03f7218c90 100644 --- a/lm_eval/tasks/drop/utils.py +++ b/lm_eval/tasks/drop/utils.py @@ -62,7 +62,6 @@ def parse_answer(answer): def process_results(doc, results): - preds, golds = results, doc["answers"] max_em = 0 max_f1 = 0 diff --git a/lm_eval/tasks/ifeval/instructions_registry.py b/lm_eval/tasks/ifeval/instructions_registry.py index 1056b139e2..ecb20e9b23 100644 --- a/lm_eval/tasks/ifeval/instructions_registry.py +++ b/lm_eval/tasks/ifeval/instructions_registry.py @@ -78,8 +78,7 @@ # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, - _LANGUAGE - + "response_language": { + _LANGUAGE + "response_language": { _LANGUAGE + "response_language", _FORMAT + "multiple_sections", _KEYWORD + "existence", @@ -90,16 +89,14 @@ _CHANGE_CASES + "english_lowercase", }, _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, - _LENGTH - + "number_paragraphs": { + _LENGTH + "number_paragraphs": { _LENGTH + "number_paragraphs", _LENGTH + "nth_paragraph_first_word", _LENGTH + "number_sentences", _LENGTH + "nth_paragraph_first_word", }, _LENGTH + "number_words": {_LENGTH + "number_words"}, - _LENGTH - + "nth_paragraph_first_word": { + _LENGTH + "nth_paragraph_first_word": { _LENGTH + "nth_paragraph_first_word", _LENGTH + "number_paragraphs", }, @@ -110,23 +107,20 @@ # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, - _FORMAT - + "multiple_sections": { + _FORMAT + "multiple_sections": { _FORMAT + "multiple_sections", _LANGUAGE + "response_language", _FORMAT + "number_highlighted_sections", }, # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. # _FORMAT + "rephrase": instructions.RephraseChecker, - _FORMAT - + "json_format": set(INSTRUCTION_DICT.keys()).difference( + _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} ), _FORMAT + "title": {_FORMAT + "title"}, # TODO(tianjianlu): Re-enable with specific prompts. 
# _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, - _COMBINATION - + "two_responses": set(INSTRUCTION_DICT.keys()).difference( + _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( { _KEYWORD + "forbidden_words", _KEYWORD + "existence", @@ -135,20 +129,17 @@ _PUNCTUATION + "no_comma", } ), - _COMBINATION - + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( + _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} ), _STARTEND + "end_checker": {_STARTEND + "end_checker"}, - _CHANGE_CASES - + "capital_word_frequency": { + _CHANGE_CASES + "capital_word_frequency": { _CHANGE_CASES + "capital_word_frequency", _CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_capital", }, _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, - _CHANGE_CASES - + "english_lowercase": { + _CHANGE_CASES + "english_lowercase": { _CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_capital", }, diff --git a/lm_eval/tasks/ifeval/instructions_util.py b/lm_eval/tasks/ifeval/instructions_util.py index 2390cba305..ccb531f96e 100644 --- a/lm_eval/tasks/ifeval/instructions_util.py +++ b/lm_eval/tasks/ifeval/instructions_util.py @@ -17,7 +17,6 @@ import functools import random import re -from typing import List import immutabledict import nltk diff --git a/lm_eval/tasks/mgsm/utils.py b/lm_eval/tasks/mgsm/utils.py index 97affac765..3edc78ab28 100644 --- a/lm_eval/tasks/mgsm/utils.py +++ b/lm_eval/tasks/mgsm/utils.py @@ -94,7 +94,6 @@ def add_regex_pattern(regex_pattern): - if regex_pattern is None: return {} return { diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index 2bf27ac0f7..e6271bc4c2 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -7,7 +7,6 @@ from tqdm import tqdm -from lm_eval import utils from lm_eval.logger import eval_logger SUBJECTS = { @@ -82,7 +81,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs. 
@@ -98,7 +96,6 @@ def parse_args(): ALL_CATEGORIES = [] for subject, category in tqdm(SUBJECTS.items()): - if category not in ALL_CATEGORIES: ALL_CATEGORIES.append(category) diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py index ca199226a8..aecb40a5eb 100644 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py @@ -1,12 +1,10 @@ import yaml -import inspect import datasets from tqdm import tqdm def main() -> None: - dataset_path = "EleutherAI/advanced_ai_risk" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): file_name = f"{task}.yaml" diff --git a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py index a21f28309b..7aff892f03 100644 --- a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py +++ b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py @@ -1,12 +1,10 @@ import yaml -import inspect import datasets from tqdm import tqdm def main() -> None: - dataset_path = "EleutherAI/persona" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): file_name = f"{task}.yaml" diff --git a/lm_eval/tasks/paws-x/_generate_config.py b/lm_eval/tasks/paws-x/_generate_config.py index bff82e4ff0..a1341fec89 100644 --- a/lm_eval/tasks/paws-x/_generate_config.py +++ b/lm_eval/tasks/paws-x/_generate_config.py @@ -1,5 +1,4 @@ import argparse -from typing import Dict, List import yaml diff --git a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py index 51c198703f..0dccf9408a 100644 --- a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py +++ b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py @@ -1,5 +1,6 @@ def doc_to_text(doc) -> str: ctxs = "\n".join(doc["CONTEXTS"]) return "Abstract: {}\nQuestion: {}\nAnswer:".format( - ctxs, doc["QUESTION"], doc["final_decision"] + ctxs, + doc["QUESTION"], ) diff --git a/lm_eval/tasks/qasper/utils.py b/lm_eval/tasks/qasper/utils.py index be6f79dcad..7a02237a78 100644 --- a/lm_eval/tasks/qasper/utils.py +++ b/lm_eval/tasks/qasper/utils.py @@ -3,7 +3,6 @@ def process_docs(dataset, set_answer_type="bool"): - FEATURES = ["title", "abstract", "question", "answer", "answer_type"] def _categorise_answer(answer_blob): diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index e44296a4e0..829e97bde0 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -235,7 +235,6 @@ def process_results(self, doc, results): } def construct_requests(self, doc, ctx, **kwargs): - request_list = [ Instance( request_type="loglikelihood", diff --git a/lm_eval/tasks/squadv2/task.py b/lm_eval/tasks/squadv2/task.py index 4630e2a161..ba308acd43 100644 --- a/lm_eval/tasks/squadv2/task.py +++ b/lm_eval/tasks/squadv2/task.py @@ -14,7 +14,6 @@ Homepage: https://rajpurkar.github.io/SQuAD-explorer/ """ import datasets -from evaluate import load from math import exp from functools import partial @@ -120,14 +119,14 @@ def construct_requests(self, doc, ctx, **kwargs): doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, - **kwargs + **kwargs, ), Instance( request_type="loglikelihood", doc=doc, arguments=(ctx, " " + "unanswerable"), idx=0, - **kwargs + **kwargs, ), ] diff --git a/lm_eval/tasks/super_glue/cb/t5_utils.py b/lm_eval/tasks/super_glue/cb/t5_utils.py index 43eafce9d6..ec02e34538 100644 --- 
a/lm_eval/tasks/super_glue/cb/t5_utils.py +++ b/lm_eval/tasks/super_glue/cb/t5_utils.py @@ -2,7 +2,6 @@ def mean_3class_f1(predictions, references): # This is a passthrough function - string_label = ["entailment", "contradiction", "neutral"] predictions = ( string_label.index(predictions[0]) if predictions[0] in string_label else 0 @@ -13,7 +12,6 @@ def mean_3class_f1(predictions, references): # This is a passthrough function def agg_mean_3class_f1(items): - predictions, references = zip(*items) """Computes the unweighted average of the F1 per class.""" diff --git a/lm_eval/tasks/super_glue/multirc/t5_utils.py b/lm_eval/tasks/super_glue/multirc/t5_utils.py index ac99aaf962..d17d498fa2 100644 --- a/lm_eval/tasks/super_glue/multirc/t5_utils.py +++ b/lm_eval/tasks/super_glue/multirc/t5_utils.py @@ -5,7 +5,6 @@ def f1(predictions, references): # This is a passthrough function - _prediction = predictions[0] _reference = references[0].split("_")[-1] string_label = ["False", "True"] @@ -20,7 +19,6 @@ def f1(predictions, references): # This is a passthrough function def agg_f1(items): - predictions, references = zip(*items) references, predictions = np.asarray(references), np.asarray(predictions) @@ -28,7 +26,6 @@ def agg_f1(items): def em(predictions, references): # This is a passthrough function - _prediction = predictions[0] _group, _reference = references[0].split("_") string_label = ["False", "True"] diff --git a/lm_eval/tasks/super_glue/record/t5_utils.py b/lm_eval/tasks/super_glue/record/t5_utils.py index 98730cacd4..68301b18b3 100644 --- a/lm_eval/tasks/super_glue/record/t5_utils.py +++ b/lm_eval/tasks/super_glue/record/t5_utils.py @@ -3,14 +3,12 @@ import collections import numpy as np -from tqdm import tqdm -from datasets import Dataset, concatenate_datasets +from datasets import Dataset from lm_eval.api.metrics import metric_max_over_ground_truths def doc_to_text(doc): - passage = doc["passage"] passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage) passage = re.sub(r"\n@highlight\n", ". 
", passage) @@ -34,7 +32,6 @@ def split_answers(doc): } answers = doc.pop("answers") for idx, answer in enumerate(answers): - for key in split_doc.keys(): if key in doc: split_doc[key].append(doc[key]) diff --git a/lm_eval/tasks/super_glue/wsc/t5_utils.py b/lm_eval/tasks/super_glue/wsc/t5_utils.py index 7e55a52a7b..eb5331a42a 100644 --- a/lm_eval/tasks/super_glue/wsc/t5_utils.py +++ b/lm_eval/tasks/super_glue/wsc/t5_utils.py @@ -8,7 +8,6 @@ def doc_to_text(x): def _wsc_inputs(x): - words = x["text"].split(" ") # We would need some special logic to handle the case where the pronoun is the @@ -55,7 +54,6 @@ def create_input(): class WSCPostprocess(Filter): def __init__(self, **kwargs): - self.determiners = { "a", "an", @@ -86,10 +84,8 @@ def clean(self, s): return " ".join([w for w in s.split(" ") if w not in self.determiners]) def apply(self, resps, docs): - filtered_resps = [] for prediction, reference in zip(*(resps, docs["span1_text"])): - prediction = self.clean(prediction[0]) reference = self.clean(reference) diff --git a/lm_eval/tasks/translation/utils.py b/lm_eval/tasks/translation/utils.py index f80ae89a4f..f30c4d8625 100644 --- a/lm_eval/tasks/translation/utils.py +++ b/lm_eval/tasks/translation/utils.py @@ -1,9 +1,7 @@ import argparse -from typing import Dict, List import yaml -import sacrebleu try: import pycountry diff --git a/lm_eval/tasks/truthfulqa/utils.py b/lm_eval/tasks/truthfulqa/utils.py index 8c011d2d10..8e2ab43fe8 100644 --- a/lm_eval/tasks/truthfulqa/utils.py +++ b/lm_eval/tasks/truthfulqa/utils.py @@ -6,7 +6,6 @@ def process_results_mc2(doc, results): - lls, is_greedy = zip(*results) # Split on the first `0` as everything before it is true (`1`). @@ -20,7 +19,6 @@ def process_results_mc2(doc, results): def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset: - return dataset.map(preprocess_function) @@ -49,7 +47,6 @@ def _format_answers(answers): def process_results_gen(doc, results): - completion = results[0] true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] all_refs = true_refs + false_refs diff --git a/lm_eval/tasks/xnli/utils.py b/lm_eval/tasks/xnli/utils.py index fa7806fc74..2844d1d7c8 100644 --- a/lm_eval/tasks/xnli/utils.py +++ b/lm_eval/tasks/xnli/utils.py @@ -1,5 +1,4 @@ import argparse -from typing import Dict, List import yaml diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 4067669c0d..74f4f482da 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -1,25 +1,23 @@ -import os -import re -import sys -import yaml +import collections +import fnmatch +import functools +import gc +import importlib.util import inspect +import logging +import os import pathlib -import functools +import re import subprocess -import collections -import importlib.util -import fnmatch - -from typing import Iterator, List, Literal, Union, Any, Callable +import sys +from itertools import islice +from typing import Any, Callable, Iterator, List, Literal, Union -import gc import torch import transformers - +import yaml from jinja2 import BaseLoader, Environment, StrictUndefined -from itertools import islice -import logging logging.basicConfig( format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", @@ -143,7 +141,7 @@ def __init__(self, choices) -> None: def __contains__(self, values) -> bool: for value in values.split(","): if len(fnmatch.filter(self.choices, value)) == 0: - eval_logger.info(f"Available tasks to choose:") + eval_logger.info("Available tasks to choose:") for choice in self.choices: 
eval_logger.info(f" - {choice}") raise ValueError("'{}' is not in task list".format(value)) @@ -157,7 +155,7 @@ def __iter__(self) -> Iterator: # Returns a list containing all values of the source_list that # match at least one of the patterns def pattern_match(patterns, source_list): - if type(patterns) == str: + if isinstance(patterns, str): patterns = [patterns] task_names = set() @@ -332,7 +330,7 @@ def get_original(self, grouped_dict): def make_table(result_dict, column: str = "results"): """Generate table of results.""" - from pytablewriter import MarkdownTableWriter, LatexTableWriter + from pytablewriter import LatexTableWriter, MarkdownTableWriter if column == "results": column_name = "Tasks" @@ -466,7 +464,7 @@ def import_function(loader, node): yaml_path = os.path.dirname(loader.name) *module_name, function_name = function_name.split(".") - if type(module_name) == list: + if isinstance(module_name, list): module_name = ".".join(module_name) module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name))) @@ -496,7 +494,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): include_path = yaml_config["include"] del yaml_config["include"] - if type(include_path) == str: + if isinstance(include_path, str): include_path = [include_path] # Load from the last one first diff --git a/mypy.ini b/mypy.ini index 76a0c86452..2d20dd2cc5 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,21 +9,19 @@ warn_unused_ignores = True warn_redundant_casts = True # We ignore errors everywhere to gradually add type annotations - -[mypy-lm_eval.*] -ignore_errors = True - -[mypy-lm_eval.api.*] -ignore_errors = True - -[mypy-lm_eval.prompts.*] -ignore_errors = True - -[mypy-lm_eval.models.*] -ignore_errors = True - -[mypy-scripts.*] -ignore_errors = True - -[mypy-main] -ignore_errors = True +# [mypy-lm_eval.*] +# ignore_errors = True +# +# [mypy-lm_eval.api.*] +# ignore_errors = True +# +# [mypy-lm_eval.prompts.*] +# ignore_errors = True +# +# [mypy-lm_eval.models.*] +# ignore_errors = True +# +# [mypy-scripts.*] +# ignore_errors = True +# +# [mypy-main] diff --git a/pyproject.toml b/pyproject.toml index 5a4d191d7c..87eefc72d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,14 +54,7 @@ Homepage = "https://github.com/EleutherAI/lm-evaluation-harness" Repository = "https://github.com/EleutherAI/lm-evaluation-harness" [project.optional-dependencies] -dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"] -linting = [ - "flake8", - "pylint", - "mypy", - "pre-commit", -] -testing = ["pytest", "pytest-cov", "pytest-xdist"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"] multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"] sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"] @@ -88,3 +81,17 @@ all = [ "lm_eval[ifeval]", "lm_eval[zeno]", ] + +[tool.ruff] +extend-exclude = ["lm_eval/evaluator.py", "lm_eval/tasks/*.py"] + +[tool.ruff.lint] +extend-select = ["I"] + +[tool.ruff.isort] +lines-after-imports = 2 +known-first-party = ["lm_eval"] + +[tool.ruff.extend-per-file-ignores] +"__init__.py" = ["F401","F402","F403","I"] +"lm_eval/tasks/*"= ["E721"] diff --git a/scripts/build_benchmark.py b/scripts/build_benchmark.py index 4cd07dd3eb..ce4b661681 100644 --- a/scripts/build_benchmark.py +++ b/scripts/build_benchmark.py @@ -1,15 +1,14 @@ -import os -import yaml import argparse +import os -from tqdm import tqdm +import yaml from promptsource.templates import DatasetTemplates - 
-from lm_eval import utils +from tqdm import tqdm # from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger + # from lm_eval.tasks import include_task_folder @@ -22,7 +21,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() with open(args.benchmark_path) as file: diff --git a/scripts/clean_training_data/compress_and_package.py b/scripts/clean_training_data/compress_and_package.py index c9e7f2593c..d4af5ba5f3 100644 --- a/scripts/clean_training_data/compress_and_package.py +++ b/scripts/clean_training_data/compress_and_package.py @@ -1,15 +1,15 @@ -import glob import argparse +import glob +import logging import os -import subprocess import shutil +import subprocess from tqdm import tqdm from tqdm_multiprocess import TqdmMultiProcessPool - -import logging from tqdm_multiprocess.logger import setup_logger_tqdm + logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ def compress_and_move(working_directory, output_directory, process_count): tasks = [] bucket_file_paths = glob.glob( - os.path.join(working_directory, "output", f"*.bkt.txt.sorted") + os.path.join(working_directory, "output", "*.bkt.txt.sorted") ) for bucket_file_path in bucket_file_paths: task = (process_task, (working_directory, output_directory, bucket_file_path)) diff --git a/scripts/clean_training_data/generate_13_grams.py b/scripts/clean_training_data/generate_13_grams.py index 27037e394d..66fa0ff45b 100644 --- a/scripts/clean_training_data/generate_13_grams.py +++ b/scripts/clean_training_data/generate_13_grams.py @@ -21,22 +21,22 @@ """ import argparse +import glob import json -import pickle +import logging import os +import pickle +import signal import sys from pathlib import Path -import glob -import signal from signal import SIGINT from tqdm import tqdm +from tqdm_multiprocess.logger import setup_logger_tqdm +from lm_eval.decontamination.archiver import Reader, TextArchive from lm_eval.decontamination.janitor import Janitor, word_ngrams -from lm_eval.decontamination.archiver import TextArchive, Reader -import logging -from tqdm_multiprocess.logger import setup_logger_tqdm logger = logging.getLogger(__name__) @@ -89,7 +89,7 @@ def __init__(self, directory, num_buckets): os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets) ] self.buckets = list(map(TextArchive, self.bucket_files)) - self.checkpoint_file = os.path.join(directory, f"bucket_offsets.ckpt") + self.checkpoint_file = os.path.join(directory, "bucket_offsets.ckpt") if os.path.exists(self.checkpoint_file): self.bucket_offsets = pickle.load(open(self.checkpoint_file, "rb")) @@ -119,7 +119,6 @@ def close_buckets(self): def do_ngrams_in_buckets(n_value, working_directory, bucket_count): - pile_statistics = json.load(open("pile_statistics.json", "r")) pile_document_count = pile_statistics["Document Count"] start_offsets = pile_statistics["File Start Offsets"] @@ -130,13 +129,13 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): logger.info(f"Generating {n_value}-grams and bucketing.") # Done file - done_file = os.path.join(output_directory, f"ngram_buckets.done") + done_file = os.path.join(output_directory, "ngram_buckets.done") if os.path.exists(done_file): logger.info("ngrams already generated and bucketed, skipping") return # Checkpoint - checkpoint_file = os.path.join(working_directory, f"pile_offset.ckpt") + checkpoint_file = os.path.join(working_directory, "pile_offset.ckpt") if os.path.exists(checkpoint_file): checkpoint_offset = pickle.load(open(checkpoint_file, "rb")) 
iterate = True diff --git a/scripts/clean_training_data/investigate_pile.py b/scripts/clean_training_data/investigate_pile.py index dccd3abe70..c1d348d463 100644 --- a/scripts/clean_training_data/investigate_pile.py +++ b/scripts/clean_training_data/investigate_pile.py @@ -1,12 +1,13 @@ -from lm_eval.decontamination.archiver import Reader -import os +import glob import json +import os from functools import reduce -import glob -import tqdm +import tqdm from tqdm_multiprocess import TqdmMultiProcessPool +from lm_eval.decontamination.archiver import Reader + def get_file_stats(file_path, tqdm_func, global_tqdm): reader = Reader() diff --git a/scripts/clean_training_data/process_sorted_buckets.py b/scripts/clean_training_data/process_sorted_buckets.py index 1e145f9198..9d345d8e86 100644 --- a/scripts/clean_training_data/process_sorted_buckets.py +++ b/scripts/clean_training_data/process_sorted_buckets.py @@ -15,18 +15,18 @@ import argparse import glob +import logging import os -from pathlib import Path import re import shutil +from pathlib import Path from tqdm import tqdm from tqdm_multiprocess import TqdmMultiProcessPool +from tqdm_multiprocess.logger import setup_logger_tqdm -from scripts.clean_training_data.archiver import TextReader, TextArchive +from scripts.clean_training_data.archiver import TextArchive, TextReader -import logging -from tqdm_multiprocess.logger import setup_logger_tqdm logger = logging.getLogger(__name__) @@ -35,7 +35,6 @@ def process_bucket( bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm ): - bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path)) # noqa: W605 done_file = os.path.join( processed_directory, f"ngram_bucket_processing_{bucket_id}.done" @@ -96,7 +95,7 @@ def process_bucket( def process_sorted_buckets(working_directory, move_dir, process_count): - bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted")) + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt.sorted")) processed_directory = os.path.join(working_directory, "processed") os.makedirs(processed_directory, exist_ok=True) @@ -123,7 +122,6 @@ def on_error(_): parser.add_argument("-procs", "--process_count", type=int, default=4) if __name__ == "__main__": - logfile_path = "process13grams.log" setup_logger_tqdm(logfile_path) diff --git a/scripts/clean_training_data/sort_13_gram_buckets.py b/scripts/clean_training_data/sort_13_gram_buckets.py index 07a2eedcd0..83990de822 100644 --- a/scripts/clean_training_data/sort_13_gram_buckets.py +++ b/scripts/clean_training_data/sort_13_gram_buckets.py @@ -8,18 +8,18 @@ directory and the unsorted buckets are removed after. 
""" -import glob import argparse +import glob +import logging import os import signal -from signal import SIGINT import subprocess +from signal import SIGINT from tqdm import tqdm - -import logging from tqdm_multiprocess.logger import setup_logger_tqdm + logger = logging.getLogger(__name__) terminate = False @@ -31,7 +31,7 @@ def handler(signal_received, frame): def sort_13_gram_buckets(working_directory): - bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt")) + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt")) for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): sorted_file_path = bucket_file_path + ".sorted" @@ -49,7 +49,6 @@ def sort_13_gram_buckets(working_directory): parser.add_argument("-dir", "--working_directory", default="") if __name__ == "__main__": - version = 1.00 print(f"Running version {version}") diff --git a/scripts/cost_estimate.py b/scripts/cost_estimate.py index 72b8d4b358..6fb64504e8 100644 --- a/scripts/cost_estimate.py +++ b/scripts/cost_estimate.py @@ -1,6 +1,8 @@ import random + import transformers -from lm_eval import tasks, evaluator + +from lm_eval import evaluator, tasks from lm_eval.base import LM diff --git a/scripts/get_prompts.py b/scripts/get_prompts.py index 06e2f89c13..d262ec37e4 100644 --- a/scripts/get_prompts.py +++ b/scripts/get_prompts.py @@ -1,6 +1,8 @@ -from lm_eval import tasks from itertools import islice +from lm_eval import tasks + + ct = 3 for ( diff --git a/scripts/make_gpt2_test_cases.py b/scripts/make_gpt2_test_cases.py index 361bc2ecd6..0c1a4bffe0 100644 --- a/scripts/make_gpt2_test_cases.py +++ b/scripts/make_gpt2_test_cases.py @@ -1,8 +1,9 @@ -import transformers +import random import torch import torch.nn.functional as F -import random +import transformers + random.seed(42) diff --git a/scripts/make_table_results.py b/scripts/make_table_results.py index 690658ccea..72af524ffe 100644 --- a/scripts/make_table_results.py +++ b/scripts/make_table_results.py @@ -2,10 +2,11 @@ Usage: python make_table_tasks.py --output """ +import json import logging -from pytablewriter import MarkdownTableWriter, LatexTableWriter import os -import json + +from pytablewriter import LatexTableWriter, MarkdownTableWriter logging.basicConfig(level=logging.INFO) diff --git a/scripts/make_table_tasks.py b/scripts/make_table_tasks.py index d68d8fe219..ded7c1a596 100644 --- a/scripts/make_table_tasks.py +++ b/scripts/make_table_tasks.py @@ -4,9 +4,11 @@ """ import argparse import logging -from lm_eval import tasks + from pytablewriter import MarkdownTableWriter +from lm_eval import tasks + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/scripts/model_comparator.py b/scripts/model_comparator.py index f3cbd320f4..b1aeb142b7 100644 --- a/scripts/model_comparator.py +++ b/scripts/model_comparator.py @@ -1,13 +1,15 @@ import argparse +import os +from typing import Dict, List, Tuple + import numpy as np -import lm_eval.evaluator -from lm_eval import tasks -from lm_eval import utils -import scipy.stats -from typing import Tuple, Dict, List import pandas as pd +import scipy.stats import torch -import os + +import lm_eval.evaluator +from lm_eval import tasks, utils + os.environ["TOKENIZERS_PARALLELISM"] = "false" eval_logger = utils.eval_logger diff --git a/scripts/regression.py b/scripts/regression.py index ef85d0c75e..2b8167c0eb 100644 --- a/scripts/regression.py +++ b/scripts/regression.py @@ -5,7 +5,7 @@ import time from pathlib import Path -from lm_eval import 
evaluator, utils +from lm_eval import utils from lm_eval.api.registry import ALL_TASKS @@ -136,14 +136,16 @@ def main(): args = parse_args() args.branches = ( - args.branches.split(",") if type(args.branches) == str else args.branches + args.branches.split(",") if isinstance(args.branches, str) else args.branches + ) + args.models = ( + args.models.split(",") if isinstance(args.models, str) else args.models ) - args.models = args.models.split(",") if type(args.models) == str else args.models args.tasks = ( ALL_TASKS if args.tasks == "all_tasks" else utils.pattern_match(args.tasks.split(","), ALL_TASKS) - if type(args.tasks) == str + if isinstance(args.tasks, str) else args.tasks ) diff --git a/scripts/write_out.py b/scripts/write_out.py index eb81e6732b..360b0b6271 100644 --- a/scripts/write_out.py +++ b/scripts/write_out.py @@ -1,11 +1,13 @@ import argparse -import numpy as np -import json import os import random + +import numpy as np + from lm_eval import tasks -from lm_eval.utils import join_iters, eval_logger -from lm_eval.tasks import initialize_tasks, include_path +from lm_eval.tasks import include_path, initialize_tasks +from lm_eval.utils import eval_logger, join_iters + EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n" diff --git a/setup.py b/setup.py index dbe4675d06..b5d8fabb86 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ import setuptools + # This is to make sure that the package supports editable installs setuptools.setup() diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py index 6d186676fe..186b2305e6 100644 --- a/tests/models/test_gguf.py +++ b/tests/models/test_gguf.py @@ -1,12 +1,13 @@ -import unittest -from unittest.mock import patch import hashlib import json import os import pickle -from lm_eval.models.gguf import GGUFLM +import unittest +from unittest.mock import patch from lm_eval.api.instance import Instance +from lm_eval.models.gguf import GGUFLM + base_url = "https://matthoffner-ggml-llm-api.hf.space" diff --git a/tests/models/test_huggingface.py b/tests/models/test_huggingface.py index 557ad05124..323d664af8 100644 --- a/tests/models/test_huggingface.py +++ b/tests/models/test_huggingface.py @@ -1,13 +1,16 @@ from __future__ import annotations -import pytest + +import sys from pathlib import Path + import numpy as np -from lm_eval.models.huggingface import HFLM -from lm_eval.api.instance import Instance -import lm_eval.tasks as tasks -import sys import torch +import lm_eval.tasks as tasks +from lm_eval.api.instance import Instance +from lm_eval.models.huggingface import HFLM + + tasks.initialize_tasks() @@ -106,9 +109,10 @@ def test_logliklihood(self) -> None: f.write("\n".join(str(x) for x in _res)) assert np.allclose(_res, _RES, atol=1e-2) # check indices for Multiple Choice - argmax_RES, argmax_res = np.argmax( - np.array(_RES).reshape(-1, 4), axis=1 - ), np.argmax(np.array(_res).reshape(-1, 4), axis=1) + argmax_RES, argmax_res = ( + np.argmax(np.array(_RES).reshape(-1, 4), axis=1), + np.argmax(np.array(_res).reshape(-1, 4), axis=1), + ) assert (argmax_RES == argmax_res).all() def test_generate_until(self) -> None: diff --git a/tests/models/test_vllm.py b/tests/models/test_vllm.py index 61a024ce71..1da8a48762 100644 --- a/tests/models/test_vllm.py +++ b/tests/models/test_vllm.py @@ -1,10 +1,11 @@ -import pytest from typing import List -from lm_eval.api.instance import Instance -import lm_eval.tasks as tasks -import sys + +import pytest import torch +import lm_eval.tasks as tasks +from lm_eval.api.instance import Instance + 
@pytest.mark.skip(reason="requires CUDA") class TEST_VLLM: diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 7f30e21f43..825f57413d 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -1,15 +1,13 @@ -import os - # import lm_eval.base as base -import lm_eval.api.registry as registry -import lm_eval.tasks as tasks +from typing import List + +import pytest # import lm_eval.models as models import lm_eval.api as api import lm_eval.evaluator as evaluator -from typing import List -import random -import pytest +import lm_eval.tasks as tasks + tasks.initialize_tasks() diff --git a/tests/test_janitor.py b/tests/test_janitor.py index b496bfadd1..19ba611dfb 100644 --- a/tests/test_janitor.py +++ b/tests/test_janitor.py @@ -1,11 +1,10 @@ -import re from collections import defaultdict from lm_eval.decontamination.janitor import ( Janitor, form_ngrams, - word_ngrams, split_indices, + word_ngrams, word_ngrams_indices, ) @@ -81,7 +80,6 @@ def test_split_indices(): def test_word_ngrams_indices(): - sequence = ( "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." @@ -119,9 +117,9 @@ def test_word_ngrams_indices(): # Assumptions from GPT3 Paper: # the 200 characters to remove include punctuation and is actually a half-window + # All tests below initially test without any registered contaminants, expecting the same sequence back. def test_janitor1(): - # First test using a 1gram and expected the first block before the filth to have some remaining # characters, but the second block should be completely removed. @@ -165,7 +163,6 @@ def test_janitor1(): def test_janitor2(): - # Second test using a 1gram and expected the first block before the filth to have some remaining # characters, and the second block is longer then 200 characters so should also have some remaining. @@ -214,7 +211,6 @@ def test_janitor2(): def test_janitor3(): - # Same test as above but with a 6gram. sequence = ( @@ -262,7 +258,6 @@ def test_janitor3(): def test_janitor4(): - # This test adds another block to that from the previous. The middle block should be entirely # removed as the 200 characters are removed from each side. @@ -318,7 +313,6 @@ def test_janitor4(): def test_janitor5(): - # Same as above but using multiple different filth 6grams. sequence = ( @@ -374,7 +368,6 @@ def test_janitor5(): def test_janitor6(): - # Same as above but now we add 10 filths and expect the same result, the following test does 11. sequence = ( @@ -438,7 +431,6 @@ def test_janitor6(): def test_janitor7(): - # Same as above but now we add 9 filths and expect the same result, the following test does 10. 
sequence = ( diff --git a/tests/test_misc.py b/tests/test_misc.py index 149a65f4c3..30267f63d0 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1,6 +1,8 @@ +import random + import pytest + import lm_eval.api.metrics as metrics -import random def test_bootstrapping(): diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 41504430d5..3651fd5ab3 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -1,9 +1,13 @@ from itertools import islice + import pytest -from .utils import new_tasks + import lm_eval.tasks as tasks from lm_eval.api.task import ConfigurableTask +from .utils import new_tasks + + tasks.initialize_tasks() # Default Task TASKS = ["arc_easy"] diff --git a/tests/tests_master/test_description.py b/tests/tests_master/test_description.py index fdf7bf5db0..2503bcea4b 100644 --- a/tests/tests_master/test_description.py +++ b/tests/tests_master/test_description.py @@ -1,6 +1,7 @@ import random -import lm_eval.tasks + import lm_eval.models +import lm_eval.tasks def test_description(): @@ -14,7 +15,6 @@ def test_description(): task_dict = lm_eval.tasks.get_task_dict(task_names) for task_name, task in task_dict.items(): - # patch description field in task (# TODO: make this much more cleaned up) task._config.description = description_dict[task_name] diff --git a/tests/tests_master/test_generate_13_grams.py b/tests/tests_master/test_generate_13_grams.py index 26cd890369..722e69a77e 100644 --- a/tests/tests_master/test_generate_13_grams.py +++ b/tests/tests_master/test_generate_13_grams.py @@ -1,13 +1,13 @@ +import glob +import logging import os -from collections import Counter import shutil -import glob +from collections import Counter +from lm_eval.decontamination.archiver import Archive, TextReader from lm_eval.decontamination.janitor import Janitor, word_ngrams from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets -from lm_eval.decontamination.archiver import Archive, TextReader -import logging logger = logging.getLogger(__name__) @@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog): print("rebuild") rebuilt_ngrams = [] bucket_file_paths = glob.glob( - os.path.join(test_working_directory, "output", f"*.bkt.txt") + os.path.join(test_working_directory, "output", "*.bkt.txt") ) for bucket_file_path in bucket_file_paths: reader = TextReader(bucket_file_path) diff --git a/tests/tests_master/test_models.py b/tests/tests_master/test_models.py index 11ea5a8b46..e56dcaf8e4 100644 --- a/tests/tests_master/test_models.py +++ b/tests/tests_master/test_models.py @@ -2,12 +2,13 @@ import json import os import pickle -import pytest import unittest.mock as mock +import pytest +from openai import OpenAI + import lm_eval.models as models -from openai import OpenAI client = OpenAI() diff --git a/tests/tests_master/test_version_stable.py b/tests/tests_master/test_version_stable.py index 2eba83c6c6..34073d0a69 100644 --- a/tests/tests_master/test_version_stable.py +++ b/tests/tests_master/test_version_stable.py @@ -1,12 +1,14 @@ -import lm_eval.tasks as tasks -import lm_eval.models as models -import lm_eval.evaluator as evaluator +import collections +import hashlib +import json +import os import random + import pytest -import os -import json -import hashlib -import collections + +import lm_eval.evaluator as evaluator +import lm_eval.models as models +import lm_eval.tasks as tasks os.makedirs("tests/testdata", exist_ok=True) diff --git a/tests/utils.py b/tests/utils.py index 3555541e71..fbdbb6a7fb 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,8 
+1,8 @@ -from typing import List -from lm_eval.utils import load_yaml_config -from pathlib import Path -from typing import Union import os +from pathlib import Path +from typing import List, Union + +from lm_eval.utils import load_yaml_config # {{{CI}}}
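
Below is a minimal sketch of how the relinted tree can be checked locally once this patch is applied. It is illustrative only and not part of the diff: it assumes ruff is installed separately (the new dev extra above only pulls in pytest, pytest-cov, pytest-xdist, pre-commit and mypy) and that ruff reads the [tool.ruff] settings from pyproject.toml at the repository root.

    # Hypothetical local lint workflow, run from a checkout of the repository root.
    pip install -e '.[dev]'        # editable install plus the tooling listed in the new dev extra
    pre-commit install             # register the repository's pre-commit hooks for future commits
    pre-commit run --all-files     # run every configured hook across the whole tree
    pip install ruff               # ruff itself is assumed here; it is not pinned in the dev extra
    ruff check . --fix             # lint and apply safe fixes, including import sorting via the "I" rules

Under the per-file ignores added above, __init__.py files are exempt from the unused-import and import-sorting rules (F401/F402/F403 and I) and lm_eval/tasks/* from the type-comparison rule (E721), so ruff check should pass without modifying those paths.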