From 65b8761db922513dada0320b860fabb1b4f01dc3 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Wed, 20 Dec 2023 19:14:46 +0500 Subject: [PATCH] Switch Linting to `ruff` (#1166) * add ruff and isort. remove black and flake8 * remove unnecessary dependencies * remove dependency from table * change order * ran ruff * check 3.9 * exclude evaluator * update CI workflow * use ruff config in pyproject.toml * test * add isort rules to ruff * sort imports * import `make_table` * try stages for no-commit-to-branch * turn on mypy for pre-commit * test * test * test * change no-commit-to-branch to default * nits * fixed dependency --- .github/workflows/new_tasks.yml | 2 +- .github/workflows/unit_tests.yml | 29 +++---- .pre-commit-config.yaml | 16 ++-- README.md | 5 +- lm_eval/__main__.py | 19 ++-- lm_eval/api/filter.py | 4 +- lm_eval/api/metrics.py | 9 +- lm_eval/api/model.py | 10 +-- lm_eval/api/registry.py | 10 +-- lm_eval/api/samplers.py | 10 +-- lm_eval/api/task.py | 86 ++++++++----------- lm_eval/decontamination/archiver.py | 13 +-- lm_eval/decontamination/decontaminate.py | 20 ++--- lm_eval/decontamination/janitor.py | 6 +- lm_eval/evaluator.py | 14 ++- lm_eval/filters/__init__.py | 2 +- lm_eval/models/anthropic_llms.py | 11 ++- lm_eval/models/dummy.py | 1 + lm_eval/models/gguf.py | 7 +- lm_eval/models/huggingface.py | 50 ++++++----- lm_eval/models/openai_completions.py | 38 ++++---- lm_eval/models/textsynth.py | 6 +- lm_eval/models/vllm_causallms.py | 28 +++--- lm_eval/prompts/__init__.py | 2 - lm_eval/tasks/__init__.py | 1 - lm_eval/tasks/bbh/_generate_configs.py | 2 - lm_eval/tasks/belebele/_generate_configs.py | 1 - lm_eval/tasks/bigbench/generate_tasks.py | 1 - lm_eval/tasks/blimp/generate_configs.py | 1 - lm_eval/tasks/ceval/_generate_configs.py | 5 +- lm_eval/tasks/cmmlu/_generate_configs.py | 5 +- lm_eval/tasks/code_x_glue/code-text/bleu.py | 11 +-- lm_eval/tasks/code_x_glue/code-text/utils.py | 2 - lm_eval/tasks/coqa/utils.py | 4 +- lm_eval/tasks/csatqa/_generate_configs.py | 2 - lm_eval/tasks/drop/utils.py | 1 - lm_eval/tasks/ifeval/instructions_registry.py | 27 ++---- lm_eval/tasks/ifeval/instructions_util.py | 1 - lm_eval/tasks/mgsm/utils.py | 1 - lm_eval/tasks/mmlu/_generate_configs.py | 3 - .../advanced_ai_risk/_generate_configs.py | 2 - .../persona/_generate_configs.py | 2 - lm_eval/tasks/paws-x/_generate_config.py | 1 - lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py | 3 +- lm_eval/tasks/qasper/utils.py | 1 - lm_eval/tasks/scrolls/task.py | 1 - lm_eval/tasks/squadv2/task.py | 5 +- lm_eval/tasks/super_glue/cb/t5_utils.py | 2 - lm_eval/tasks/super_glue/multirc/t5_utils.py | 3 - lm_eval/tasks/super_glue/record/t5_utils.py | 5 +- lm_eval/tasks/super_glue/wsc/t5_utils.py | 4 - lm_eval/tasks/translation/utils.py | 2 - lm_eval/tasks/truthfulqa/utils.py | 3 - lm_eval/tasks/xnli/utils.py | 1 - lm_eval/utils.py | 36 ++++---- mypy.ini | 34 ++++---- pyproject.toml | 23 +++-- scripts/build_benchmark.py | 10 +-- .../compress_and_package.py | 10 +-- .../clean_training_data/generate_13_grams.py | 19 ++-- .../clean_training_data/investigate_pile.py | 9 +- .../process_sorted_buckets.py | 12 ++- .../sort_13_gram_buckets.py | 11 ++- scripts/cost_estimate.py | 4 +- scripts/get_prompts.py | 4 +- scripts/make_gpt2_test_cases.py | 5 +- scripts/make_table_results.py | 5 +- scripts/make_table_tasks.py | 4 +- scripts/model_comparator.py | 14 +-- scripts/regression.py | 10 ++- scripts/write_out.py | 10 ++- setup.py | 1 + tests/models/test_gguf.py | 7 +- 
tests/models/test_huggingface.py | 20 +++-- tests/models/test_vllm.py | 9 +- tests/test_evaluator.py | 12 ++- tests/test_janitor.py | 12 +-- tests/test_misc.py | 4 +- tests/test_tasks.py | 6 +- tests/tests_master/test_description.py | 4 +- tests/tests_master/test_generate_13_grams.py | 10 +-- tests/tests_master/test_models.py | 5 +- tests/tests_master/test_version_stable.py | 16 ++-- tests/utils.py | 8 +- 84 files changed, 389 insertions(+), 446 deletions(-) diff --git a/.github/workflows/new_tasks.yml b/.github/workflows/new_tasks.yml index 76ab1be15b..0c4490f53a 100644 --- a/.github/workflows/new_tasks.yml +++ b/.github/workflows/new_tasks.yml @@ -56,7 +56,7 @@ jobs: if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | python -m pip install --upgrade pip - pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 07a85864b3..f981798fdf 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -17,29 +17,22 @@ jobs: linter: name: Linters runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 5 steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 cache: pip - cache-dependency-path: setup.py - - name: Install dependencies - run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu ; export SKIP=no-commit-to-branch # env var deactivates --no-commit-to-branch + cache-dependency-path: pyproject.toml - name: Pre-Commit + env: + SKIP: "no-commit-to-branch,mypy" + uses: pre-commit/action@v3.0.0 - - name: Lint with pylint - run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics # # mypy turned off for now # - name: Lint with mypy # run: mypy . 
--ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable @@ -53,17 +46,17 @@ jobs: timeout-minutes: 30 steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: pip - cache-dependency-path: setup.py + cache-dependency-path: pyproject.toml - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b4ae822c1..b5386cfda3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,14 +27,16 @@ repos: args: [--remove] - id: mixed-line-ending args: [--fix=lf] - - repo: https://github.com/pycqa/flake8 - rev: 3.7.9 + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.8 hooks: - - id: flake8 - - repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black + # Run the linter. + - id: ruff + args: + - --fix + # Run the formatter. + - id: ruff-format - repo: https://github.com/codespell-project/codespell rev: v2.1.0 hooks: diff --git a/README.md b/README.md index ef3dae54d8..9dbee8fbb2 100644 --- a/README.md +++ b/README.md @@ -49,11 +49,10 @@ pip install -e . We also provide a number of optional dependencies for extended functionality. 
Extras can be installed via `pip install -e ".[NAME]"` | Name | Use | -| ------------- | ------------------------------------- | +|---------------|---------------------------------------| | anthropic | For using Anthropic's models | -| dev | You probably don't want to use this | | gptq | For loading models with GPTQ | -| testing | You probably don't want to use this | +| dev | You probably don't want to use this | | multilingual | For multilingual tokenizers | | openai | For using OpenAI's models | | promptsource | For using PromtSource prompts | diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index ebb1b6c4ab..7fbee0dc73 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -1,17 +1,18 @@ +import argparse +import json +import logging import os import re import sys -import json -import logging -import argparse -import numpy as np - from pathlib import Path from typing import Union +import numpy as np + from lm_eval import evaluator, utils -from lm_eval.tasks import initialize_tasks, include_path from lm_eval.api.registry import ALL_TASKS +from lm_eval.tasks import include_path, initialize_tasks +from lm_eval.utils import make_table def _handle_non_serializable(o): @@ -170,7 +171,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: task_names = ALL_TASKS elif args.tasks == "list": eval_logger.info( - "Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS))) + "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))) ) sys.exit() else: @@ -271,9 +272,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}" ) - print(evaluator.make_table(results)) + print(make_table(results)) if "groups" in results: - print(evaluator.make_table(results, "groups")) + print(make_table(results, "groups")) if __name__ == "__main__": diff --git a/lm_eval/api/filter.py b/lm_eval/api/filter.py index ac69aa8ffd..bc26a1a637 100644 --- a/lm_eval/api/filter.py +++ b/lm_eval/api/filter.py @@ -1,9 +1,10 @@ from dataclasses import dataclass from typing import List -from lm_eval.api.instance import Instance from datasets import Dataset +from lm_eval.api.instance import Instance + class Filter: """ @@ -42,7 +43,6 @@ class FilterEnsemble: filters: List[Filter] def apply(self, instances: List[Instance], docs: List[Dataset]) -> None: - resps = [ inst.resps for inst in instances ] # operate just on the model responses diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 4eb68585b6..85a944c888 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -1,18 +1,19 @@ +import logging import math +import random from collections.abc import Iterable +import evaluate import numpy as np import sacrebleu import sklearn.metrics -import random -import evaluate -from lm_eval.api.registry import register_metric, register_aggregation +from lm_eval.api.registry import register_aggregation, register_metric -import logging eval_logger = logging.getLogger("lm-eval") + # Register Aggregations First @register_aggregation("mean") def mean(arr): diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 0f67095879..df829af592 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -1,17 +1,15 @@ import abc +import hashlib +import json +import logging import os +from typing import List, Optional, Tuple, Type, TypeVar -import torch -from typing 
import Union, List, Tuple, Optional, Type, TypeVar from sqlitedict import SqliteDict -import json -import hashlib - from tqdm import tqdm from lm_eval import utils -import logging eval_logger = logging.getLogger("lm-eval") diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 7d73ae6c5f..5fb9c011fc 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -1,8 +1,9 @@ -import os +import logging + import evaluate + from lm_eval.api.model import LM -import logging eval_logger = logging.getLogger("lm-eval") @@ -91,7 +92,6 @@ def decorate(fn): def register_metric(**args): # TODO: do we want to enforce a certain interface to registered metrics? def decorate(fn): - assert "metric" in args name = args["metric"] @@ -100,7 +100,6 @@ def decorate(fn): ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), ("aggregation", METRIC_AGGREGATION_REGISTRY), ]: - if key in args: value = args[key] assert ( @@ -120,7 +119,6 @@ def decorate(fn): def get_metric(name, hf_evaluate_metric=False): - if not hf_evaluate_metric: if name in METRIC_REGISTRY: return METRIC_REGISTRY[name] @@ -151,7 +149,6 @@ def decorate(fn): def get_aggregation(name): - try: return AGGREGATION_REGISTRY[name] except KeyError: @@ -161,7 +158,6 @@ def get_aggregation(name): def get_metric_aggregation(name): - try: return METRIC_AGGREGATION_REGISTRY[name] except KeyError: diff --git a/lm_eval/api/samplers.py b/lm_eval/api/samplers.py index 8a0d1e334d..57e3a6f1a4 100644 --- a/lm_eval/api/samplers.py +++ b/lm_eval/api/samplers.py @@ -40,18 +40,18 @@ def get_context(self, doc, num_fewshot): self.doc_to_text(doc) if ( self.config.doc_to_choice is None - or type(self.doc_to_text(doc)) is str + or isinstance(self.doc_to_text(doc), str) ) else self.doc_to_choice(doc)[self.doc_to_text(doc)] ) + self.target_delimiter + ( str(self.doc_to_target(doc)[0]) - if type(self.doc_to_target(doc)) is list + if isinstance(self.doc_to_target(doc), list) else self.doc_to_target(doc) if ( self.config.doc_to_choice is None - or type(self.doc_to_target(doc)) is str + or isinstance(self.doc_to_target(doc), str) ) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) ) @@ -77,8 +77,8 @@ def sample(self, n) -> None: Draw the first `n` samples in order from the specified split. Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. """ - assert n <= len( - self.docs + assert ( + n <= len(self.docs) ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." 
return self.docs[:n] diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 88ca412923..217349426c 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1,45 +1,35 @@ import abc -from dataclasses import dataclass, field, asdict - -import os -import re import ast -import yaml import logging -import evaluate +import os import random -import itertools -import functools -from tqdm import tqdm +import re +from collections.abc import Callable +from dataclasses import asdict, dataclass +from typing import Any, List, Literal, Tuple, Union import datasets import numpy as np -from typing import Union, List, Any, Tuple, Literal -from collections.abc import Callable - from lm_eval import utils from lm_eval.api import samplers from lm_eval.api.instance import Instance -from lm_eval.api.filter import FilterEnsemble - -from lm_eval.prompts import get_prompt -from lm_eval.filters import build_filter_ensemble from lm_eval.api.metrics import ( + bits_per_byte, mean, weighted_perplexity, - bits_per_byte, - metric_max_over_ground_truths, ) from lm_eval.api.registry import ( - get_metric, + AGGREGATION_REGISTRY, + DEFAULT_METRIC_REGISTRY, get_aggregation, + get_metric, get_metric_aggregation, is_higher_better, - DEFAULT_METRIC_REGISTRY, - OUTPUT_TYPE_REGISTRY, - AGGREGATION_REGISTRY, ) +from lm_eval.filters import build_filter_ensemble +from lm_eval.prompts import get_prompt + ALL_OUTPUT_TYPES = [ "loglikelihood", @@ -349,9 +339,7 @@ def build_all_requests(self, limit=None, rank=None, world_size=None) -> None: elif self.has_validation_docs(): docs = self.validation_docs() else: - assert ( - False - ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" eval_logger.info(f"Building contexts for task on rank {rank}...") @@ -603,9 +591,9 @@ def __init__( if "aggregation" in metric_config: agg_name = metric_config["aggregation"] - if type(agg_name) == str: + if isinstance(agg_name, str): self._aggregation_list[metric_name] = get_aggregation(agg_name) - elif callable(agg_name): + elif callable(agg_name): # noqa: E721 self._aggregation_list[metric_name] = metric_config[ "aggregation" ] @@ -672,9 +660,7 @@ def __init__( elif self.has_validation_docs(): self.task_docs = self.validation_docs() else: - assert ( - False - ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" 
# Test One Doc self.features = list(self.task_docs.features.keys()) @@ -686,20 +672,20 @@ def __init__( if self.config.doc_to_choice is not None: test_choice = self.doc_to_choice(test_doc) - if type(test_choice) is not list: + if not isinstance(test_choice, list): eval_logger.error("doc_to_choice must return list") else: num_choice = len(test_choice) - if type(test_text) is int: + if isinstance(test_text, int): self.multiple_input = num_choice else: test_choice = None - if type(test_target) is list: + if isinstance(test_target, list): self.multiple_target = len(test_target) else: - if (type(test_target) is int) and (test_choice is not None): + if (isinstance(test_target, int)) and (test_choice is not None): test_target = test_choice[test_target] else: test_target = str(test_target) @@ -808,11 +794,11 @@ def fewshot_context(self, doc, num_fewshot): ) example = self.doc_to_text(doc) - if type(example) == str: + if isinstance(example, str): return labeled_examples + example - elif type(example) == list: + elif isinstance(example, list): return [labeled_examples + ex for ex in example] - elif type(example) == int: + elif isinstance(example, int): if self.config.doc_to_choice is not None: choices = self.doc_to_choice(doc) return labeled_examples + choices[example] @@ -864,9 +850,9 @@ def doc_to_text(self, doc): else: doc_to_text = self.config.doc_to_text - if type(doc_to_text) == int: + if isinstance(doc_to_text, int): return doc_to_text - elif type(doc_to_text) == str: + elif isinstance(doc_to_text, str): if doc_to_text in self.features: # if self.config.doc_to_choice is not None: # return self.doc_to_choice(doc)[doc[doc_to_text]] @@ -898,9 +884,9 @@ def doc_to_target(self, doc: dict) -> Union[int, str, list]: else: doc_to_target = self.config.doc_to_target - if type(doc_to_target) == int: + if isinstance(doc_to_target, int): return doc_to_target - elif type(doc_to_target) == str: + elif isinstance(doc_to_target, str): if doc_to_target in self.features: # if self.config.doc_to_choice is not None: # return self.doc_to_choice(doc)[doc[doc_to_target]] @@ -921,7 +907,7 @@ def doc_to_target(self, doc: dict) -> Union[int, str, list]: return target_string else: return target_string - elif type(doc_to_target) == list: + elif isinstance(doc_to_target, list): return doc_to_target elif callable(doc_to_target): return doc_to_target(doc) @@ -944,14 +930,14 @@ def doc_to_choice(self, doc: Any) -> List[str]: else: doc_to_choice = self.config.doc_to_choice - if type(doc_to_choice) == str: + if isinstance(doc_to_choice, str): if doc_to_choice in self.features: return doc[doc_to_choice] else: return ast.literal_eval(utils.apply_template(doc_to_choice, doc)) - elif type(doc_to_choice) == list: + elif isinstance(doc_to_choice, list): return doc_to_choice - elif type(doc_to_choice) == dict: + elif isinstance(doc_to_choice, dict): return list(doc_to_choice.values()) elif callable(doc_to_choice): return doc_to_choice(doc) @@ -1078,14 +1064,14 @@ def process_results(self, doc, results): gold = self.doc_to_target(doc) gold_index_error = False - if type(gold) is list: + if isinstance(gold, list): gold = [i if i < len(choices) else -100 for i in gold] if -100 in gold: gold_index_error = True else: - if type(gold) is int: + if isinstance(gold, int): gold = gold if gold < len(choices) else -100 - elif type(gold) is str: + elif isinstance(gold, str): gold = choices.index(gold) if gold in choices else -100 if gold == -100: @@ -1175,9 +1161,7 @@ def process_results(self, doc, results): predictions=[result], 
**self._metric_fn_kwargs[metric], ) - except ( - TypeError - ): # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics + except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics result_score = self._metric_fn_list[metric]([gold, result]) if isinstance(result_score, dict): # TODO: this handles the case where HF evaluate returns a dict. diff --git a/lm_eval/decontamination/archiver.py b/lm_eval/decontamination/archiver.py index 3b5f09f525..e6bff33f0c 100644 --- a/lm_eval/decontamination/archiver.py +++ b/lm_eval/decontamination/archiver.py @@ -1,13 +1,14 @@ +import datetime +import io +import json +import mmap import os +from pathlib import Path from typing import Any -import zstandard -import json + import jsonlines -import io -import datetime -import mmap import tqdm -from pathlib import Path +import zstandard def json_serial(obj: Any) -> str: diff --git a/lm_eval/decontamination/decontaminate.py b/lm_eval/decontamination/decontaminate.py index 447eae52bf..f5b4157c67 100644 --- a/lm_eval/decontamination/decontaminate.py +++ b/lm_eval/decontamination/decontaminate.py @@ -1,13 +1,13 @@ -import time -import random -import pickle -import json +import collections import glob +import json import os -import collections +import pickle +import random +import time -from .janitor import Janitor, word_ngrams from .archiver import ZStdTextReader +from .janitor import Janitor, word_ngrams # Was used for testing the evaluator decoupled from the full logic below @@ -109,7 +109,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str: print(f"Merging lookups took {elapsed:0.5f} seconds.") print(f"{ngrams_n_size} grams files found in {ngrams_path}:") - files = glob.glob(os.path.join(ngrams_path, f"*.sorted.zst")) + files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst")) print(files) for file in files: @@ -135,11 +135,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str: matching_unique += 1 for task_name, task_set, doc_ids in merged_lookup[ngram]: task_doc_set = duplicates[(task_name, task_set)] - for ( - doc_id - ) in ( - doc_ids - ): # Record contamination across all relevant task/set combos + for doc_id in doc_ids: # Record contamination across all relevant task/set combos task_doc_set.add(doc_id) del merged_lookup[ngram] # No point matching again else: diff --git a/lm_eval/decontamination/janitor.py b/lm_eval/decontamination/janitor.py index 5ad84d13df..cedf8a5717 100644 --- a/lm_eval/decontamination/janitor.py +++ b/lm_eval/decontamination/janitor.py @@ -1,9 +1,9 @@ +import pickle import re import string -import pickle import traceback -from pprint import pprint -from typing import Iterator, Sequence, TypeVar, List, Tuple +from typing import Iterator, List, Sequence, Tuple, TypeVar + # This is a cpp module. 
Compile janitor_util.cpp with: # c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index cb5d0f53ae..5d277a6bf7 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -1,8 +1,6 @@ import random import itertools -import json import collections -import sys import torch @@ -17,8 +15,6 @@ from lm_eval.utils import ( positional_deprecated, run_task_tests, - make_table, - create_iterator, get_git_commit_hash, simple_parse_args_string, eval_logger, @@ -91,7 +87,7 @@ def simple_evaluate( if gen_kwargs is not None: gen_kwargs = simple_parse_args_string(gen_kwargs) eval_logger.warning( - f"generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks." + "generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks." ) if gen_kwargs == "": gen_kwargs = None @@ -118,7 +114,9 @@ def simple_evaluate( use_cache # each rank receives a different cache db. # necessary to avoid multiple writes to cache at once - + "_rank" + str(lm.rank) + ".db", + + "_rank" + + str(lm.rank) + + ".db", ) task_dict = lm_eval.tasks.get_task_dict(tasks) @@ -513,9 +511,7 @@ def evaluate( ) + total_size * current_size / ( (total_size + current_size) * (total_size + current_size - 1) - ) * ( - results[group][metric] - metric_score - ) ** 2 + ) * (results[group][metric] - metric_score) ** 2 else: results[group][metric] = metric_score results[group][stderr] = var_score diff --git a/lm_eval/filters/__init__.py b/lm_eval/filters/__init__.py index c74ac01593..76eb78467e 100644 --- a/lm_eval/filters/__init__.py +++ b/lm_eval/filters/__init__.py @@ -32,7 +32,7 @@ def build_filter_ensemble(filter_name, components): Create a filtering pipeline. 
""" filters = [] - for (function, kwargs) in components: + for function, kwargs in components: if kwargs is None: f = get_filter(function)() else: diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py index 18b1b70a38..6e5b437875 100644 --- a/lm_eval/models/anthropic_llms.py +++ b/lm_eval/models/anthropic_llms.py @@ -1,9 +1,12 @@ -from lm_eval.api.model import LM -from lm_eval.api.registry import register_model -from tqdm import tqdm import time +from typing import Any, List, Tuple + +from tqdm import tqdm + from lm_eval import utils -from typing import List, Any, Tuple +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model + eval_logger = utils.eval_logger diff --git a/lm_eval/models/dummy.py b/lm_eval/models/dummy.py index b13a3900f9..d28435f7ea 100644 --- a/lm_eval/models/dummy.py +++ b/lm_eval/models/dummy.py @@ -1,4 +1,5 @@ import random + from lm_eval.api.model import LM from lm_eval.api.registry import register_model diff --git a/lm_eval/models/gguf.py b/lm_eval/models/gguf.py index 5ae154f39a..8eebc2e04f 100644 --- a/lm_eval/models/gguf.py +++ b/lm_eval/models/gguf.py @@ -1,11 +1,14 @@ -import requests import logging import time -from tqdm import tqdm + +import requests from requests.exceptions import RequestException +from tqdm import tqdm + from lm_eval.api.model import LM from lm_eval.api.registry import register_model + logger = logging.getLogger(__name__) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index b32ffc34e3..dc243a1a5c 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -1,29 +1,28 @@ +import copy import os -from packaging import version +from collections import defaultdict +from pathlib import Path +from typing import List, Literal, Optional, Tuple, Union + import torch +import torch.nn.functional as F import transformers +from accelerate import Accelerator, DistributedType, find_executable_batch_size +from packaging import version +from peft import PeftModel +from peft import __version__ as PEFT_VERSION +from tqdm import tqdm from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, ) -from peft import __version__ as PEFT_VERSION, PeftModel - -import copy -from collections import defaultdict -from tqdm import tqdm -from pathlib import Path - -import torch.nn.functional as F from lm_eval import utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM from lm_eval.api.registry import register_model +from lm_eval.utils import stop_sequences_criteria -from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria - -from accelerate import Accelerator, find_executable_batch_size, DistributedType -from typing import List, Optional, Union, Tuple, Literal eval_logger = utils.eval_logger @@ -107,9 +106,7 @@ def __init__( eval_logger.warning( "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way." ) - assert ( - not parallelize - ), "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" + assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" self._model = pretrained self._device = self._model.device @@ -279,10 +276,13 @@ def __init__( "with 'accelerate launch *script*'. 
" f"Current run will proceed with {accelerator.num_processes} devices." ) - assert accelerator.distributed_type in [ - DistributedType.FSDP, - DistributedType.MULTI_GPU, - ], "Unsupported distributed type provided. Only DDP and FSDP are supported." + assert ( + accelerator.distributed_type + in [ + DistributedType.FSDP, + DistributedType.MULTI_GPU, + ] + ), "Unsupported distributed type provided. Only DDP and FSDP are supported." if accelerator.distributed_type == DistributedType.FSDP: self._model = accelerator.prepare(self.model) else: @@ -417,7 +417,6 @@ def _get_config( revision: str = "main", trust_remote_code: bool = False, ) -> None: - self._config = transformers.AutoConfig.from_pretrained( pretrained, revision=revision, @@ -751,8 +750,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) @@ -995,9 +995,7 @@ def _collate(x): greedy_tokens = logits.argmax(dim=-1) cont_toks = torch.tensor( cont_toks, dtype=torch.long, device=self.device - ).unsqueeze( - 0 - ) # [1, seq] + ).unsqueeze(0) # [1, seq] max_equal = (greedy_tokens == cont_toks).all() # Obtain log-probs at the corresponding continuation token indices diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index b0f6a8f170..d63f8ab12a 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -1,9 +1,10 @@ +import copy import os import time -from typing import List, Tuple, Optional - -import copy from collections import defaultdict +from importlib.util import find_spec +from typing import List, Optional, Tuple + from tqdm import tqdm from lm_eval import utils @@ -44,13 +45,13 @@ def oa_completion(**kwargs): Retry with back-off until they respond """ - try: - import openai, tiktoken # noqa: E401 - except ModuleNotFoundError: + if not find_spec("openai") or not find_spec("tiktoken"): raise Exception( - "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ -please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`", + "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. " + "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`" ) + else: + import openai backoff_time = 3 while True: @@ -88,7 +89,8 @@ def __init__( super().__init__() self.seed = seed try: - import openai, tiktoken # noqa: E401 + import openai # noqa: E401 + import tiktoken except ModuleNotFoundError: raise Exception( "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. 
\ @@ -154,8 +156,9 @@ def loglikelihood(self, requests) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) @@ -326,13 +329,13 @@ def oa_chat_completion(client, **kwargs): Retry with back-off until they respond """ - try: - import openai, tiktoken # noqa: E401 - except ModuleNotFoundError: + if not find_spec("openai") or not find_spec("tiktoken"): raise Exception( - "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ -please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`", + "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. " + "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`" ) + else: + import openai async def _get_completions(**kwargs): chat_completions = await client.chat.completions.create(**kwargs) @@ -364,7 +367,8 @@ def __init__( """ super().__init__() try: - import openai, tiktoken # noqa: E401 + import openai # noqa: E401 + import tiktoken except ModuleNotFoundError: raise Exception( "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ diff --git a/lm_eval/models/textsynth.py b/lm_eval/models/textsynth.py index 379f11b902..32917d692c 100644 --- a/lm_eval/models/textsynth.py +++ b/lm_eval/models/textsynth.py @@ -13,9 +13,11 @@ """ import logging import os -import requests as _requests import time + +import requests as _requests from tqdm import tqdm + from lm_eval.api.model import LM from lm_eval.api.registry import register_model @@ -149,7 +151,7 @@ def generate_until(self, requests): self.cache_hook.add_partial("generate_until", (inp, request_args), s) else: logger.error( - f"The following response does not contain generated `text`. " + "The following response does not contain generated `text`. " "Got:\n{resp}" ) assert False diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index edab369411..e6a75ceb21 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -1,16 +1,19 @@ +import copy from collections import defaultdict -from typing import List, Tuple, Optional, Literal, Union, Any -from transformers import AutoTokenizer +from importlib.util import find_spec +from typing import List, Literal, Optional, Tuple, Union + +from tqdm import tqdm + +from lm_eval import utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM -import copy -from tqdm import tqdm from lm_eval.api.registry import register_model -from lm_eval import utils + try: - from vllm import LLM, SamplingParams from ray.util.multiprocessing import Pool + from vllm import LLM, SamplingParams from vllm.transformers_utils.tokenizer import get_tokenizer except ModuleNotFoundError: pass @@ -54,12 +57,10 @@ def __init__( ): super().__init__() - try: - import vllm - except ModuleNotFoundError: + if not find_spec("vllm"): raise Exception( - "attempted to use 'vllm' LM type, but package `vllm` is not installed. \ -please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`", + "attempted to use 'vllm' LM type, but package `vllm` is not installed. 
" + "Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`" ) assert "cuda" in device or device is None, "vLLM only supports CUDA" @@ -193,8 +194,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index d058a48776..d8b62e7deb 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -69,7 +69,6 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None def load_prompt_list( use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs ): - category_name, prompt_name = use_prompt.split(":") if category_name == "promptsource": @@ -113,7 +112,6 @@ def __init__(self, prompt_string): self.prompt_string = prompt_string def apply(self, doc): - doc_to_text = self.prompt_string["doc_to_text"] doc_to_target = self.prompt_string["doc_to_target"] diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 28563de6bf..ed92bd9755 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -180,7 +180,6 @@ def include_path(task_dir): def initialize_tasks(verbosity="INFO"): - eval_logger.setLevel(getattr(logging, f"{verbosity}")) task_dir = os.path.dirname(os.path.abspath(__file__)) + "/" diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index d2a53cfb6c..18a55c705a 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -24,7 +24,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -37,7 +36,6 @@ def parse_args(): dataset_path = "lukaemon/bbh" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): - resp = requests.get( f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt" ).content.decode("utf-8") diff --git a/lm_eval/tasks/belebele/_generate_configs.py b/lm_eval/tasks/belebele/_generate_configs.py index 9df56f5feb..052d55bea2 100644 --- a/lm_eval/tasks/belebele/_generate_configs.py +++ b/lm_eval/tasks/belebele/_generate_configs.py @@ -23,7 +23,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py index fa8619f40c..3fd5cd6c2b 100644 --- a/lm_eval/tasks/bigbench/generate_tasks.py +++ b/lm_eval/tasks/bigbench/generate_tasks.py @@ -173,7 +173,6 @@ def main() -> None: - for path, task_type in zip( ["multiple_choice", "generate_until"], ["multiple_choice_template_yaml", "generate_until_template_yaml"], diff --git a/lm_eval/tasks/blimp/generate_configs.py b/lm_eval/tasks/blimp/generate_configs.py index 4fa45db4d2..dfc4b4dc95 100644 --- a/lm_eval/tasks/blimp/generate_configs.py +++ b/lm_eval/tasks/blimp/generate_configs.py @@ -73,7 +73,6 @@ def main() -> None: for task in all_subtasks: - file_name = f"{task}.yaml" try: with open(f"{file_name}", "w") as f: diff --git a/lm_eval/tasks/ceval/_generate_configs.py b/lm_eval/tasks/ceval/_generate_configs.py index deaa0372c8..2b96e00713 100644 --- a/lm_eval/tasks/ceval/_generate_configs.py +++ b/lm_eval/tasks/ceval/_generate_configs.py @@ -75,7 +75,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -93,7 +92,9 @@ def parse_args(): if args.cot_prompt_path is not None: description = cot_file[subject_eng] else: - description = f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" + description = ( + f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" + ) yaml_dict = { "include": base_yaml_name, diff --git a/lm_eval/tasks/cmmlu/_generate_configs.py b/lm_eval/tasks/cmmlu/_generate_configs.py index 4b3dba75b1..07553bb1ea 100644 --- a/lm_eval/tasks/cmmlu/_generate_configs.py +++ b/lm_eval/tasks/cmmlu/_generate_configs.py @@ -90,7 +90,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
@@ -108,7 +107,9 @@ def parse_args(): if args.cot_prompt_path is not None: description = cot_file[subject_eng] else: - description = f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n" + description = ( + f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n" + ) yaml_dict = { "include": base_yaml_name, diff --git a/lm_eval/tasks/code_x_glue/code-text/bleu.py b/lm_eval/tasks/code_x_glue/code-text/bleu.py index 310c626c73..a90fc46b17 100644 --- a/lm_eval/tasks/code_x_glue/code-text/bleu.py +++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py @@ -1,9 +1,7 @@ #!/usr/bin/python -import os import re import sys import math -import subprocess import xml.sax.saxutils from typing import List, Pattern, Tuple, Union, Dict, Any, Optional @@ -65,14 +63,14 @@ def normalize(s): if type(s) is not str: s = " ".join(s) # language-independent part: - for (pattern, replace) in normalize1: + for pattern, replace in normalize1: s = re.sub(pattern, replace, s) s = xml.sax.saxutils.unescape(s, {"&quot;": '"'}) # language-dependent part (assuming Western languages): s = " %s " % s if not preserve_case: s = s.lower() # this might not be identical to the original - for (pattern, replace) in normalize2: + for pattern, replace in normalize2: s = re.sub(pattern, replace, s) return s.split() @@ -95,7 +93,7 @@ def cook_refs(refs, n=4): maxcounts: Dict[Tuple[str], int] = {} for ref in refs: counts = count_ngrams(ref, n) - for (ngram, count) in counts.items(): + for ngram, count in counts.items(): maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) return ([len(ref) for ref in refs], maxcounts) @@ -125,7 +123,7 @@ def cook_test(test, item, n=4): result["correct"] = [0] * n counts = count_ngrams(test, n) - for (ngram, count) in counts.items(): + for ngram, count in counts.items(): result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) return result @@ -222,7 +220,6 @@ def bleuFromMaps(m1, m2): def smoothed_bleu_4(references, predictions, **kwargs): - predictionMap = {} goldMap = {} diff --git a/lm_eval/tasks/code_x_glue/code-text/utils.py b/lm_eval/tasks/code_x_glue/code-text/utils.py index 981a00b912..6975684259 100644 --- a/lm_eval/tasks/code_x_glue/code-text/utils.py +++ b/lm_eval/tasks/code_x_glue/code-text/utils.py @@ -1,5 +1,4 @@ def doc_to_text(doc): - inputs = " ".join(doc["code_tokens"]).replace("\n", " ") inputs = " ".join(inputs.strip().split()) @@ -7,7 +6,6 @@ def doc_to_text(doc): def doc_to_target(doc): - targets = " ".join(doc["docstring_tokens"]).replace("\n", "") targets = " ".join(targets.strip().split()) diff --git a/lm_eval/tasks/coqa/utils.py b/lm_eval/tasks/coqa/utils.py index 4fed8ff8c2..29911cfec5 100644 --- a/lm_eval/tasks/coqa/utils.py +++ b/lm_eval/tasks/coqa/utils.py @@ -7,7 +7,7 @@ def doc_to_text(doc): # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1} # and a question qi, the task is to predict the answer ai doc_text = doc["story"] + "\n\n" - for (q, a) in zip_longest( + for q, a in zip_longest( doc["questions"]["input_text"], doc["answers"]["input_text"][:-1] ): # omit target answer ai question = f"Q: {q}\n\n" @@ -17,7 +17,6 @@ def doc_to_text(doc): def doc_to_target(doc): - turn_id = len(doc["questions"]["input_text"]) # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
answers = [] @@ -71,7 +70,6 @@ def compute_scores(gold_list, pred): def process_results(doc, results): - gold_list = doc_to_target(doc) pred = results[0].strip().split("\n")[0] diff --git a/lm_eval/tasks/csatqa/_generate_configs.py b/lm_eval/tasks/csatqa/_generate_configs.py index ca2bfc436e..56fe825a90 100644 --- a/lm_eval/tasks/csatqa/_generate_configs.py +++ b/lm_eval/tasks/csatqa/_generate_configs.py @@ -21,7 +21,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -30,7 +29,6 @@ def parse_args(): base_yaml = yaml.full_load(f) for name in tqdm(SUBSETS): - yaml_dict = { "include": base_yaml_name, "task": f"csatqa_{args.task_prefix}_{name}" diff --git a/lm_eval/tasks/drop/utils.py b/lm_eval/tasks/drop/utils.py index 1e2888ce3e..03f7218c90 100644 --- a/lm_eval/tasks/drop/utils.py +++ b/lm_eval/tasks/drop/utils.py @@ -62,7 +62,6 @@ def parse_answer(answer): def process_results(doc, results): - preds, golds = results, doc["answers"] max_em = 0 max_f1 = 0 diff --git a/lm_eval/tasks/ifeval/instructions_registry.py b/lm_eval/tasks/ifeval/instructions_registry.py index 1056b139e2..ecb20e9b23 100644 --- a/lm_eval/tasks/ifeval/instructions_registry.py +++ b/lm_eval/tasks/ifeval/instructions_registry.py @@ -78,8 +78,7 @@ # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, - _LANGUAGE - + "response_language": { + _LANGUAGE + "response_language": { _LANGUAGE + "response_language", _FORMAT + "multiple_sections", _KEYWORD + "existence", @@ -90,16 +89,14 @@ _CHANGE_CASES + "english_lowercase", }, _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, - _LENGTH - + "number_paragraphs": { + _LENGTH + "number_paragraphs": { _LENGTH + "number_paragraphs", _LENGTH + "nth_paragraph_first_word", _LENGTH + "number_sentences", _LENGTH + "nth_paragraph_first_word", }, _LENGTH + "number_words": {_LENGTH + "number_words"}, - _LENGTH - + "nth_paragraph_first_word": { + _LENGTH + "nth_paragraph_first_word": { _LENGTH + "nth_paragraph_first_word", _LENGTH + "number_paragraphs", }, @@ -110,23 +107,20 @@ # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, - _FORMAT - + "multiple_sections": { + _FORMAT + "multiple_sections": { _FORMAT + "multiple_sections", _LANGUAGE + "response_language", _FORMAT + "number_highlighted_sections", }, # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. # _FORMAT + "rephrase": instructions.RephraseChecker, - _FORMAT - + "json_format": set(INSTRUCTION_DICT.keys()).difference( + _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} ), _FORMAT + "title": {_FORMAT + "title"}, # TODO(tianjianlu): Re-enable with specific prompts. 
# _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, - _COMBINATION - + "two_responses": set(INSTRUCTION_DICT.keys()).difference( + _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( { _KEYWORD + "forbidden_words", _KEYWORD + "existence", @@ -135,20 +129,17 @@ _PUNCTUATION + "no_comma", } ), - _COMBINATION - + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( + _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} ), _STARTEND + "end_checker": {_STARTEND + "end_checker"}, - _CHANGE_CASES - + "capital_word_frequency": { + _CHANGE_CASES + "capital_word_frequency": { _CHANGE_CASES + "capital_word_frequency", _CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_capital", }, _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, - _CHANGE_CASES - + "english_lowercase": { + _CHANGE_CASES + "english_lowercase": { _CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_capital", }, diff --git a/lm_eval/tasks/ifeval/instructions_util.py b/lm_eval/tasks/ifeval/instructions_util.py index 2390cba305..ccb531f96e 100644 --- a/lm_eval/tasks/ifeval/instructions_util.py +++ b/lm_eval/tasks/ifeval/instructions_util.py @@ -17,7 +17,6 @@ import functools import random import re -from typing import List import immutabledict import nltk diff --git a/lm_eval/tasks/mgsm/utils.py b/lm_eval/tasks/mgsm/utils.py index 97affac765..3edc78ab28 100644 --- a/lm_eval/tasks/mgsm/utils.py +++ b/lm_eval/tasks/mgsm/utils.py @@ -94,7 +94,6 @@ def add_regex_pattern(regex_pattern): - if regex_pattern is None: return {} return { diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index 2bf27ac0f7..e6271bc4c2 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -7,7 +7,6 @@ from tqdm import tqdm -from lm_eval import utils from lm_eval.logger import eval_logger SUBJECTS = { @@ -82,7 +81,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs. 
@@ -98,7 +96,6 @@ def parse_args(): ALL_CATEGORIES = [] for subject, category in tqdm(SUBJECTS.items()): - if category not in ALL_CATEGORIES: ALL_CATEGORIES.append(category) diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py index ca199226a8..aecb40a5eb 100644 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py @@ -1,12 +1,10 @@ import yaml -import inspect import datasets from tqdm import tqdm def main() -> None: - dataset_path = "EleutherAI/advanced_ai_risk" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): file_name = f"{task}.yaml" diff --git a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py index a21f28309b..7aff892f03 100644 --- a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py +++ b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py @@ -1,12 +1,10 @@ import yaml -import inspect import datasets from tqdm import tqdm def main() -> None: - dataset_path = "EleutherAI/persona" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): file_name = f"{task}.yaml" diff --git a/lm_eval/tasks/paws-x/_generate_config.py b/lm_eval/tasks/paws-x/_generate_config.py index bff82e4ff0..a1341fec89 100644 --- a/lm_eval/tasks/paws-x/_generate_config.py +++ b/lm_eval/tasks/paws-x/_generate_config.py @@ -1,5 +1,4 @@ import argparse -from typing import Dict, List import yaml diff --git a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py index 51c198703f..0dccf9408a 100644 --- a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py +++ b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py @@ -1,5 +1,6 @@ def doc_to_text(doc) -> str: ctxs = "\n".join(doc["CONTEXTS"]) return "Abstract: {}\nQuestion: {}\nAnswer:".format( - ctxs, doc["QUESTION"], doc["final_decision"] + ctxs, + doc["QUESTION"], ) diff --git a/lm_eval/tasks/qasper/utils.py b/lm_eval/tasks/qasper/utils.py index be6f79dcad..7a02237a78 100644 --- a/lm_eval/tasks/qasper/utils.py +++ b/lm_eval/tasks/qasper/utils.py @@ -3,7 +3,6 @@ def process_docs(dataset, set_answer_type="bool"): - FEATURES = ["title", "abstract", "question", "answer", "answer_type"] def _categorise_answer(answer_blob): diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index e44296a4e0..829e97bde0 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -235,7 +235,6 @@ def process_results(self, doc, results): } def construct_requests(self, doc, ctx, **kwargs): - request_list = [ Instance( request_type="loglikelihood", diff --git a/lm_eval/tasks/squadv2/task.py b/lm_eval/tasks/squadv2/task.py index 4630e2a161..ba308acd43 100644 --- a/lm_eval/tasks/squadv2/task.py +++ b/lm_eval/tasks/squadv2/task.py @@ -14,7 +14,6 @@ Homepage: https://rajpurkar.github.io/SQuAD-explorer/ """ import datasets -from evaluate import load from math import exp from functools import partial @@ -120,14 +119,14 @@ def construct_requests(self, doc, ctx, **kwargs): doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, - **kwargs + **kwargs, ), Instance( request_type="loglikelihood", doc=doc, arguments=(ctx, " " + "unanswerable"), idx=0, - **kwargs + **kwargs, ), ] diff --git a/lm_eval/tasks/super_glue/cb/t5_utils.py b/lm_eval/tasks/super_glue/cb/t5_utils.py index 43eafce9d6..ec02e34538 100644 --- 
a/lm_eval/tasks/super_glue/cb/t5_utils.py +++ b/lm_eval/tasks/super_glue/cb/t5_utils.py @@ -2,7 +2,6 @@ def mean_3class_f1(predictions, references): # This is a passthrough function - string_label = ["entailment", "contradiction", "neutral"] predictions = ( string_label.index(predictions[0]) if predictions[0] in string_label else 0 @@ -13,7 +12,6 @@ def mean_3class_f1(predictions, references): # This is a passthrough function def agg_mean_3class_f1(items): - predictions, references = zip(*items) """Computes the unweighted average of the F1 per class.""" diff --git a/lm_eval/tasks/super_glue/multirc/t5_utils.py b/lm_eval/tasks/super_glue/multirc/t5_utils.py index ac99aaf962..d17d498fa2 100644 --- a/lm_eval/tasks/super_glue/multirc/t5_utils.py +++ b/lm_eval/tasks/super_glue/multirc/t5_utils.py @@ -5,7 +5,6 @@ def f1(predictions, references): # This is a passthrough function - _prediction = predictions[0] _reference = references[0].split("_")[-1] string_label = ["False", "True"] @@ -20,7 +19,6 @@ def f1(predictions, references): # This is a passthrough function def agg_f1(items): - predictions, references = zip(*items) references, predictions = np.asarray(references), np.asarray(predictions) @@ -28,7 +26,6 @@ def agg_f1(items): def em(predictions, references): # This is a passthrough function - _prediction = predictions[0] _group, _reference = references[0].split("_") string_label = ["False", "True"] diff --git a/lm_eval/tasks/super_glue/record/t5_utils.py b/lm_eval/tasks/super_glue/record/t5_utils.py index 98730cacd4..68301b18b3 100644 --- a/lm_eval/tasks/super_glue/record/t5_utils.py +++ b/lm_eval/tasks/super_glue/record/t5_utils.py @@ -3,14 +3,12 @@ import collections import numpy as np -from tqdm import tqdm -from datasets import Dataset, concatenate_datasets +from datasets import Dataset from lm_eval.api.metrics import metric_max_over_ground_truths def doc_to_text(doc): - passage = doc["passage"] passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage) passage = re.sub(r"\n@highlight\n", ". 
", passage) @@ -34,7 +32,6 @@ def split_answers(doc): } answers = doc.pop("answers") for idx, answer in enumerate(answers): - for key in split_doc.keys(): if key in doc: split_doc[key].append(doc[key]) diff --git a/lm_eval/tasks/super_glue/wsc/t5_utils.py b/lm_eval/tasks/super_glue/wsc/t5_utils.py index 7e55a52a7b..eb5331a42a 100644 --- a/lm_eval/tasks/super_glue/wsc/t5_utils.py +++ b/lm_eval/tasks/super_glue/wsc/t5_utils.py @@ -8,7 +8,6 @@ def doc_to_text(x): def _wsc_inputs(x): - words = x["text"].split(" ") # We would need some special logic to handle the case where the pronoun is the @@ -55,7 +54,6 @@ def create_input(): class WSCPostprocess(Filter): def __init__(self, **kwargs): - self.determiners = { "a", "an", @@ -86,10 +84,8 @@ def clean(self, s): return " ".join([w for w in s.split(" ") if w not in self.determiners]) def apply(self, resps, docs): - filtered_resps = [] for prediction, reference in zip(*(resps, docs["span1_text"])): - prediction = self.clean(prediction[0]) reference = self.clean(reference) diff --git a/lm_eval/tasks/translation/utils.py b/lm_eval/tasks/translation/utils.py index f80ae89a4f..f30c4d8625 100644 --- a/lm_eval/tasks/translation/utils.py +++ b/lm_eval/tasks/translation/utils.py @@ -1,9 +1,7 @@ import argparse -from typing import Dict, List import yaml -import sacrebleu try: import pycountry diff --git a/lm_eval/tasks/truthfulqa/utils.py b/lm_eval/tasks/truthfulqa/utils.py index 8c011d2d10..8e2ab43fe8 100644 --- a/lm_eval/tasks/truthfulqa/utils.py +++ b/lm_eval/tasks/truthfulqa/utils.py @@ -6,7 +6,6 @@ def process_results_mc2(doc, results): - lls, is_greedy = zip(*results) # Split on the first `0` as everything before it is true (`1`). @@ -20,7 +19,6 @@ def process_results_mc2(doc, results): def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset: - return dataset.map(preprocess_function) @@ -49,7 +47,6 @@ def _format_answers(answers): def process_results_gen(doc, results): - completion = results[0] true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] all_refs = true_refs + false_refs diff --git a/lm_eval/tasks/xnli/utils.py b/lm_eval/tasks/xnli/utils.py index fa7806fc74..2844d1d7c8 100644 --- a/lm_eval/tasks/xnli/utils.py +++ b/lm_eval/tasks/xnli/utils.py @@ -1,5 +1,4 @@ import argparse -from typing import Dict, List import yaml diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 4067669c0d..74f4f482da 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -1,25 +1,23 @@ -import os -import re -import sys -import yaml +import collections +import fnmatch +import functools +import gc +import importlib.util import inspect +import logging +import os import pathlib -import functools +import re import subprocess -import collections -import importlib.util -import fnmatch - -from typing import Iterator, List, Literal, Union, Any, Callable +import sys +from itertools import islice +from typing import Any, Callable, Iterator, List, Literal, Union -import gc import torch import transformers - +import yaml from jinja2 import BaseLoader, Environment, StrictUndefined -from itertools import islice -import logging logging.basicConfig( format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", @@ -143,7 +141,7 @@ def __init__(self, choices) -> None: def __contains__(self, values) -> bool: for value in values.split(","): if len(fnmatch.filter(self.choices, value)) == 0: - eval_logger.info(f"Available tasks to choose:") + eval_logger.info("Available tasks to choose:") for choice in self.choices: 
eval_logger.info(f" - {choice}") raise ValueError("'{}' is not in task list".format(value)) @@ -157,7 +155,7 @@ def __iter__(self) -> Iterator: # Returns a list containing all values of the source_list that # match at least one of the patterns def pattern_match(patterns, source_list): - if type(patterns) == str: + if isinstance(patterns, str): patterns = [patterns] task_names = set() @@ -332,7 +330,7 @@ def get_original(self, grouped_dict): def make_table(result_dict, column: str = "results"): """Generate table of results.""" - from pytablewriter import MarkdownTableWriter, LatexTableWriter + from pytablewriter import LatexTableWriter, MarkdownTableWriter if column == "results": column_name = "Tasks" @@ -466,7 +464,7 @@ def import_function(loader, node): yaml_path = os.path.dirname(loader.name) *module_name, function_name = function_name.split(".") - if type(module_name) == list: + if isinstance(module_name, list): module_name = ".".join(module_name) module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name))) @@ -496,7 +494,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): include_path = yaml_config["include"] del yaml_config["include"] - if type(include_path) == str: + if isinstance(include_path, str): include_path = [include_path] # Load from the last one first diff --git a/mypy.ini b/mypy.ini index 76a0c86452..2d20dd2cc5 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,21 +9,19 @@ warn_unused_ignores = True warn_redundant_casts = True # We ignore errors everywhere to gradually add type annotations - -[mypy-lm_eval.*] -ignore_errors = True - -[mypy-lm_eval.api.*] -ignore_errors = True - -[mypy-lm_eval.prompts.*] -ignore_errors = True - -[mypy-lm_eval.models.*] -ignore_errors = True - -[mypy-scripts.*] -ignore_errors = True - -[mypy-main] -ignore_errors = True +# [mypy-lm_eval.*] +# ignore_errors = True +# +# [mypy-lm_eval.api.*] +# ignore_errors = True +# +# [mypy-lm_eval.prompts.*] +# ignore_errors = True +# +# [mypy-lm_eval.models.*] +# ignore_errors = True +# +# [mypy-scripts.*] +# ignore_errors = True +# +# [mypy-main] diff --git a/pyproject.toml b/pyproject.toml index 5a4d191d7c..87eefc72d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,14 +54,7 @@ Homepage = "https://github.com/EleutherAI/lm-evaluation-harness" Repository = "https://github.com/EleutherAI/lm-evaluation-harness" [project.optional-dependencies] -dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"] -linting = [ - "flake8", - "pylint", - "mypy", - "pre-commit", -] -testing = ["pytest", "pytest-cov", "pytest-xdist"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"] multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"] sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"] @@ -88,3 +81,17 @@ all = [ "lm_eval[ifeval]", "lm_eval[zeno]", ] + +[tool.ruff] +extend-exclude = ["lm_eval/evaluator.py", "lm_eval/tasks/*.py"] + +[tool.ruff.lint] +extend-select = ["I"] + +[tool.ruff.isort] +lines-after-imports = 2 +known-first-party = ["lm_eval"] + +[tool.ruff.extend-per-file-ignores] +"__init__.py" = ["F401","F402","F403","I"] +"lm_eval/tasks/*"= ["E721"] diff --git a/scripts/build_benchmark.py b/scripts/build_benchmark.py index 4cd07dd3eb..ce4b661681 100644 --- a/scripts/build_benchmark.py +++ b/scripts/build_benchmark.py @@ -1,15 +1,14 @@ -import os -import yaml import argparse +import os -from tqdm import tqdm +import yaml from promptsource.templates import DatasetTemplates - 
-from lm_eval import utils +from tqdm import tqdm # from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger + # from lm_eval.tasks import include_task_folder @@ -22,7 +21,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() with open(args.benchmark_path) as file: diff --git a/scripts/clean_training_data/compress_and_package.py b/scripts/clean_training_data/compress_and_package.py index c9e7f2593c..d4af5ba5f3 100644 --- a/scripts/clean_training_data/compress_and_package.py +++ b/scripts/clean_training_data/compress_and_package.py @@ -1,15 +1,15 @@ -import glob import argparse +import glob +import logging import os -import subprocess import shutil +import subprocess from tqdm import tqdm from tqdm_multiprocess import TqdmMultiProcessPool - -import logging from tqdm_multiprocess.logger import setup_logger_tqdm + logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ def compress_and_move(working_directory, output_directory, process_count): tasks = [] bucket_file_paths = glob.glob( - os.path.join(working_directory, "output", f"*.bkt.txt.sorted") + os.path.join(working_directory, "output", "*.bkt.txt.sorted") ) for bucket_file_path in bucket_file_paths: task = (process_task, (working_directory, output_directory, bucket_file_path)) diff --git a/scripts/clean_training_data/generate_13_grams.py b/scripts/clean_training_data/generate_13_grams.py index 27037e394d..66fa0ff45b 100644 --- a/scripts/clean_training_data/generate_13_grams.py +++ b/scripts/clean_training_data/generate_13_grams.py @@ -21,22 +21,22 @@ """ import argparse +import glob import json -import pickle +import logging import os +import pickle +import signal import sys from pathlib import Path -import glob -import signal from signal import SIGINT from tqdm import tqdm +from tqdm_multiprocess.logger import setup_logger_tqdm +from lm_eval.decontamination.archiver import Reader, TextArchive from lm_eval.decontamination.janitor import Janitor, word_ngrams -from lm_eval.decontamination.archiver import TextArchive, Reader -import logging -from tqdm_multiprocess.logger import setup_logger_tqdm logger = logging.getLogger(__name__) @@ -89,7 +89,7 @@ def __init__(self, directory, num_buckets): os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets) ] self.buckets = list(map(TextArchive, self.bucket_files)) - self.checkpoint_file = os.path.join(directory, f"bucket_offsets.ckpt") + self.checkpoint_file = os.path.join(directory, "bucket_offsets.ckpt") if os.path.exists(self.checkpoint_file): self.bucket_offsets = pickle.load(open(self.checkpoint_file, "rb")) @@ -119,7 +119,6 @@ def close_buckets(self): def do_ngrams_in_buckets(n_value, working_directory, bucket_count): - pile_statistics = json.load(open("pile_statistics.json", "r")) pile_document_count = pile_statistics["Document Count"] start_offsets = pile_statistics["File Start Offsets"] @@ -130,13 +129,13 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): logger.info(f"Generating {n_value}-grams and bucketing.") # Done file - done_file = os.path.join(output_directory, f"ngram_buckets.done") + done_file = os.path.join(output_directory, "ngram_buckets.done") if os.path.exists(done_file): logger.info("ngrams already generated and bucketed, skipping") return # Checkpoint - checkpoint_file = os.path.join(working_directory, f"pile_offset.ckpt") + checkpoint_file = os.path.join(working_directory, "pile_offset.ckpt") if os.path.exists(checkpoint_file): checkpoint_offset = pickle.load(open(checkpoint_file, "rb")) 
iterate = True diff --git a/scripts/clean_training_data/investigate_pile.py b/scripts/clean_training_data/investigate_pile.py index dccd3abe70..c1d348d463 100644 --- a/scripts/clean_training_data/investigate_pile.py +++ b/scripts/clean_training_data/investigate_pile.py @@ -1,12 +1,13 @@ -from lm_eval.decontamination.archiver import Reader -import os +import glob import json +import os from functools import reduce -import glob -import tqdm +import tqdm from tqdm_multiprocess import TqdmMultiProcessPool +from lm_eval.decontamination.archiver import Reader + def get_file_stats(file_path, tqdm_func, global_tqdm): reader = Reader() diff --git a/scripts/clean_training_data/process_sorted_buckets.py b/scripts/clean_training_data/process_sorted_buckets.py index 1e145f9198..9d345d8e86 100644 --- a/scripts/clean_training_data/process_sorted_buckets.py +++ b/scripts/clean_training_data/process_sorted_buckets.py @@ -15,18 +15,18 @@ import argparse import glob +import logging import os -from pathlib import Path import re import shutil +from pathlib import Path from tqdm import tqdm from tqdm_multiprocess import TqdmMultiProcessPool +from tqdm_multiprocess.logger import setup_logger_tqdm -from scripts.clean_training_data.archiver import TextReader, TextArchive +from scripts.clean_training_data.archiver import TextArchive, TextReader -import logging -from tqdm_multiprocess.logger import setup_logger_tqdm logger = logging.getLogger(__name__) @@ -35,7 +35,6 @@ def process_bucket( bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm ): - bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path)) # noqa: W605 done_file = os.path.join( processed_directory, f"ngram_bucket_processing_{bucket_id}.done" @@ -96,7 +95,7 @@ def process_bucket( def process_sorted_buckets(working_directory, move_dir, process_count): - bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted")) + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt.sorted")) processed_directory = os.path.join(working_directory, "processed") os.makedirs(processed_directory, exist_ok=True) @@ -123,7 +122,6 @@ def on_error(_): parser.add_argument("-procs", "--process_count", type=int, default=4) if __name__ == "__main__": - logfile_path = "process13grams.log" setup_logger_tqdm(logfile_path) diff --git a/scripts/clean_training_data/sort_13_gram_buckets.py b/scripts/clean_training_data/sort_13_gram_buckets.py index 07a2eedcd0..83990de822 100644 --- a/scripts/clean_training_data/sort_13_gram_buckets.py +++ b/scripts/clean_training_data/sort_13_gram_buckets.py @@ -8,18 +8,18 @@ directory and the unsorted buckets are removed after. 
""" -import glob import argparse +import glob +import logging import os import signal -from signal import SIGINT import subprocess +from signal import SIGINT from tqdm import tqdm - -import logging from tqdm_multiprocess.logger import setup_logger_tqdm + logger = logging.getLogger(__name__) terminate = False @@ -31,7 +31,7 @@ def handler(signal_received, frame): def sort_13_gram_buckets(working_directory): - bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt")) + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt")) for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): sorted_file_path = bucket_file_path + ".sorted" @@ -49,7 +49,6 @@ def sort_13_gram_buckets(working_directory): parser.add_argument("-dir", "--working_directory", default="") if __name__ == "__main__": - version = 1.00 print(f"Running version {version}") diff --git a/scripts/cost_estimate.py b/scripts/cost_estimate.py index 72b8d4b358..6fb64504e8 100644 --- a/scripts/cost_estimate.py +++ b/scripts/cost_estimate.py @@ -1,6 +1,8 @@ import random + import transformers -from lm_eval import tasks, evaluator + +from lm_eval import evaluator, tasks from lm_eval.base import LM diff --git a/scripts/get_prompts.py b/scripts/get_prompts.py index 06e2f89c13..d262ec37e4 100644 --- a/scripts/get_prompts.py +++ b/scripts/get_prompts.py @@ -1,6 +1,8 @@ -from lm_eval import tasks from itertools import islice +from lm_eval import tasks + + ct = 3 for ( diff --git a/scripts/make_gpt2_test_cases.py b/scripts/make_gpt2_test_cases.py index 361bc2ecd6..0c1a4bffe0 100644 --- a/scripts/make_gpt2_test_cases.py +++ b/scripts/make_gpt2_test_cases.py @@ -1,8 +1,9 @@ -import transformers +import random import torch import torch.nn.functional as F -import random +import transformers + random.seed(42) diff --git a/scripts/make_table_results.py b/scripts/make_table_results.py index 690658ccea..72af524ffe 100644 --- a/scripts/make_table_results.py +++ b/scripts/make_table_results.py @@ -2,10 +2,11 @@ Usage: python make_table_tasks.py --output """ +import json import logging -from pytablewriter import MarkdownTableWriter, LatexTableWriter import os -import json + +from pytablewriter import LatexTableWriter, MarkdownTableWriter logging.basicConfig(level=logging.INFO) diff --git a/scripts/make_table_tasks.py b/scripts/make_table_tasks.py index d68d8fe219..ded7c1a596 100644 --- a/scripts/make_table_tasks.py +++ b/scripts/make_table_tasks.py @@ -4,9 +4,11 @@ """ import argparse import logging -from lm_eval import tasks + from pytablewriter import MarkdownTableWriter +from lm_eval import tasks + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/scripts/model_comparator.py b/scripts/model_comparator.py index f3cbd320f4..b1aeb142b7 100644 --- a/scripts/model_comparator.py +++ b/scripts/model_comparator.py @@ -1,13 +1,15 @@ import argparse +import os +from typing import Dict, List, Tuple + import numpy as np -import lm_eval.evaluator -from lm_eval import tasks -from lm_eval import utils -import scipy.stats -from typing import Tuple, Dict, List import pandas as pd +import scipy.stats import torch -import os + +import lm_eval.evaluator +from lm_eval import tasks, utils + os.environ["TOKENIZERS_PARALLELISM"] = "false" eval_logger = utils.eval_logger diff --git a/scripts/regression.py b/scripts/regression.py index ef85d0c75e..2b8167c0eb 100644 --- a/scripts/regression.py +++ b/scripts/regression.py @@ -5,7 +5,7 @@ import time from pathlib import Path -from lm_eval import 
evaluator, utils +from lm_eval import utils from lm_eval.api.registry import ALL_TASKS @@ -136,14 +136,16 @@ def main(): args = parse_args() args.branches = ( - args.branches.split(",") if type(args.branches) == str else args.branches + args.branches.split(",") if isinstance(args.branches, str) else args.branches + ) + args.models = ( + args.models.split(",") if isinstance(args.models, str) else args.models ) - args.models = args.models.split(",") if type(args.models) == str else args.models args.tasks = ( ALL_TASKS if args.tasks == "all_tasks" else utils.pattern_match(args.tasks.split(","), ALL_TASKS) - if type(args.tasks) == str + if isinstance(args.tasks, str) else args.tasks ) diff --git a/scripts/write_out.py b/scripts/write_out.py index eb81e6732b..360b0b6271 100644 --- a/scripts/write_out.py +++ b/scripts/write_out.py @@ -1,11 +1,13 @@ import argparse -import numpy as np -import json import os import random + +import numpy as np + from lm_eval import tasks -from lm_eval.utils import join_iters, eval_logger -from lm_eval.tasks import initialize_tasks, include_path +from lm_eval.tasks import include_path, initialize_tasks +from lm_eval.utils import eval_logger, join_iters + EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n" diff --git a/setup.py b/setup.py index dbe4675d06..b5d8fabb86 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ import setuptools + # This is to make sure that the package supports editable installs setuptools.setup() diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py index 6d186676fe..186b2305e6 100644 --- a/tests/models/test_gguf.py +++ b/tests/models/test_gguf.py @@ -1,12 +1,13 @@ -import unittest -from unittest.mock import patch import hashlib import json import os import pickle -from lm_eval.models.gguf import GGUFLM +import unittest +from unittest.mock import patch from lm_eval.api.instance import Instance +from lm_eval.models.gguf import GGUFLM + base_url = "https://matthoffner-ggml-llm-api.hf.space" diff --git a/tests/models/test_huggingface.py b/tests/models/test_huggingface.py index 557ad05124..323d664af8 100644 --- a/tests/models/test_huggingface.py +++ b/tests/models/test_huggingface.py @@ -1,13 +1,16 @@ from __future__ import annotations -import pytest + +import sys from pathlib import Path + import numpy as np -from lm_eval.models.huggingface import HFLM -from lm_eval.api.instance import Instance -import lm_eval.tasks as tasks -import sys import torch +import lm_eval.tasks as tasks +from lm_eval.api.instance import Instance +from lm_eval.models.huggingface import HFLM + + tasks.initialize_tasks() @@ -106,9 +109,10 @@ def test_logliklihood(self) -> None: f.write("\n".join(str(x) for x in _res)) assert np.allclose(_res, _RES, atol=1e-2) # check indices for Multiple Choice - argmax_RES, argmax_res = np.argmax( - np.array(_RES).reshape(-1, 4), axis=1 - ), np.argmax(np.array(_res).reshape(-1, 4), axis=1) + argmax_RES, argmax_res = ( + np.argmax(np.array(_RES).reshape(-1, 4), axis=1), + np.argmax(np.array(_res).reshape(-1, 4), axis=1), + ) assert (argmax_RES == argmax_res).all() def test_generate_until(self) -> None: diff --git a/tests/models/test_vllm.py b/tests/models/test_vllm.py index 61a024ce71..1da8a48762 100644 --- a/tests/models/test_vllm.py +++ b/tests/models/test_vllm.py @@ -1,10 +1,11 @@ -import pytest from typing import List -from lm_eval.api.instance import Instance -import lm_eval.tasks as tasks -import sys + +import pytest import torch +import lm_eval.tasks as tasks +from lm_eval.api.instance import Instance + 
@pytest.mark.skip(reason="requires CUDA") class TEST_VLLM: diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 7f30e21f43..825f57413d 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -1,15 +1,13 @@ -import os - # import lm_eval.base as base -import lm_eval.api.registry as registry -import lm_eval.tasks as tasks +from typing import List + +import pytest # import lm_eval.models as models import lm_eval.api as api import lm_eval.evaluator as evaluator -from typing import List -import random -import pytest +import lm_eval.tasks as tasks + tasks.initialize_tasks() diff --git a/tests/test_janitor.py b/tests/test_janitor.py index b496bfadd1..19ba611dfb 100644 --- a/tests/test_janitor.py +++ b/tests/test_janitor.py @@ -1,11 +1,10 @@ -import re from collections import defaultdict from lm_eval.decontamination.janitor import ( Janitor, form_ngrams, - word_ngrams, split_indices, + word_ngrams, word_ngrams_indices, ) @@ -81,7 +80,6 @@ def test_split_indices(): def test_word_ngrams_indices(): - sequence = ( "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." @@ -119,9 +117,9 @@ def test_word_ngrams_indices(): # Assumptions from GPT3 Paper: # the 200 characters to remove include punctuation and is actually a half-window + # All tests below initially test without any registered contaminants, expecting the same sequence back. def test_janitor1(): - # First test using a 1gram and expected the first block before the filth to have some remaining # characters, but the second block should be completely removed. @@ -165,7 +163,6 @@ def test_janitor1(): def test_janitor2(): - # Second test using a 1gram and expected the first block before the filth to have some remaining # characters, and the second block is longer then 200 characters so should also have some remaining. @@ -214,7 +211,6 @@ def test_janitor2(): def test_janitor3(): - # Same test as above but with a 6gram. sequence = ( @@ -262,7 +258,6 @@ def test_janitor3(): def test_janitor4(): - # This test adds another block to that from the previous. The middle block should be entirely # removed as the 200 characters are removed from each side. @@ -318,7 +313,6 @@ def test_janitor4(): def test_janitor5(): - # Same as above but using multiple different filth 6grams. sequence = ( @@ -374,7 +368,6 @@ def test_janitor5(): def test_janitor6(): - # Same as above but now we add 10 filths and expect the same result, the following test does 11. sequence = ( @@ -438,7 +431,6 @@ def test_janitor6(): def test_janitor7(): - # Same as above but now we add 9 filths and expect the same result, the following test does 10. 
sequence = ( diff --git a/tests/test_misc.py b/tests/test_misc.py index 149a65f4c3..30267f63d0 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1,6 +1,8 @@ +import random + import pytest + import lm_eval.api.metrics as metrics -import random def test_bootstrapping(): diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 41504430d5..3651fd5ab3 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -1,9 +1,13 @@ from itertools import islice + import pytest -from .utils import new_tasks + import lm_eval.tasks as tasks from lm_eval.api.task import ConfigurableTask +from .utils import new_tasks + + tasks.initialize_tasks() # Default Task TASKS = ["arc_easy"] diff --git a/tests/tests_master/test_description.py b/tests/tests_master/test_description.py index fdf7bf5db0..2503bcea4b 100644 --- a/tests/tests_master/test_description.py +++ b/tests/tests_master/test_description.py @@ -1,6 +1,7 @@ import random -import lm_eval.tasks + import lm_eval.models +import lm_eval.tasks def test_description(): @@ -14,7 +15,6 @@ def test_description(): task_dict = lm_eval.tasks.get_task_dict(task_names) for task_name, task in task_dict.items(): - # patch description field in task (# TODO: make this much more cleaned up) task._config.description = description_dict[task_name] diff --git a/tests/tests_master/test_generate_13_grams.py b/tests/tests_master/test_generate_13_grams.py index 26cd890369..722e69a77e 100644 --- a/tests/tests_master/test_generate_13_grams.py +++ b/tests/tests_master/test_generate_13_grams.py @@ -1,13 +1,13 @@ +import glob +import logging import os -from collections import Counter import shutil -import glob +from collections import Counter +from lm_eval.decontamination.archiver import Archive, TextReader from lm_eval.decontamination.janitor import Janitor, word_ngrams from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets -from lm_eval.decontamination.archiver import Archive, TextReader -import logging logger = logging.getLogger(__name__) @@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog): print("rebuild") rebuilt_ngrams = [] bucket_file_paths = glob.glob( - os.path.join(test_working_directory, "output", f"*.bkt.txt") + os.path.join(test_working_directory, "output", "*.bkt.txt") ) for bucket_file_path in bucket_file_paths: reader = TextReader(bucket_file_path) diff --git a/tests/tests_master/test_models.py b/tests/tests_master/test_models.py index 11ea5a8b46..e56dcaf8e4 100644 --- a/tests/tests_master/test_models.py +++ b/tests/tests_master/test_models.py @@ -2,12 +2,13 @@ import json import os import pickle -import pytest import unittest.mock as mock +import pytest +from openai import OpenAI + import lm_eval.models as models -from openai import OpenAI client = OpenAI() diff --git a/tests/tests_master/test_version_stable.py b/tests/tests_master/test_version_stable.py index 2eba83c6c6..34073d0a69 100644 --- a/tests/tests_master/test_version_stable.py +++ b/tests/tests_master/test_version_stable.py @@ -1,12 +1,14 @@ -import lm_eval.tasks as tasks -import lm_eval.models as models -import lm_eval.evaluator as evaluator +import collections +import hashlib +import json +import os import random + import pytest -import os -import json -import hashlib -import collections + +import lm_eval.evaluator as evaluator +import lm_eval.models as models +import lm_eval.tasks as tasks os.makedirs("tests/testdata", exist_ok=True) diff --git a/tests/utils.py b/tests/utils.py index 3555541e71..fbdbb6a7fb 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,8 
+1,8 @@ -from typing import List -from lm_eval.utils import load_yaml_config -from pathlib import Path -from typing import Union import os +from pathlib import Path +from typing import List, Union + +from lm_eval.utils import load_yaml_config # {{{CI}}}
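
Below is a minimal sketch of how the relinted tree can be checked locally once this patch is applied. It is illustrative only and not part of the diff: it assumes ruff is installed separately (the new dev extra above only pulls in pytest, pytest-cov, pytest-xdist, pre-commit and mypy) and that ruff reads the [tool.ruff] settings from pyproject.toml at the repository root.

    # Hypothetical local lint workflow, run from a checkout of the repository root.
    pip install -e '.[dev]'        # editable install plus the tooling listed in the new dev extra
    pre-commit install             # register the repository's pre-commit hooks for future commits
    pre-commit run --all-files     # run every configured hook across the whole tree
    pip install ruff               # ruff itself is assumed here; it is not pinned in the dev extra
    ruff check . --fix             # lint and apply safe fixes, including import sorting via the "I" rules

Under the per-file ignores added above, __init__.py files are exempt from the unused-import and import-sorting rules (F401/F402/F403 and I) and lm_eval/tasks/* from the type-comparison rule (E721), so ruff check should pass without modifying those paths.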