diff --git a/README.md b/README.md
index ed50e5e524..835f7472cc 100644
--- a/README.md
+++ b/README.md
@@ -454,7 +454,6 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
 | sentencepiece | For using the sentencepiece tokenizer |
 | sparseml | For using NM's SparseML models |
 | testing | For running library test suite |
-| unitxt | For IBM's unitxt dataset tasks |
 | vllm | For loading models with vLLM |
 | zeno | For visualizing results with Zeno |
 |---------------|---------------------------------------|
diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md
index 23f8dd81ca..e9bd2becb2 100644
--- a/docs/new_task_guide.md
+++ b/docs/new_task_guide.md
@@ -403,6 +403,17 @@ task: ...
 ```
 
+You can also pass custom arguments to your class by accepting `config` in its constructor; the task's full YAML configuration, including any custom keys, is passed through it.
+For example:
+
+```yaml
+task: 20_newsgroups
+class: !function task.Unitxt
+recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title
+```
+
+In this example, `recipe` is the custom argument picked up by the `Unitxt` class's constructor.
+
 ## Beautifying Table Display
 
 To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.
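For orientation: the only contract the documentation change above relies on is that the custom class accepts `config` in its constructor; the loader (see the `lm_eval/tasks/__init__.py` hunk below) passes the task's full YAML dict through it. A minimal sketch of such a class, with a hypothetical `MyTask` (not part of this PR), mirroring how the PR's own task classes call `super().__init__`:

```python
from typing import Optional

from lm_eval.api.task import ConfigurableTask


class MyTask(ConfigurableTask):
    VERSION = 0

    def __init__(self, config: Optional[dict] = None) -> None:
        config = config or {}
        # Custom YAML keys (here: `recipe`) arrive in `config`,
        # alongside the standard task fields such as `task` and `class`.
        self.recipe = config.get("recipe")
        super().__init__(config={"metadata": {"version": self.VERSION}})
```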
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 52c280b7a5..b567a3d4b9 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -1,4 +1,5 @@
 import collections
+import inspect
 import logging
 import os
 from functools import partial
@@ -151,6 +152,14 @@ def _process_alias(self, config, group=None):
             config["group_alias"] = None
         return config
 
+    def _class_has_config_in_constructor(self, cls):
+        constructor = getattr(cls, "__init__", None)
+        return (
+            "config" in inspect.signature(constructor).parameters
+            if constructor
+            else False
+        )
+
     def _load_individual_task_or_group(
         self,
         name_or_config: Optional[Union[str, dict]] = None,
@@ -168,13 +177,13 @@ def _load_task(config, task):
                 **config,
             }
             if self._config_is_python_task(config):
-                task_object = (
-                    config["class"](config=config)
-                    if issubclass(config["class"], ConfigurableTask)
-                    else config["class"]()
-                )
-                # very scuffed: set task name here. TODO: fixme?
-                task_object.config.task = config["task"]
+                if self._class_has_config_in_constructor(config["class"]):
+                    task_object = config["class"](config=config)
+                else:
+                    task_object = config["class"]()
+                if isinstance(task_object, ConfigurableTask):
+                    # very scuffed: set task name here. TODO: fixme?
+                    task_object.config.task = config["task"]
             else:
                 task_object = ConfigurableTask(config=config)
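The dispatch above hinges on `_class_has_config_in_constructor`: a class receives the YAML config only when its `__init__` declares a `config` parameter, which is why the `_SCROLLSTask` and `SQuAD2` constructors below gain a `config=None` argument. A standalone sketch of the same introspection, with hypothetical classes for illustration:

```python
import inspect


def class_has_config_in_constructor(cls) -> bool:
    # True iff cls.__init__ declares a parameter named `config`.
    constructor = getattr(cls, "__init__", None)
    return (
        "config" in inspect.signature(constructor).parameters
        if constructor
        else False
    )


class WithConfig:
    def __init__(self, config=None):
        self.config = config


class WithoutConfig:
    def __init__(self):
        pass


assert class_has_config_in_constructor(WithConfig)
assert not class_has_config_in_constructor(WithoutConfig)
```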
diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py
index 2940eb2e39..45656be3e9 100644
--- a/lm_eval/tasks/scrolls/task.py
+++ b/lm_eval/tasks/scrolls/task.py
@@ -116,7 +116,7 @@ class _SCROLLSTask(ConfigurableTask):
     PRUNE_MAX_TOKENS = None
     PRUNE_NUM_PROC = None
 
-    def __init__(self):
+    def __init__(self, config=None):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         if self.DATASET_NAME is not None:
             self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
diff --git a/lm_eval/tasks/squadv2/task.py b/lm_eval/tasks/squadv2/task.py
index 4c3d7f6fa9..32c44c6022 100644
--- a/lm_eval/tasks/squadv2/task.py
+++ b/lm_eval/tasks/squadv2/task.py
@@ -52,7 +52,7 @@ class SQuAD2(ConfigurableTask):
     DATASET_PATH = "squad_v2"
     DATASET_NAME = None
 
-    def __init__(self):
+    def __init__(self, config=None):
         super().__init__(config={"metadata": {"version": self.VERSION}})
 
         # HF changed squad on us so we have to make sure we aren't running the old one
diff --git a/lm_eval/tasks/unitxt/20_newsgroups.yaml b/lm_eval/tasks/unitxt/20_newsgroups.yaml
index 1a5af68a31..f2444bd24f 100644
--- a/lm_eval/tasks/unitxt/20_newsgroups.yaml
+++ b/lm_eval/tasks/unitxt/20_newsgroups.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: 20_newsgroups
-dataset_name: card=cards.20_newsgroups,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/ag_news.yaml b/lm_eval/tasks/unitxt/ag_news.yaml
index 32914bda93..792ce0b4b4 100644
--- a/lm_eval/tasks/unitxt/ag_news.yaml
+++ b/lm_eval/tasks/unitxt/ag_news.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: ag_news
-dataset_name: card=cards.ag_news,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.ag_news,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/argument_topic.yaml b/lm_eval/tasks/unitxt/argument_topic.yaml
index c333e194d6..d04810cd49 100644
--- a/lm_eval/tasks/unitxt/argument_topic.yaml
+++ b/lm_eval/tasks/unitxt/argument_topic.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: argument_topic
-dataset_name: card=cards.argument_topic,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.argument_topic,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/atis.yaml b/lm_eval/tasks/unitxt/atis.yaml
index 1e7979be2f..e9a26697ac 100644
--- a/lm_eval/tasks/unitxt/atis.yaml
+++ b/lm_eval/tasks/unitxt/atis.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.span_labeling.extraction
 task: atis
-dataset_name: card=cards.atis,template=templates.span_labeling.extraction.title
+include: unitxt
+recipe: card=cards.atis,template=templates.span_labeling.extraction.title
diff --git a/lm_eval/tasks/unitxt/banking77.yaml b/lm_eval/tasks/unitxt/banking77.yaml
index a888e6a493..6475575dd8 100644
--- a/lm_eval/tasks/unitxt/banking77.yaml
+++ b/lm_eval/tasks/unitxt/banking77.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: banking77
-dataset_name: card=cards.banking77,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.banking77,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/claim_stance_topic.yaml b/lm_eval/tasks/unitxt/claim_stance_topic.yaml
index 5f8a6133f6..2a2469d5ff 100644
--- a/lm_eval/tasks/unitxt/claim_stance_topic.yaml
+++ b/lm_eval/tasks/unitxt/claim_stance_topic.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: claim_stance_topic
-dataset_name: card=cards.claim_stance_topic,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.claim_stance_topic,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/cnn_dailymail.yaml b/lm_eval/tasks/unitxt/cnn_dailymail.yaml
index f3f26cd530..aa3748c806 100644
--- a/lm_eval/tasks/unitxt/cnn_dailymail.yaml
+++ b/lm_eval/tasks/unitxt/cnn_dailymail.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.summarization.abstractive
 task: cnn_dailymail
-dataset_name: card=cards.cnn_dailymail,template=templates.summarization.abstractive.full
+include: unitxt
+recipe: card=cards.cnn_dailymail,template=templates.summarization.abstractive.full
diff --git a/lm_eval/tasks/unitxt/coedit_gec.yaml b/lm_eval/tasks/unitxt/coedit_gec.yaml
index 619be722bf..4959064696 100644
--- a/lm_eval/tasks/unitxt/coedit_gec.yaml
+++ b/lm_eval/tasks/unitxt/coedit_gec.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.grammatical_error_correction
 task: coedit_gec
-dataset_name: card=cards.coedit_gec,template=templates.grammatical_error_correction.simple
+include: unitxt
+recipe: card=cards.coedit_gec,template=templates.grammatical_error_correction.simple
diff --git a/lm_eval/tasks/unitxt/dbpedia_14.yaml b/lm_eval/tasks/unitxt/dbpedia_14.yaml
index 6ef2df7168..b26d65a72b 100644
--- a/lm_eval/tasks/unitxt/dbpedia_14.yaml
+++ b/lm_eval/tasks/unitxt/dbpedia_14.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: dbpedia_14
-dataset_name: card=cards.dbpedia_14,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.dbpedia_14,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/ethos_binary.yaml b/lm_eval/tasks/unitxt/ethos_binary.yaml
index 8a3b167016..3976de43ac 100644
--- a/lm_eval/tasks/unitxt/ethos_binary.yaml
+++ b/lm_eval/tasks/unitxt/ethos_binary.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: ethos_binary
-dataset_name: card=cards.ethos_binary,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.ethos_binary,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/financial_tweets.yaml b/lm_eval/tasks/unitxt/financial_tweets.yaml
index 8d804e7adb..7b4bb9e538 100644
--- a/lm_eval/tasks/unitxt/financial_tweets.yaml
+++ b/lm_eval/tasks/unitxt/financial_tweets.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_class
 task: financial_tweets
-dataset_name: card=cards.financial_tweets,template=templates.classification.multi_class.title
+include: unitxt
+recipe: card=cards.financial_tweets,template=templates.classification.multi_class.title
diff --git a/lm_eval/tasks/unitxt/generate_yamls.py b/lm_eval/tasks/unitxt/generate_yamls.py
deleted file mode 100644
index b1b9c004bb..0000000000
--- a/lm_eval/tasks/unitxt/generate_yamls.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#
-# This file generates a set of LM eval harness yaml file
-# that load unitxt datasets (https://github.com/IBM/unitxt)
-#
-
-import unitxt_wrapper
-import yaml
-from unitxt.artifact import fetch_artifact
-from unitxt.standard import StandardRecipe
-
-
-# This code is required to properly dump LM harness YAML that contains references to functions
-def function_representer(dumper: yaml.SafeDumper, func) -> yaml.nodes.MappingNode:
-    return dumper.represent_scalar(
-        "!function", f"{func.__module__}.{func.__name__}", style=None
-    )
-
-
-def write_task_yaml(filename, data):
-    yaml.add_representer(type(data["process_results"]), function_representer)
-    with open(filename, "w") as stream:
-        yaml.dump(data, stream, sort_keys=False)
-
-
-def write_card_yaml(filename, data):
-    with open(filename, "w") as stream:
-        yaml.dump(data, stream, sort_keys=False)
-
-
-default_template_per_task = {
-    "tasks.classification.multi_label": "templates.classification.multi_label.title",
-    "tasks.classification.multi_class": "templates.classification.multi_class.title",
-    "tasks.summarization.abstractive": "templates.summarization.abstractive.full",
-    "tasks.regression.two_texts": "templates.regression.two_texts.simple",
-    "tasks.qa.with_context.extractive": "templates.qa.with_context.simple",
-    "tasks.grammatical_error_correction": "templates.grammatical_error_correction.simple",
-    "tasks.span_labeling.extraction": "templates.span_labeling.extraction.title",
-}
-
-
-def generate_task_yaml(task: str):
-    """
-    Generate an LM Eval Harness YAML file based on a Unitxt task defintion.
-    The output YAML is based on 'template.yaml.file' found in current directoy.
-
-    The common template is filled the the specific metrics for the task.
-    It still leaves the 'dataset_name' and 'task name' unspecified.
-    """
-    print("*" * 80)
-    print("*")
-    print(f"* Generating YAML base file for task {task}")
-    print("*")
-    task_definition, _ = fetch_artifact(task)
-    data = {
-        "group": ["unitxt"],
-        "dataset_path": "unitxt/data",
-        "output_type": "generate_until",
-        "training_split": "train",
-        "validation_split": "test",
-        "doc_to_text": "{{source}}",
-        "doc_to_target": "target",
-        "process_results": unitxt_wrapper.process_results,
-        "generation_kwargs": {"until": ["</s>"]},
-        "metric_list": [],
-        "metadata": {"verison": 1.0},
-    }
-
-    for metric_name in task_definition.metrics:
-        new_metric = {"metric": "", "aggregation": "unitxt", "higher_is_better": True}
-        new_metric["metric"] = metric_name.replace("metrics.", "unitxt_")
-        data["metric_list"].append(new_metric)
-
-    write_task_yaml(f"unitxt_{task}", data)
-
-
-def generate_card_yaml(card: str):
-    """
-    Generate an LM Eval Harness YAML file based on the Unitxt dataset card.
-    It includes the task YAML for the dataset, and overrides the 'dataset_name' and 'task' with the card.
    """
- """ - - print("*" * 80) - print("*") - print(f"* Generating YAML file for unitxt dataset {card}") - print("*") - - card_definition, _ = fetch_artifact(f"cards.{card}") - task = card_definition.task.__id__ - if task in default_template_per_task: - template = default_template_per_task[task] - else: - raise ValueError( - f"Default template was not defined for task {task} in 'default_template_per_task' dict in generate_yamls.py" - ) - data = {} - data["include"] = f"unitxt_{task}" - data["task"] = card - data["dataset_name"] = f"card=cards.{card},template={template}" - # This is faster that the load_dataset approach - # dataset = load_dataset('unitxt/data', data["dataset_name"]+",loader_limit=100",trust_remote_code=True) - recipe = StandardRecipe(card=f"cards.{card}", template=template, loader_limit=100) - stream = recipe() - dataset = stream.to_dataset() - print(dataset) - print("Sample input:") - print(dataset["test"][0]["source"]) - print("Sample output:") - print(dataset["test"][0]["target"]) - write_card_yaml(f"{card}.yaml", data) - - -def main(): - for task in default_template_per_task.keys(): - try: - generate_task_yaml(task) - except Exception as e: - print(f"Unable to generate YAML for {task} due to:") - print(e) - raise (e) - with open("unitxt_datasets") as f: - for unitxt_dataset in f: - unitxt_dataset = unitxt_dataset.strip() - if unitxt_dataset.startswith("### END ###"): - exit(0) - if not unitxt_dataset.startswith("#"): - try: - generate_card_yaml(unitxt_dataset) - except Exception as e: - print(f"Unable to generate YAML for {unitxt_dataset} due to:") - print(e) - raise e - - -if __name__ == "__main__": - main() diff --git a/lm_eval/tasks/unitxt/law_stack_exchange.yaml b/lm_eval/tasks/unitxt/law_stack_exchange.yaml index 780af103c8..d0c589a3d6 100644 --- a/lm_eval/tasks/unitxt/law_stack_exchange.yaml +++ b/lm_eval/tasks/unitxt/law_stack_exchange.yaml @@ -1,3 +1,3 @@ -include: unitxt_tasks.classification.multi_class task: law_stack_exchange -dataset_name: card=cards.law_stack_exchange,template=templates.classification.multi_class.title +include: unitxt +recipe: card=cards.law_stack_exchange,template=templates.classification.multi_class.title diff --git a/lm_eval/tasks/unitxt/ledgar.yaml b/lm_eval/tasks/unitxt/ledgar.yaml index 6d93ef9d28..1c31589764 100644 --- a/lm_eval/tasks/unitxt/ledgar.yaml +++ b/lm_eval/tasks/unitxt/ledgar.yaml @@ -1,3 +1,3 @@ -include: unitxt_tasks.classification.multi_class task: ledgar -dataset_name: card=cards.ledgar,template=templates.classification.multi_class.title +include: unitxt +recipe: card=cards.ledgar,template=templates.classification.multi_class.title diff --git a/lm_eval/tasks/unitxt/medical_abstracts.yaml b/lm_eval/tasks/unitxt/medical_abstracts.yaml index dd4e87b688..74cfef0b68 100644 --- a/lm_eval/tasks/unitxt/medical_abstracts.yaml +++ b/lm_eval/tasks/unitxt/medical_abstracts.yaml @@ -1,3 +1,3 @@ -include: unitxt_tasks.classification.multi_class task: medical_abstracts -dataset_name: card=cards.medical_abstracts,template=templates.classification.multi_class.title +include: unitxt +recipe: card=cards.medical_abstracts,template=templates.classification.multi_class.title diff --git a/lm_eval/tasks/unitxt/stsb.yaml b/lm_eval/tasks/unitxt/stsb.yaml index 44e78c5848..8d91b0e13c 100644 --- a/lm_eval/tasks/unitxt/stsb.yaml +++ b/lm_eval/tasks/unitxt/stsb.yaml @@ -1,3 +1,3 @@ -include: unitxt_tasks.regression.two_texts task: stsb -dataset_name: card=cards.stsb,template=templates.regression.two_texts.simple +include: unitxt +recipe: 
diff --git a/lm_eval/tasks/unitxt/task.py b/lm_eval/tasks/unitxt/task.py
new file mode 100644
index 0000000000..339a3076c5
--- /dev/null
+++ b/lm_eval/tasks/unitxt/task.py
@@ -0,0 +1,142 @@
+"""
+In the dynamic landscape of generative NLP, traditional text processing pipelines limit research flexibility and reproducibility, as they are tailored to specific dataset, task, and model combinations. The escalating complexity, involving system prompts, model-specific formats, instructions, and more, calls for a shift to a structured, modular, and customizable solution.
+
+Addressing this need, we present Unitxt, an innovative library for customizable textual data preparation and evaluation tailored to generative language models. Unitxt natively integrates with common libraries like HuggingFace and LM-eval-harness and deconstructs processing flows into modular components, enabling easy customization and sharing between practitioners. These components encompass model-specific formats, task prompts, and many other comprehensive dataset processing definitions. The Unitxt-Catalog centralizes these components, fostering collaboration and exploration in modern textual data workflows. Beyond being a tool, Unitxt is a community-driven platform, empowering users to build, share, and advance their pipelines collaboratively.
+"""
+
+from functools import partial
+from typing import Optional
+
+import evaluate
+
+from lm_eval.api.instance import Instance
+from lm_eval.api.task import ConfigurableTask
+
+
+_CITATION = """
+@misc{bandel2024unitxt,
+  title={Unitxt: Flexible, Shareable and Reusable Data Preparation and Evaluation for Generative AI},
+  author={Elron Bandel and Yotam Perlitz and Elad Venezian and Roni Friedman-Melamed and Ofir Arviv and Matan Orbach and Shachar Don-Yehyia and Dafna Sheinwald and Ariel Gera and Leshem Choshen and Michal Shmueli-Scheuer and Yoav Katz},
+  year={2024},
+  eprint={2401.14019},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+"""
+
+
+def score(items, metric):
+    predictions, references = zip(*items)
+    evaluator = evaluate.load("unitxt/metric")
+    for reference in references:
+        reference["metrics"] = [metric]
+    results = evaluator.compute(predictions=predictions, references=references)
+    return results[0]["score"]["global"]["score"]
+
+
+class Unitxt(ConfigurableTask):
+    VERSION = 0
+
+    def __init__(
+        self,
+        config: Optional[dict] = None,
+    ) -> None:
+        assert config is not None and "recipe" in config, (
+            "Unitxt task must have a 'recipe' string."
+        )
+        super().__init__(
+            config={
+                "metadata": {"version": self.VERSION},
+                "dataset_kwargs": {"trust_remote_code": True},
+                "dataset_name": config["recipe"],
+                "dataset_path": "unitxt/data",
+            }
+        )
+        self.metrics = self.dataset["test"][0]["metrics"]
+
+    def has_training_docs(self):
+        return "train" in self.dataset
+
+    def has_validation_docs(self):
+        return "validation" in self.dataset
+
+    def has_test_docs(self):
+        return "test" in self.dataset
+
+    def training_docs(self):
+        return self.dataset["train"]
+
+    def validation_docs(self):
+        return self.dataset["validation"]
+
+    def test_docs(self):
+        return self.dataset["test"]
+
+    def doc_to_text(self, doc):
+        return doc["source"]
+
+    def should_decontaminate(self):
+        return False
+
+    def doc_to_target(self, doc):
+        return doc["target"]
+
+    def construct_requests(self, doc, ctx, **kwargs):
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+
+        return [
+            Instance(
+                request_type="generate_until",
+                doc=doc,
+                arguments=(ctx, {"until": ["\n"]}),
+                idx=0,
+                **kwargs,
+            )
+        ]
+
+    def process_results(self, doc, results):
+        """Take a single document and the LM results and evaluates, returning a
+        dict where keys are the names of submetrics and values are the values of
+        the metric for that one document
+
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param results:
+            The results of the requests created in construct_requests.
+        """
+
+        continuation = results[0]
+
+        predictions = continuation
+
+        references = doc
+        return {
+            metric.replace("metrics.", ""): (predictions, references)
+            for metric in self.metrics
+        }
+
+    def aggregation(self):
+        """
+        :returns: {str: [float] -> float}
+            A dictionary where keys are the names of submetrics and values are
+            functions that aggregate a list of metrics
+        """
+        return {
+            metric.replace("metrics.", ""): partial(score, metric=metric)
+            for metric in self.metrics
+        }
+
+    def higher_is_better(self):
+        """
+        :returns: {str: bool}
+            A dictionary where keys are the names of submetrics and values are
+            whether a higher value of the submetric is better
+        """
+        return {metric.replace("metrics.", ""): True for metric in self.metrics}
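Note the division of labor in the class above: `process_results` does no scoring of its own; per document it returns the raw `(prediction, reference)` pair under each metric name, while `aggregation` binds each metric to `partial(score, metric=...)`, so all pairs are scored in one batched call to the Unitxt evaluator. A toy sketch of that pattern with a hypothetical exact-match scorer (stubbed; the real `score` above delegates to `evaluate.load("unitxt/metric")`):

```python
from functools import partial


def process_results(doc: dict, results: list) -> dict:
    # Per-document: defer scoring; emit the raw (prediction, reference) pair.
    return {"exact_match": (results[0], doc)}


def score(items, metric):
    # Corpus-level: receives every (prediction, reference) pair at once.
    predictions, references = zip(*items)
    return sum(p == r["target"] for p, r in zip(predictions, references)) / len(items)


aggregation = {"exact_match": partial(score, metric="exact_match")}
```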
diff --git a/lm_eval/tasks/unitxt/unfair_tos.yaml b/lm_eval/tasks/unitxt/unfair_tos.yaml
index 2c52c31655..b401dfeff4 100644
--- a/lm_eval/tasks/unitxt/unfair_tos.yaml
+++ b/lm_eval/tasks/unitxt/unfair_tos.yaml
@@ -1,3 +1,3 @@
-include: unitxt_tasks.classification.multi_label
 task: unfair_tos
-dataset_name: card=cards.unfair_tos,template=templates.classification.multi_label.title
+include: unitxt
+recipe: card=cards.unfair_tos,template=templates.classification.multi_label.title
diff --git a/lm_eval/tasks/unitxt/unitxt b/lm_eval/tasks/unitxt/unitxt
new file mode 100644
index 0000000000..e6902c46d4
--- /dev/null
+++ b/lm_eval/tasks/unitxt/unitxt
@@ -0,0 +1 @@
+class: !function task.Unitxt
diff --git a/lm_eval/tasks/unitxt/unitxt_datasets b/lm_eval/tasks/unitxt/unitxt_datasets
deleted file mode 100644
index cdafa0cf25..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_datasets
+++ /dev/null
@@ -1,18 +0,0 @@
-coedit_gec
-atis
-20_newsgroups
-ag_news
-argument_topic
-banking77
-claim_stance_topic
-cnn_dailymail
-dbpedia_14
-ethos_binary
-financial_tweets
-law_stack_exchange
-ledgar
-medical_abstracts
-stsb
-unfair_tos
-xsum
-yahoo_answers_topics
diff --git a/lm_eval/tasks/unitxt/unitxt_tasks.classification.multi_class b/lm_eval/tasks/unitxt/unitxt_tasks.classification.multi_class
deleted file mode 100644
index 7e248b834e..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_tasks.classification.multi_class
+++ /dev/null
@@ -1,24 +0,0 @@
-group:
-- unitxt
-dataset_path: unitxt/data
-output_type: generate_until
-training_split: train
-validation_split: test
-doc_to_text: '{{source}}'
-doc_to_target: target
-process_results: !function 'unitxt_wrapper.process_results'
-generation_kwargs:
-  until:
-  - </s>
-metric_list:
-- metric: unitxt_f1_micro
-  aggregation: unitxt
-  higher_is_better: true
-- metric: unitxt_accuracy
-  aggregation: unitxt
-  higher_is_better: true
-- metric: unitxt_f1_macro
-  aggregation: unitxt
-  higher_is_better: true
-metadata:
-  verison: 1.0
diff --git a/lm_eval/tasks/unitxt/unitxt_tasks.classification.multi_label b/lm_eval/tasks/unitxt/unitxt_tasks.classification.multi_label
deleted file mode 100644
index 871ebda730..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_tasks.classification.multi_label
+++ /dev/null
@@ -1,24 +0,0 @@
-group:
-- unitxt
-dataset_path: unitxt/data
-output_type: generate_until
-training_split: train
-validation_split: test
-doc_to_text: '{{source}}'
-doc_to_target: target
-process_results: !function 'unitxt_wrapper.process_results'
-generation_kwargs:
-  until:
-  - </s>
-metric_list:
-- metric: unitxt_f1_micro_multi_label
-  aggregation: unitxt
-  higher_is_better: true
-- metric: unitxt_accuracy
-  aggregation: unitxt
-  higher_is_better: true
-- metric: unitxt_f1_macro_multi_label
-  aggregation: unitxt
-  higher_is_better: true
-metadata:
-  verison: 1.0
diff --git a/lm_eval/tasks/unitxt/unitxt_tasks.grammatical_error_correction b/lm_eval/tasks/unitxt/unitxt_tasks.grammatical_error_correction
deleted file mode 100644
index 71dbfdda55..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_tasks.grammatical_error_correction
+++ /dev/null
@@ -1,24 +0,0 @@
-group:
-- unitxt
-dataset_path: unitxt/data
-output_type: generate_until
-training_split: train
-validation_split: test
-doc_to_text: '{{source}}'
-doc_to_target: target
-process_results: !function 'unitxt_wrapper.process_results'
-generation_kwargs:
-  until:
-  - </s>
-metric_list:
-- metric: unitxt_char_edit_dist_accuracy
-  aggregation: unitxt
-  higher_is_better: true
-- metric: unitxt_rouge
-  aggregation: unitxt
-  higher_is_better: true
-- metric: unitxt_char_edit_distance[reference_field=original_text]
-  aggregation: unitxt
-  higher_is_better: true
-metadata:
-  verison: 1.0
diff --git a/lm_eval/tasks/unitxt/unitxt_tasks.qa.with_context.extractive b/lm_eval/tasks/unitxt/unitxt_tasks.qa.with_context.extractive
deleted file mode 100644
index 72ecb3134a..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_tasks.qa.with_context.extractive
+++ /dev/null
@@ -1,18 +0,0 @@
-group:
-- unitxt
-dataset_path: unitxt/data
-output_type: generate_until
-training_split: train
-validation_split: test
-doc_to_text: '{{source}}'
-doc_to_target: target
-process_results: !function 'unitxt_wrapper.process_results'
-generation_kwargs:
-  until:
-  - </s>
-metric_list:
-- metric: unitxt_squad
-  aggregation: unitxt
-  higher_is_better: true
-metadata:
-  verison: 1.0
diff --git a/lm_eval/tasks/unitxt/unitxt_tasks.regression.two_texts b/lm_eval/tasks/unitxt/unitxt_tasks.regression.two_texts
deleted file mode 100644
index 6db5d58764..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_tasks.regression.two_texts
+++ /dev/null
@@ -1,18 +0,0 @@
-group:
-- unitxt
-dataset_path: unitxt/data
-output_type: generate_until
-training_split: train
-validation_split: test
-doc_to_text: '{{source}}'
-doc_to_target: target
-process_results: !function 'unitxt_wrapper.process_results'
-generation_kwargs:
-  until:
-  - </s>
-metric_list:
-- metric: unitxt_spearman
-  aggregation: unitxt
-  higher_is_better: true
-metadata:
-  verison: 1.0
diff --git a/lm_eval/tasks/unitxt/unitxt_tasks.span_labeling.extraction b/lm_eval/tasks/unitxt/unitxt_tasks.span_labeling.extraction
deleted file mode 100644
index 60a5e74a40..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_tasks.span_labeling.extraction
+++ /dev/null
@@ -1,18 +0,0 @@
-group:
-- unitxt
-dataset_path: unitxt/data
-output_type: generate_until
-training_split: train
-validation_split: test
-doc_to_text: '{{source}}'
-doc_to_target: target
-process_results: !function 'unitxt_wrapper.process_results'
-generation_kwargs:
-  until:
-  - </s>
-metric_list:
-- metric: unitxt_ner
-  aggregation: unitxt
-  higher_is_better: true
-metadata:
-  verison: 1.0
diff --git a/lm_eval/tasks/unitxt/unitxt_tasks.summarization.abstractive b/lm_eval/tasks/unitxt/unitxt_tasks.summarization.abstractive
deleted file mode 100644
index 4b5d97f942..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_tasks.summarization.abstractive
+++ /dev/null
@@ -1,18 +0,0 @@
-group:
-- unitxt
-dataset_path: unitxt/data
-output_type: generate_until
-training_split: train
-validation_split: test
-doc_to_text: '{{source}}'
-doc_to_target: target
-process_results: !function 'unitxt_wrapper.process_results'
-generation_kwargs:
-  until:
-  - </s>
-metric_list:
-- metric: unitxt_rouge
-  aggregation: unitxt
-  higher_is_better: true
-metadata:
-  verison: 1.0
diff --git a/lm_eval/tasks/unitxt/unitxt_wrapper.py b/lm_eval/tasks/unitxt/unitxt_wrapper.py
deleted file mode 100644
index cfb2ec3830..0000000000
--- a/lm_eval/tasks/unitxt/unitxt_wrapper.py
+++ /dev/null
@@ -1,46 +0,0 @@
-try:
-    from unitxt import evaluate
-except ImportError:
-    raise ImportError(
-        "Package 'unitxt' is not installed. To install it, use `pip install 'lm_eval[unitxt]'`"
-    )
-
-from lm_eval.api.registry import AGGREGATION_REGISTRY, METRIC_REGISTRY, register_metric
-
-
-def unitxt_agg_metric(items):
-    preds = [pred[0] for pred, _, _ in items]
-    refs = [ref for _, ref, _ in items]
-    metric_name = items[0][2].replace("unitxt_", "metrics.")
-    for ref in refs:
-        ref["metrics"] = [metric_name]
-
-    result_metrics = evaluate(preds, refs)
-    return result_metrics[0]["score"]["global"]["score"]
-
-
-AGGREGATION_REGISTRY["unitxt"] = unitxt_agg_metric
-
-
-def unitxt_metric(items):  # This is a passthrough function
-    return items
-
-
-def process_results(doc, results):
-    metrics = doc["metrics"]
-    scores = {}
-    for metric in metrics:
-        metric = metric.replace("metrics.", "unitxt_")
-        scores[metric] = (results, doc, metric)
-
-        if metric not in METRIC_REGISTRY:
-            register_metric(
-                metric=metric,
-                higher_is_better=True,
-                output_type="generate_until",
-                aggregation="unitxt",
-            )(unitxt_metric)
-    return scores
-
-
-#
"lm_eval[vllm]", "lm_eval[zeno]", "lm_eval[wandb]", - "lm_eval[unitxt]" ] [tool.ruff.lint]