[Feature] Support downloading dataset from OpenMind #1792

Open — wants to merge 1 commit into base: main
17 changes: 17 additions & 0 deletions README.md
@@ -164,6 +164,23 @@ Then submit the evaluation task without downloading all the data to your local disk
humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
```

#### (Optional) Automatic Download with OpenMind

You can also use [OpenMind](https://modelers.cn/) to load datasets on demand.

Environment setup:

```bash
pip install openmind
export DATASET_SOURCE=OpenMind
```

Then submit the evaluation task without downloading all the data to your local disk. Available datasets include:

```bash
gsm8k
```
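
With the variable exported, the data is fetched from OpenMind the first time a loader asks for it. A minimal sketch of what that looks like in Python (assuming a standard OpenCompass install with this PR applied; the id `opencompass/gsm8k` comes from the mapping added below):

```python
from os import environ

# Select OpenMind as the download backend before any dataset is loaded.
environ['DATASET_SOURCE'] = 'OpenMind'

from opencompass.datasets.gsm8k import GSM8KDataset

# 'opencompass/gsm8k' is the abstract dataset id; get_data_path resolves it
# to the OpenMind repo 'OpenCompass/gsm8k' and the data downloads on demand.
dataset = GSM8KDataset.load(path='opencompass/gsm8k')
print(dataset)
```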

Some third-party features, like Humaneval and Llama, may require additional steps to work properly; for detailed steps, please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).

<p align="right"><a href="#top">🔝Back to top</a></p>
16 changes: 16 additions & 0 deletions README_zh-CN.md
@@ -158,6 +158,22 @@ export DATASET_SOURCE=ModelScope
humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
```

#### (Optional) Automatic Download with OpenMind

Alternatively, you can also use [OpenMind](https://modelers.cn/) to load datasets.

Environment setup:

```bash
pip install openmind
export DATASET_SOURCE=OpenMind
```

Once the environment is configured, you can submit the evaluation task directly without downloading all the data. Currently supported datasets include:

```bash
gsm8k
```
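
For example, an evaluation run picks up the OpenMind source from the environment; a hedged sketch (`run.py` and the `gsm8k_gen` config alias follow OpenCompass's standard CLI, and the exact flags are illustrative):

```python
import os
import subprocess

# Illustrative sketch: with DATASET_SOURCE=OpenMind in the environment,
# the gsm8k data is pulled from modelers.cn the first time the task runs.
env = {**os.environ, 'DATASET_SOURCE': 'OpenMind'}
subprocess.run(
    ['python', 'run.py', '--datasets', 'gsm8k_gen'],
    env=env,
    check=True,
)
```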

Some third-party features, such as Humaneval and Llama, may require additional steps to work properly; for detailed steps, please refer to the [Installation Guide](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html).

<p align="right"><a href="#top">🔝Back to top</a></p>
7 changes: 6 additions & 1 deletion opencompass/datasets/gsm8k.py
@@ -18,9 +18,14 @@ class GSM8KDataset(BaseDataset):
     @staticmethod
     def load(path):
         path = get_data_path(path)
-        if environ.get('DATASET_SOURCE') == 'ModelScope':
+        dataset_source = environ.get('DATASET_SOURCE', None)
+
+        if dataset_source == 'ModelScope':
             from modelscope import MsDataset
             dataset = MsDataset.load(dataset_name=path)
+        elif dataset_source == 'OpenMind':
+            from openmind.integrations.datasets import load_dataset
+            dataset = load_dataset(path)
         else:
             datasets = {}
             for split in ['train', 'test']:
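
Note that each hub SDK is imported inside its own branch, so `modelscope` and `openmind` stay optional dependencies that are only needed when `DATASET_SOURCE` selects them. A standalone sketch of the same pattern (illustrative, not the PR's code):

```python
from os import environ


def load_from_selected_hub(path: str):
    """Illustrative dispatch: import the hub SDK only inside the branch
    that uses it, so unselected backends never have to be installed."""
    dataset_source = environ.get('DATASET_SOURCE', None)
    if dataset_source == 'OpenMind':
        from openmind.integrations.datasets import load_dataset
        return load_dataset(path)
    raise ValueError(f'Unsupported DATASET_SOURCE: {dataset_source!r}')
```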
8 changes: 8 additions & 0 deletions opencompass/utils/datasets.py
@@ -37,6 +37,14 @@ def get_data_path(dataset_id: str, local_mode: bool = False):
         assert ms_id is not None, \
             f'{dataset_id} is not supported in ModelScope'
         return ms_id
+    elif dataset_source == 'OpenMind':
+        try:
+            om_id = DATASETS_MAPPING[dataset_id]['om_id']
+        except KeyError as ex:
+            raise KeyError(f'{dataset_id} is not supported in OpenMind.') from ex
+        assert om_id is not None, \
+            f'{dataset_id} is not supported in OpenMind'
+        return om_id
     elif dataset_source == 'HF':
         # TODO: HuggingFace mode is currently not supported!
         hf_id = DATASETS_MAPPING[dataset_id]['hf_id']
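
With `DATASET_SOURCE=OpenMind`, `get_data_path` now maps an abstract dataset id to its OpenMind repo id instead of a local path. A hedged usage sketch (assuming a standard OpenCompass install with this PR applied):

```python
from os import environ

from opencompass.utils.datasets import get_data_path

# Select OpenMind before resolving any dataset ids.
environ['DATASET_SOURCE'] = 'OpenMind'

# Resolves via DATASETS_MAPPING['opencompass/gsm8k']['om_id'].
print(get_data_path('opencompass/gsm8k'))  # -> 'OpenCompass/gsm8k'

# Ids without an om_id entry raise KeyError("... is not supported in OpenMind.")
```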
1 change: 1 addition & 0 deletions opencompass/utils/datasets_info.py
@@ -125,6 +125,7 @@
     "opencompass/gsm8k": {
         "ms_id": "opencompass/gsm8k",
         "hf_id": "opencompass/gsm8k",
+        "om_id": "OpenCompass/gsm8k",
         "local": "./data/gsm8k/",
     },
     # HellaSwag
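
Only gsm8k gets an `om_id` in this PR. Exposing another dataset through OpenMind should only require the same one-line addition to its entry; a hypothetical sketch (all ids below are illustrative, not part of this PR):

```python
from opencompass.utils.datasets_info import DATASETS_MAPPING

# Hypothetical registration, not part of this PR. In practice the entry would
# be added directly to opencompass/utils/datasets_info.py; an om_id is all
# get_data_path needs to resolve the dataset when DATASET_SOURCE=OpenMind.
DATASETS_MAPPING["opencompass/my_dataset"] = {
    "ms_id": None,                  # not mirrored on ModelScope
    "hf_id": None,                  # not mirrored on HuggingFace
    "om_id": "MyOrg/my_dataset",    # OpenMind repo id on modelers.cn
    "local": "./data/my_dataset/",  # local fallback path
}
```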
173 changes: 173 additions & 0 deletions tests/dataset/test_om_datasets.py
@@ -0,0 +1,173 @@
import random
import sys
import unittest
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from os import environ

from datasets import Dataset, DatasetDict
from mmengine.config import read_base
from tqdm import tqdm

warnings.filterwarnings('ignore', category=DeprecationWarning)


def reload_datasets():
modules_to_remove = [
module_name for module_name in sys.modules
if module_name.startswith('configs.datasets')
]

for module_name in modules_to_remove:
del sys.modules[module_name]

with read_base():
from configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets

return sum((v for k, v in locals().items() if k.endswith('_datasets')), [])


def load_datasets_conf(source):
environ['DATASET_SOURCE'] = source
datasets_conf = reload_datasets()
return datasets_conf


def load_datasets(source, conf):
environ['DATASET_SOURCE'] = source
if 'lang' in conf:
dataset = conf['type'].load(path=conf['path'], lang=conf['lang'])
return dataset

if 'setting_name' in conf:
dataset = conf['type'].load(path=conf['path'],
name=conf['name'],
setting_name=conf['setting_name'])
return dataset

if 'name' in conf:
dataset = conf['type'].load(path=conf['path'], name=conf['name'])
return dataset

try:
dataset = conf['type'].load(path=conf['path'])
except Exception as ex:
print(ex)
dataset = conf['type'].load(**conf)

return dataset


def clean_string(value):
"""Helper function to clean and normalize string data.

It strips leading and trailing whitespace and replaces multiple whitespace
characters with a single space.
"""
if isinstance(value, str):
return ' '.join(value.split())
return value


class TestingOmDatasets(unittest.TestCase):

def test_datasets(self):
        # Load the OpenMind and Local dataset configurations
om_datasets_conf = load_datasets_conf('OpenMind')
local_datasets_conf = load_datasets_conf('Local')

        # Initialize lists of successful and failed comparisons
successful_comparisons = []
failed_comparisons = []

def compare_datasets(om_conf, local_conf):
openmind_path_name = f"{om_conf.get('path')}/{om_conf.get('name', '')}\t{om_conf.get('lang', '')}"
local_path_name = f"{local_conf.get('path')}/{local_conf.get('name', '')}\t{local_conf.get('lang', '')}"
            # Assert that the dataset types match
assert om_conf['type'] == local_conf['type'], "Data types do not match"
print(openmind_path_name, local_path_name)
try:
om_dataset = load_datasets('OpenMind', om_conf)
local_dataset = load_datasets('Local', local_conf)
_check_data(om_dataset, local_dataset, sample_size=sample_size)
return 'success', f'{openmind_path_name} | {local_path_name}'
except Exception as exception:
print(exception)
return 'failure', f'{openmind_path_name} is not the same as {local_path_name}'

with ThreadPoolExecutor(thread) as executor:
futures = {
executor.submit(compare_datasets, om_conf, local_conf): (om_conf, local_conf)
for om_conf, local_conf in zip(om_datasets_conf, local_datasets_conf)
}

for future in tqdm(as_completed(futures), total=len(futures)):
result, message = future.result()
if result == 'success':
successful_comparisons.append(message)
else:
failed_comparisons.append(message)

        # Print the test summary
total_datasets = len(om_datasets_conf)
print(f"All {total_datasets} datasets")
print(f"OK {len(successful_comparisons)} datasets")
for success in successful_comparisons:
print(f" {success}")
print(f"Fail {len(failed_comparisons)} datasets")
for failure in failed_comparisons:
print(f" {failure}")


def _check_data(om_dataset: Dataset | DatasetDict,
oc_dataset: Dataset | DatasetDict,
sample_size):
assert type(om_dataset) == type(
oc_dataset
), f'Dataset type not match: {type(om_dataset)} != {type(oc_dataset)}'

# match DatasetDict
if isinstance(oc_dataset, DatasetDict):
assert om_dataset.keys() == oc_dataset.keys(
), f'DatasetDict not match: {om_dataset.keys()} != {oc_dataset.keys()}'

for key in om_dataset.keys():
_check_data(om_dataset[key], oc_dataset[key], sample_size=sample_size)

elif isinstance(oc_dataset, Dataset):
# match by cols
assert set(om_dataset.column_names) == set(
oc_dataset.column_names
), f'Column names do not match: {om_dataset.column_names} != {oc_dataset.column_names}'

# Check that the number of rows is the same
assert len(om_dataset) == len(
oc_dataset
), f'Number of rows do not match: {len(om_dataset)} != {len(oc_dataset)}'

# Randomly sample indices
sample_indices = random.sample(range(len(om_dataset)),
min(sample_size, len(om_dataset)))

for i, idx in enumerate(sample_indices):
for col in om_dataset.column_names:
om_value = clean_string(str(om_dataset[col][idx]))
oc_value = clean_string(str(oc_dataset[col][idx]))
try:
assert om_value == oc_value, f"Value mismatch in column '{col}', index {idx}: {om_value} != {oc_value}"
except AssertionError as e:
print(f"Assertion failed for column '{col}', index {idx}")
print(f"om_data: {om_dataset[idx]}")
print(f'oc_data: {oc_dataset[idx]}')
print(f'om_value: {om_value} ({type(om_value)})')
print(f'oc_value: {oc_value} ({type(oc_value)})')
raise e
else:
raise ValueError(f'Datasets type not supported {type(om_dataset)}')


if __name__ == '__main__':
    sample_size = 100  # rows sampled from each dataset when comparing values
    thread = 1  # number of ThreadPoolExecutor workers
unittest.main()
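
Because `sample_size` and `thread` are module-level globals set under `__main__`, the script is meant to be run directly rather than imported by a test runner. As a quick illustration of the comparison semantics, a toy check of `_check_data` with hypothetical in-memory data:

```python
from datasets import Dataset

# Whitespace differences are normalized away by clean_string, so these two
# hypothetical datasets compare equal under _check_data.
a = Dataset.from_dict({'question': ['1 + 1 = ?'], 'answer': ['2']})
b = Dataset.from_dict({'question': [' 1 + 1   = ?'], 'answer': ['2']})
_check_data(a, b, sample_size=1)  # passes silently
```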