forked from huggingface/datasets
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b85ae23
commit a6430ef
Showing
3 changed files
with
151 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
# coding=utf-8 | ||
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# Lint as: python3 | ||
"""CRD3 dataset""" | ||
|
||
from __future__ import absolute_import, division, print_function | ||
import logging | ||
|
||
import json | ||
import os | ||
|
||
import nlp | ||
|
||
|
||
_CITATION = """ | ||
@inproceedings{ | ||
title = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset}, | ||
author = {Rameshkumar, Revanth and Bailey, Peter}, | ||
year = {2020}, | ||
publisher = {Association for Computational Linguistics}, | ||
conference = {ACL} | ||
} | ||
""" | ||
|
||
_DESCRIPTION = """ | ||
Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset. | ||
Critical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. | ||
The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding | ||
abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player | ||
collaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, | ||
and semantic ties to the previous dialogues. | ||
""" | ||
|
||
_URL = "https://github.com/RevanthRameshkumar/CRD3/archive/master.zip" | ||
|
||
def get_train_test_dev_files(files, test_split, train_split, dev_split):
    """Partition ``files`` into test, train and dev lists by file-name prefix.

    The prefix of a file is the part of its base name that precedes the first
    underscore. A file goes to the first split whose container holds that
    prefix, checked in the order test, train, dev. Files whose prefix is in
    none of the containers are skipped and reported at INFO level.

    Args:
        files: Iterable of file paths to distribute.
        test_split: Container of prefixes belonging to the test split.
        train_split: Container of prefixes belonging to the train split.
        dev_split: Container of prefixes belonging to the dev split.

    Returns:
        A tuple ``(test_files, train_files, dev_files)`` of three independent
        lists, each preserving the order of ``files``.
    """
    # Each split needs its own list object. A chained assignment
    # (``a = b = c = []``) would bind all three names to one shared list and
    # every matched file would end up in every split.
    test_files, train_files, dev_files = [], [], []

    for path in files:
        prefix = os.path.split(path)[1].split("_")[0]
        if prefix in test_split:
            test_files.append(path)
        elif prefix in train_split:
            train_files.append(path)
        elif prefix in dev_split:
            dev_files.append(path)
        else:
            logging.info("skipped file %s", path)

    return test_files, train_files, dev_files
|
||
|
||
class CRD3(nlp.GeneratorBasedBuilder):
    """Dataset builder for CRD3 (Critical Role Dungeons and Dragons Dataset).

    Downloads the upstream GitHub archive, assigns every JSON file found in
    the "aligned data" sub-directories to the train / test / validation split
    named by the archive's split files, and yields one example per turn of
    every aligned chunk.
    """

    # Sub-directories of "aligned data" whose files are loaded, in this order.
    _ALIGNED_SUBDIRS = ("c=2", "c=3", "c=4")

    def _info(self):
        """Return the dataset metadata: description, feature schema, homepage, citation."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "chunk": nlp.Value("string"),
                    "chunk_id": nlp.Value("int32"),
                    "turn_start": nlp.Value("int32"),
                    "turn_end": nlp.Value("int32"),
                    "alignment_score": nlp.Value("float32"),
                    "turn_num": nlp.Value("int32"),
                    "turns": nlp.features.Sequence(
                        {
                            "names": nlp.Value("string"),
                            "utterances": nlp.Value("string"),
                        }
                    ),
                }
            ),
            homepage="https://github.com/RevanthRameshkumar/CRD3",
            citation=_CITATION,
        )

    @staticmethod
    def _read_split_ids(split_file):
        """Return the non-empty, whitespace-stripped lines of ``split_file``."""
        with open(split_file, encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            return [entry for entry in stripped if entry]

    @staticmethod
    def _list_sorted(directory):
        """Return the full paths of the entries of ``directory``, sorted by name."""
        return [os.path.join(directory, name) for name in sorted(os.listdir(directory))]

    def _split_generators(self, dl_manager):
        """Download the archive and build the train / test / validation splits.

        Args:
            dl_manager: Download manager; only ``download_and_extract`` is used.

        Returns:
            A list of three ``nlp.SplitGenerator`` objects (train, test,
            validation). Each one passes its list of JSON file paths to
            ``_generate_examples`` as ``files_path``.
        """
        root = dl_manager.download_and_extract(_URL)
        aligned_dir = os.path.join(root, "CRD3-master", "data", "aligned data")

        # The split files list one file-name prefix per line.
        test_splits = self._read_split_ids(os.path.join(aligned_dir, "test_files"))
        train_splits = self._read_split_ids(os.path.join(aligned_dir, "train_files"))
        dev_splits = self._read_split_ids(os.path.join(aligned_dir, "val_files"))

        # Sorting within each sub-directory keeps the file order deterministic.
        files = []
        for subdir in self._ALIGNED_SUBDIRS:
            files.extend(self._list_sorted(os.path.join(aligned_dir, subdir)))

        test_files, train_files, dev_files = get_train_test_dev_files(
            files, test_splits, train_splits, dev_splits
        )

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={"files_path": train_files},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={"files_path": test_files},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={"files_path": dev_files},
            ),
        ]

    def _generate_examples(self, files_path):
        """Yield ``(key, example)`` pairs, one per turn of every chunk in every file.

        Args:
            files_path: Iterable of paths to JSON files. Each file is read as a
                list of rows with the keys ``CHUNK``, ``ALIGNMENT`` and ``TURNS``.

        Yields:
            Tuples ``(key, example)``. ``key`` is
            ``"<file index>_<row index>_<turn index>"``; the file index keeps
            keys unique across files, since row and turn indices restart at 0
            in every file. ``example`` holds the chunk text, its alignment
            fields, and the names, utterances and number of a single turn.
        """
        for file_idx, path in enumerate(files_path):
            # Parse the whole file first so the handle is closed before the
            # generator is suspended at a ``yield``.
            with open(path, encoding="utf-8") as f:
                data = json.load(f)

            for row_idx, row in enumerate(data):
                alignment = row["ALIGNMENT"]
                chunk = row["CHUNK"]
                chunk_id = alignment["CHUNK ID"]
                turn_start = alignment["TURN START"]
                turn_end = alignment["TURN END"]
                score = alignment["ALIGNMENT SCORE"]

                for turn_idx, turn in enumerate(row["TURNS"]):
                    key = "{}_{}_{}".format(file_idx, row_idx, turn_idx)
                    yield key, {
                        "chunk": chunk,
                        "chunk_id": chunk_id,
                        "turn_start": turn_start,
                        "turn_end": turn_end,
                        "alignment_score": score,
                        "turn_num": turn["NUMBER"],
                        "turns": {
                            "names": turn["NAMES"],
                            "utterances": turn["UTTERANCES"],
                        },
                    }
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. \nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding \nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player \ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, \nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turn_num": {"dtype": "int32", "id": null, "_type": "Value"}, "turns": {"feature": {"names": {"dtype": "string", "id": null, "_type": "Value"}, "utterances": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "cr_d3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": 
"cr_d3"}, "test": {"name": "test", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}, "validation": {"name": "validation", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}}, "download_checksums": {"https://github.com/RevanthRameshkumar/CRD3/archive/master.zip": {"num_bytes": 293524408, "checksum": "485ee871073c66359320db3a380cc1fa7d8bc05c9c981d87dbf36df91041ff14"}}, "download_size": 293524408, "dataset_size": 4215618687, "size_in_bytes": 4509143095}} |
Binary file not shown.