Skip to content

Commit

Permalink
add crd3 dataset (huggingface#472)
Browse files Browse the repository at this point in the history
  • Loading branch information
mariamabarham authored Aug 3, 2020
1 parent b85ae23 commit a6430ef
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 0 deletions.
150 changes: 150 additions & 0 deletions datasets/crd3/crd3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""CRD3 dataset"""

from __future__ import absolute_import, division, print_function
import logging

import json
import os

import nlp


# BibTeX entry for the CRD3 paper; passed to nlp.DatasetInfo(citation=...).
# NOTE(review): the entry has no citation key after "@inproceedings{" --
# confirm whether downstream BibTeX tooling accepts it before relying on it.
_CITATION = """
@inproceedings{
title = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},
author = {Rameshkumar, Revanth and Bailey, Peter},
year = {2020},
publisher = {Association for Computational Linguistics},
conference = {ACL}
}
"""

# Human-readable summary of the dataset; passed to
# nlp.DatasetInfo(description=...).
_DESCRIPTION = """
Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.
Critical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.
The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding
abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player
collaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,
and semantic ties to the previous dialogues.
"""

# Archive of the master branch of the CRD3 GitHub repository; the builder
# downloads and extracts it and then reads files below "CRD3-master/".
_URL = "https://github.com/RevanthRameshkumar/CRD3/archive/master.zip"

def get_train_test_dev_files(files, test_split, train_split, dev_split):
    """Partition data file paths into test, train and dev lists.

    A file is assigned according to the part of its base name that precedes
    the first underscore. That prefix is looked up in ``test_split``, then
    ``train_split``, then ``dev_split``; the first match wins. Files whose
    prefix appears in none of the splits are logged at INFO level and dropped.

    Args:
        files: iterable of file paths to partition.
        test_split: container of name prefixes belonging to the test split.
        train_split: container of name prefixes belonging to the train split.
        dev_split: container of name prefixes belonging to the dev split.

    Returns:
        A ``(test_files, train_files, dev_files)`` tuple of lists, each
        preserving the order of ``files``.
    """
    # Three independent lists. A chained ``a = b = c = []`` would bind all
    # three names to one shared list, putting every file in every split.
    test_files, train_files, dev_files = [], [], []
    for path in files:
        prefix = os.path.split(path)[1].split("_")[0]
        if prefix in test_split:
            test_files.append(path)
        elif prefix in train_split:
            train_files.append(path)
        elif prefix in dev_split:
            dev_files.append(path)
        else:
            logging.info("skipped file %s", path)
    return test_files, train_files, dev_files


class CRD3(nlp.GeneratorBasedBuilder):
    """Builder for the CRD3 dataset.

    Downloads the CRD3 GitHub archive, assigns the JSON files found in the
    ``c=2``, ``c=3`` and ``c=4`` directories of the aligned data to the
    train/test/validation splits listed in the archive, and yields one
    example per turn of every aligned chunk.
    """

    def _info(self):
        """Return the dataset metadata: description, features, homepage, citation."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "chunk": nlp.Value("string"),
                    "chunk_id": nlp.Value("int32"),
                    "turn_start": nlp.Value("int32"),
                    "turn_end": nlp.Value("int32"),
                    "alignment_score": nlp.Value("float32"),
                    "turn_num": nlp.Value("int32"),
                    "turns": nlp.features.Sequence(
                        {
                            "names": nlp.Value("string"),
                            "utterances": nlp.Value("string"),
                        }
                    ),
                }
            ),
            homepage="https://github.com/RevanthRameshkumar/CRD3",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and build the train/test/validation generators.

        Args:
            dl_manager: download manager used to fetch and extract ``_URL``.

        Returns:
            A list of three ``nlp.SplitGenerator`` objects (train, test,
            validation), each carrying its data file paths under the
            ``files_path`` keyword.
        """
        path = dl_manager.download_and_extract(_URL)
        # Join component by component so separators are right on every OS.
        aligned_dir = os.path.join(path, "CRD3-master", "data", "aligned data")

        def read_split(name):
            # Each split file holds one entry per line.
            with open(os.path.join(aligned_dir, name), encoding="utf-8") as f:
                return [line.replace("\n", "") for line in f.readlines()]

        test_splits = read_split("test_files")
        train_splits = read_split("train_files")
        dev_splits = read_split("val_files")

        files = []
        for subdir in ("c=2", "c=3", "c=4"):
            directory = os.path.join(aligned_dir, subdir)
            # Sorted so that the example order does not depend on the
            # arbitrary order returned by os.listdir.
            files.extend(os.path.join(directory, name) for name in sorted(os.listdir(directory)))

        test_files, train_files, dev_files = get_train_test_dev_files(files, test_splits, train_splits, dev_splits)

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={"files_path": train_files},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={"files_path": test_files},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={"files_path": dev_files},
            ),
        ]

    def _generate_examples(self, files_path):
        """Yield ``(key, example)`` pairs, one per turn of every chunk.

        Args:
            files_path: list of JSON file paths belonging to one split.

        Yields:
            A string key of the form ``<file index>_<row index>_<turn index>``
            and the example dict matching the features declared in ``_info``.
        """
        for file_idx, file in enumerate(files_path):
            # Explicit UTF-8 so decoding does not depend on the platform's
            # default encoding.
            with open(file, encoding="utf-8") as f:
                data = json.load(f)
            for row_idx, row in enumerate(data):
                alignment = row["ALIGNMENT"]
                for turn_idx, turn in enumerate(row["TURNS"]):
                    # The file index is part of the key because row and turn
                    # indices restart at zero for every file; without it keys
                    # would collide across files of the same split.
                    key = "{}_{}_{}".format(file_idx, row_idx, turn_idx)
                    yield key, {
                        "chunk": row["CHUNK"],
                        "chunk_id": alignment["CHUNK ID"],
                        "turn_start": alignment["TURN START"],
                        "turn_end": alignment["TURN END"],
                        "alignment_score": alignment["ALIGNMENT SCORE"],
                        "turn_num": turn["NUMBER"],
                        # presumably NAMES and UTTERANCES are lists of strings,
                        # matching the Sequence feature -- verify against the data
                        "turns": {
                            "names": turn["NAMES"],
                            "utterances": turn["UTTERANCES"],
                        },
                    }


1 change: 1 addition & 0 deletions datasets/crd3/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. \nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding \nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player \ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, \nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turn_num": {"dtype": "int32", "id": null, "_type": "Value"}, "turns": {"feature": {"names": {"dtype": "string", "id": null, "_type": "Value"}, "utterances": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "cr_d3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": 
"cr_d3"}, "test": {"name": "test", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}, "validation": {"name": "validation", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}}, "download_checksums": {"https://github.com/RevanthRameshkumar/CRD3/archive/master.zip": {"num_bytes": 293524408, "checksum": "485ee871073c66359320db3a380cc1fa7d8bc05c9c981d87dbf36df91041ff14"}}, "download_size": 293524408, "dataset_size": 4215618687, "size_in_bytes": 4509143095}}
Binary file added datasets/crd3/dummy/0.0.0/dummy_data.zip
Binary file not shown.

0 comments on commit a6430ef

Please sign in to comment.