def count_unique_with_none(s):
    """Count the distinct values found across an iterable of lists.

    Entries that carry no values are skipped: ``None`` and float NaN. NaN is
    skipped because the notebook stores ``np.nan`` for ``topics`` when a
    dataset has no inferred metadata, and extending a list with a float
    raises ``TypeError``.

    Parameters
    ----------
    s : iterable
        Each element is an iterable of hashable values (typically a list),
        ``None``, or NaN.

    Returns
    -------
    int
        Number of distinct values over all non-missing elements; 0 when
        ``s`` is empty or holds only missing entries.
    """
    unique_vals = set()
    for lst in s:
        # `lst != lst` is true only for NaN; the isinstance guard keeps the
        # comparison away from list-like entries.
        if lst is None or (isinstance(lst, float) and lst != lst):
            continue
        unique_vals.update(lst)

    return len(unique_vals)
'dolly15k_2023'),\n", + " ('OpenAssistant', 'Open Assistant', 'kopf2023openassistant'),\n", + " ('Flan Collection', 'Flan Collection (Chain-of-Thought)|Flan Collection (Dialog)|Flan Collection (Flan 2021)|Flan Collection (P3)|Flan Collection (Super-NaturalInstructions)', 'longpre2023flan'),\n", + " ('xP3x', 'xP3x', 'muennighoff2022crosslingual'),\n", + " ('Tasksource Ins.', 'Tasksource Instruct', 'sileo2023tasksource'),\n", + " ('LAION OIG', 'OIG', 'oig2023'),\n", + " ('SHP', 'Stanford Human Preferences', 'SHP'),\n", + " ('ShareGPT', 'ShareGPT Vicuna', r'sharegpt'),\n", + " ('Self-Instruct', 'Self-Instruct', 'selfinstruct2022'),\n", + " ('WebGPT', 'OpenAI (WebGPT)', 'nakano2021webgpt'),\n", + " ('OpenAI Summ.', 'OpenAI (Summarize from Feedback)', 'stienon2020learning'),\n", + " ('Airoboros', 'Airoboros', 'Durbin2023Airoboros'),\n", + " ('Alpaca', 'Alpaca', 'alpaca'),\n", + " ('BaizeChat', 'Baize Chat Data', 'xu2023baize'),\n", + " ('BookSum', 'Book Summaries', 'kryscinski2022booksum'),\n", + " ('CamelAI Sci.', 'Camel-AI Science', 'li2023camel'),\n", + " ('CoT Coll.', 'CoT Collection', 'kim2023cot'),\n", + " ('Code Alpaca', 'Code Alpaca', ''),\n", + " ('GPT-4-Alpaca', 'GPT-4-Alpaca', 'peng2023instruction'),\n", + " ('GPTeacher', 'GPTeacher', ''),\n", + " ('Gorilla', 'Gorilla', 'patil2023gorilla'),\n", + " ('HC3', 'HC3 (Chinese)|HC3 (English)', 'guo2023close'),\n", + " ('Joke Expl.', 'Joke Explanation',''),\n", + " ('LIMA', 'LIMA', 'zhou2023lima'),\n", + " ('Longform', 'Longform', 'koksal2023longform'),\n", + " ('GPT4AllJ', 'NomicAI GPT4AllJ', 'gpt4all'),\n", + " ('OpenOrca', 'Open Orca', 'mukherjee2023orca'),\n", + " ('Tool-Llama', 'Tool-Llama', 'qin2023toolllm'),\n", + " ('UltraChat', 'UltraChat', 'ding2023enhancing'),\n", + " ('Unnatural Instr.', 'Unnatural Instructions', 'honovich2022unnatural'),\n", + " ('Evol-Instr.', 'WizardLM Evol-Instruct|WizardLM Evol-Instruct V2', 'xu2023wizardlm'),\n", + " ('StarCoder', 'StarCoder Self-Instruct', 'li2023starcoder'),\n", + 
# Flatten the '|'-separated summary keys of every collection into one list.
sk = [key for keys in dat['summary_keys'].str.split('|') for key in keys]

# Names (without extension) of the summary files on disk, templates excluded.
files = [
    os.path.splitext(f)[0]
    for f in os.listdir('data_summaries')
    if not f.startswith('_template')
]

# Sanity checks: every summary key is listed once and has a summary file.
assert len(set(sk)) == len(sk), 'duplicate summary keys in `dat`'
missing = set(sk) - set(files)
assert not missing, f'no summary file for: {sorted(missing)}'

# One row per summary key, pointing back at the short collection name.
short_names = pd.DataFrame(
    [
        (short, key)
        for short, keys in dat['summary_keys'].str.split('|').items()
        for key in keys
    ],
    columns=['short_name', 'summary_key'],
).set_index('summary_key')

# Load every summary file, keyed by its name without extension.
# The key is derived with os.path.splitext, exactly like `files` above, so a
# key that passed the assertion is guaranteed to exist in `summaries`
# (the previous `file.split('.')[0]` truncated names containing a dot).
summaries = {}
for file in os.listdir('data_summaries'):
    if file.startswith('_template'):
        continue

    with open(os.path.join('data_summaries', file), 'rt') as f:
        summaries[os.path.splitext(file)[0]] = json.load(f)
# The grouping files map each group name to a list of members; the table code
# looks a group up by member, so both mappings are inverted while loading.
with open('constants/domain_groups.json', 'rt') as f:
    domain_groups = dict(
        (member, group)
        for group, members in json.load(f).items()
        for member in members
    )

with open('constants/task_groups.json', 'rt') as f:
    task_groups = dict(
        (member, group)
        for group, members in json.load(f).items()
        for member in members
    )

# Keep only the last element of each license's entry as its class.
with open('constants/license_classes.json', 'rt') as f:
    license_classes = dict(
        (name, classes[-1])
        for name, classes in json.load(f).items()
    )
# 'Custom' licenses are always reported as 'Unspecified'.
license_classes['Custom'] = 'Unspecified'

# Tab-separated download counts, one row per collection.
hf_downloads_new = pd.read_csv('src/summary-tables/hf_downloads.csv', sep='\t')
def color_license_classes(s):
    """Render a collection's license classes as three LaTeX circle macros.

    The result always has three space-separated slots, in this order:
    commercial ('All'), unspecified ('Unspecified') and non-commercial or
    academic ('Acad' or 'NC'). A slot whose class is absent from ``s`` is
    filled with the transparent placeholder circle.

    Parameters
    ----------
    s : collection of str
        License class labels of one collection; must not be empty.

    Returns
    -------
    str
        The three macros joined by single spaces.
    """
    assert len(s) > 0

    blank = r'\TransparentCircle'
    slots = (
        ('All' in s, r'\CommercialDataCircle'),
        ('Unspecified' in s, r'\UnspecifiedDataCircle'),
        ('Acad' in s or 'NC' in s, r'\NCDataCircle'),
    )
    return ' '.join(macro if present else blank for present, macro in slots)
mean_targets_length = np.nan\n", + " else:\n", + " num_dialogs = metrics['Num Dialogs']\n", + " mean_inputs_length = metrics['Mean Inputs Length']\n", + " mean_targets_length = metrics['Mean Targets Length']\n", + " \n", + " if 'Text Sources' not in summaries[file][k].keys():\n", + " domains = None\n", + " elif not isinstance(summaries[file][k]['Text Sources'], (list, tuple)):\n", + " domains = None\n", + " else:\n", + " domains = summaries[file][k]['Text Sources']\n", + " domains = [domain_groups[d] for d in domains]\n", + "\n", + " if 'Task Categories' not in summaries[file][k].keys():\n", + " tasks = None\n", + " elif not isinstance(summaries[file][k]['Task Categories'], (list, tuple)):\n", + " tasks = None\n", + " else:\n", + " tasks = summaries[file][k]['Task Categories']\n", + " tasks = [task_groups[d] for d in tasks]\n", + "\n", + " inf_metadata = summaries[file][k].get('Inferred Metadata', None)\n", + " if inf_metadata is None or inf_metadata == '' or inf_metadata == {}:\n", + " topics = np.nan\n", + " else:\n", + " if 'Text Topics' not in inf_metadata.keys():\n", + " topics = None\n", + " elif not isinstance(inf_metadata['Text Topics'], (list, tuple)):\n", + " topics = None\n", + " else:\n", + " topics = inf_metadata['Text Topics']\n", + "\n", + " raw += [{\n", + " 'collection': collection,\n", + " 'summary_key': file,\n", + " 'sub': k,\n", + "\n", + " 'num_dialogs': num_dialogs,\n", + " 'mean_inputs_length': mean_inputs_length,\n", + " 'mean_targets_length': mean_targets_length,\n", + "\n", + " 'langs': langs,\n", + " 'topics': topics,\n", + " 'domains': domains,\n", + " 'tasks': tasks,\n", + " 'datasets': 1,\n", + " }]\n", + "raw = pd.DataFrame(raw)\n", + "\n", + "total_input_length = raw['num_dialogs'] * raw['mean_inputs_length']\n", + "total_targets_length = raw['num_dialogs'] * raw['mean_targets_length']\n", + "\n", + "num_dialogs = raw.groupby('collection')['num_dialogs'].sum()\n", + "mean_inputs_length = 
# Collect the Hugging Face download count of every summary key; keys that do
# not appear in the downloads table get NaN.
known_collections = hf_downloads_new['Collection'].tolist()
num_downs = []
for collection in dat.index:
    for file in dat.loc[collection, 'summary_keys'].split('|'):
        downs = np.nan
        if file in known_collections:
            matches = hf_downloads_new['Collection'] == file
            # .item() requires exactly one matching row for this key.
            downs = hf_downloads_new.loc[matches, 'sum HF Downloads (October 2023)'].item()

        num_downs.append((file, downs))

# Attach the counts to the summary_key -> collection table and sum them per
# collection.
downs = short_names.copy().rename(columns={'short_name': 'collection'})
downs['downs'] = pd.Series(dict(num_downs))
num_downs = downs.groupby('collection').sum()
dat['Num Downs'] = num_downs.fillna(0).astype(int)
# For every summary file, record the distinct "has a generating model" flags
# of its datasets (True when 'Model Generated' lists a non-empty model name).
mgen = {}
for k, datasets in summaries.items():
    for ds in datasets.values():
        models = [m for m in ds['Model Generated'] if m != '']
        mgen[k] = mgen.get(k, []) + [len(models) > 0]
mgen = pd.Series({k: list(set(v)) for k, v in mgen.items()})

tmp = short_names.copy()
tmp['mgen'] = mgen


def _source_icons(non_generated_share):
    """Map the share of non-model-generated flags to the LaTeX icon pair.

    The globe marks data without a generating model and the robot marks
    model-generated data; a missing side is padded with ``\\emojiblank``.
    Values outside [0, 1] (e.g. NaN) are returned unchanged.
    """
    if non_generated_share == 1:
        return r'\emoji{globe-with-meridians}\emojiblank'
    if non_generated_share == 0:
        return r'\emojiblank\emoji{robot}'
    if 0 < non_generated_share < 1:
        return r'\emoji{globe-with-meridians}\emoji{robot}'
    return non_generated_share


# Pool the flags of all files of a collection, turn them into the share of
# False flags, and map that share straight to its icon string. Building the
# string column in one step avoids writing strings into a float64 column,
# which pandas deprecates (FutureWarning) and will reject in a future version.
dat['Source'] = tmp.groupby('short_name')['mgen'] \
    .agg(lambda x: [item for sublist in x for item in sublist]) \
    .apply(lambda s: 1 - sum(s) / len(s)) \
    .map(_source_icons)
# Every format label that occurs in any summary. Datasets without a 'Format'
# entry contribute nothing, matching the tolerant lookup used below.
found_formats = {
    label
    for file in summaries
    for k in summaries[file]
    for label in summaries[file][k].get('Format', [])
}
# NOTE: labels missing from formats_map are silently dropped below; this
# check is intentionally left disabled.
# assert found_formats <= set(formats_map.keys())

# One record per dataset with its formats translated to the short codes.
fmts = pd.DataFrame([
    {
        'collection': collection,
        'summary_key': file,
        'sub': k,

        'formats': [
            formats_map[f]
            for f in summaries[file][k].get('Format', [])
            if f in formats_map
        ],
    }
    for collection in dat.index
    for file in dat.loc[collection, 'summary_keys'].split('|')
    for k in summaries[file]
])

# Union of the format codes of all datasets in a collection.
fmts = fmts.groupby('collection')['formats'] \
    .apply(lambda s: list({y for x in s for y in x})) \
    .rename('formats')

# One column per format code holding a LaTeX check mark or a blank.
check_marks = {True: r'\greencheck', False: r'\emojiblank'}
for fmt in formats_map.values():
    # The lambda is applied immediately, so it sees the current `fmt`.
    dat[fmt] = fmts.apply(lambda s: fmt in s).map(check_marks)
def color_map(value, cmap='BrBG', vmin=None, vmax=None):
    """Return the colormap color of ``value`` as an ``'R,G,B'`` string.

    ``value`` is normalized with ``matplotlib.colors.Normalize(vmin, vmax)``
    and looked up in the colormap named ``cmap``. Each of the first three
    channels is scaled by 255 and truncated to an int; alpha is dropped.

    Parameters
    ----------
    value : float
        The value to color.
    cmap : str
        Name of a registered matplotlib colormap.
    vmin, vmax : float or None
        Bounds handed to ``Normalize``.

    Returns
    -------
    str
        Comma-separated integer channels, e.g. for an xcolor ``RGB`` spec.
    """
    normalizer = mcolors.Normalize(vmin=vmin, vmax=vmax)
    palette = mp.colormaps[cmap]
    rgba = palette(normalizer(value))

    channels = []
    for level in rgba[:3]:
        channels.append(str(int(255 * level)))
    return ','.join(channels)
def _latex_color_name(row, col):
    """Build the color name of one cell from its row label and column key.

    Spaces and the ``\\textsc{...}`` / ``\\thead{...}`` wrappers are removed;
    the two column levels are joined with an underscore.
    """
    row_part = row.replace(' ', '') \
        .replace(r'\textsc{', '').replace('}', '')
    col_part = '_'.join(col).replace(' ', '') \
        .replace(r'\textsc{\thead{', '').replace('}}', '') \
        .replace(r'\textsc{', '').replace('}', '')
    return f"color{row_part}{col_part}"


def _make_cell_formatter(col, small_label=None):
    """Return the cell formatter for column ``col``.

    The formatter renders null and zero values as ``'-'``, values of at least
    1000 as whole thousands with a ``k`` suffix, and smaller values either as
    ``small_label`` (when given) or as a plain integer. Colored cells are
    wrapped in ``\\cellcolor`` using the name stored in ``tmp_val_color``.
    """
    def func(v, col=col):
        # Decide on '-' before the color lookup: null values never get a
        # color entry, so looking them up first would raise KeyError.
        if pd.isnull(v) or v == 0:
            return '-'

        color_name = tmp_val_color[col][v]
        if v >= 1000:
            return f'\\cellcolor{{{color_name}}}{{{v / 1000:,.0f}k}}'
        if small_label is not None:
            return f'\\cellcolor{{{color_name}}}{{{small_label}}}'
        return f'\\cellcolor{{{color_name}}}{{{v:,.0f}}}'

    return func


for col in num_cols:
    tmp_val_color[col] = {}

    # Color on a log scale centered on the midpoint of the column's range;
    # the 1e-6 offset keeps log() finite for zero counts.
    vmin = np.log(dat[col].min() + 1e-6)
    vmax = np.log(dat[col].max() + 1e-6)
    midpt = (vmax + vmin) / 2
    vmin, vmax = vmin - midpt, vmax - midpt

    for row in dat.index:
        cell = dat.loc[row, col]
        value = np.log(cell + 1e-6) - midpt

        if pd.notnull(value):
            color_name = _latex_color_name(row, col)

            # Dividing by 4 confines the colors to the central quarter of
            # the colormap.
            rgb = color_map(value / 4, vmin=vmin, vmax=vmax)
            color_def += f"\\definecolor{{{color_name}}}{{RGB}}{{{rgb}}}\n"

            # Colors are looked up by cell value when the table is rendered.
            tmp_val_color[col][cell] = color_name

    # Dialog and download counts below 1000 are shown as '<1k' instead of
    # the exact number.
    if 'Dialogs' in col[1] or 'Downs' in col[1]:
        func = _make_cell_formatter(col, small_label='<1k')
    else:
        func = _make_cell_formatter(col)

    formatters[col] = func
+ { + "cell_type": "code", + "execution_count": 22, + "id": "1526d036-6a7c-482e-97d3-b4dfc5edd48e", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\setlength{\\tabcolsep}{1.9pt}\n", + "\\definecolor{colorAnthropicHHPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorDolly15kPropertyCounts_Datasets}{RGB}{245,239,222}\n", + "\\definecolor{colorOpenAssistantPropertyCounts_Datasets}{RGB}{245,244,242}\n", + "\\definecolor{colorFlanCollectionPropertyCounts_Datasets}{RGB}{182,227,220}\n", + "\\definecolor{colorxP3xPropertyCounts_Datasets}{RGB}{179,226,219}\n", + "\\definecolor{colorTasksourceIns.PropertyCounts_Datasets}{RGB}{196,232,227}\n", + "\\definecolor{colorLAIONOIGPropertyCounts_Datasets}{RGB}{242,244,244}\n", + "\\definecolor{colorSHPPropertyCounts_Datasets}{RGB}{245,244,242}\n", + "\\definecolor{colorShareGPTPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorSelf-InstructPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorWebGPTPropertyCounts_Datasets}{RGB}{245,237,214}\n", + "\\definecolor{colorOpenAISumm.PropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorAiroborosPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorAlpacaPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorBaizeChatPropertyCounts_Datasets}{RGB}{245,236,210}\n", + "\\definecolor{colorBookSumPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorCamelAISci.PropertyCounts_Datasets}{RGB}{245,243,238}\n", + "\\definecolor{colorCoTColl.PropertyCounts_Datasets}{RGB}{245,238,218}\n", + "\\definecolor{colorCodeAlpacaPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorGPT-4-AlpacaPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorGPTeacherPropertyCounts_Datasets}{RGB}{245,236,210}\n", + "\\definecolor{colorGorillaPropertyCounts_Datasets}{RGB}{240,223,178}\n", + 
"\\definecolor{colorHC3PropertyCounts_Datasets}{RGB}{245,241,232}\n", + "\\definecolor{colorJokeExpl.PropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorLIMAPropertyCounts_Datasets}{RGB}{245,237,214}\n", + "\\definecolor{colorLongformPropertyCounts_Datasets}{RGB}{245,239,222}\n", + "\\definecolor{colorGPT4AllJPropertyCounts_Datasets}{RGB}{245,239,222}\n", + "\\definecolor{colorOpenOrcaPropertyCounts_Datasets}{RGB}{245,236,210}\n", + "\\definecolor{colorTool-LlamaPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorUltraChatPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorUnnaturalInstr.PropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorEvol-Instr.PropertyCounts_Datasets}{RGB}{245,232,196}\n", + "\\definecolor{colorStarCoderPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorTinyStoriesPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorStackExchangePropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorTasksourceSTPropertyCounts_Datasets}{RGB}{200,234,229}\n", + "\\definecolor{colorCommitPackFTPropertyCounts_Datasets}{RGB}{196,232,227}\n", + "\\definecolor{colorOpAsstOctoPackPropertyCounts_Datasets}{RGB}{240,223,178}\n", + "\\definecolor{colorAnthropicHHPropertyCounts_Dialogs}{RGB}{245,242,234}\n", + "\\definecolor{colorDolly15kPropertyCounts_Dialogs}{RGB}{245,237,214}\n", + "\\definecolor{colorOpenAssistantPropertyCounts_Dialogs}{RGB}{245,236,210}\n", + "\\definecolor{colorFlanCollectionPropertyCounts_Dialogs}{RGB}{222,239,237}\n", + "\\definecolor{colorxP3xPropertyCounts_Dialogs}{RGB}{179,226,219}\n", + "\\definecolor{colorTasksourceIns.PropertyCounts_Dialogs}{RGB}{231,241,240}\n", + "\\definecolor{colorLAIONOIGPropertyCounts_Dialogs}{RGB}{222,239,237}\n", + "\\definecolor{colorSHPPropertyCounts_Dialogs}{RGB}{245,243,240}\n", + "\\definecolor{colorShareGPTPropertyCounts_Dialogs}{RGB}{245,240,228}\n", + 
"\\definecolor{colorSelf-InstructPropertyCounts_Dialogs}{RGB}{245,240,228}\n", + "\\definecolor{colorWebGPTPropertyCounts_Dialogs}{RGB}{245,237,216}\n", + "\\definecolor{colorOpenAISumm.PropertyCounts_Dialogs}{RGB}{245,241,230}\n", + "\\definecolor{colorAiroborosPropertyCounts_Dialogs}{RGB}{245,240,226}\n", + "\\definecolor{colorAlpacaPropertyCounts_Dialogs}{RGB}{245,239,224}\n", + "\\definecolor{colorBaizeChatPropertyCounts_Dialogs}{RGB}{245,242,236}\n", + "\\definecolor{colorBookSumPropertyCounts_Dialogs}{RGB}{245,235,206}\n", + "\\definecolor{colorCamelAISci.PropertyCounts_Dialogs}{RGB}{244,244,244}\n", + "\\definecolor{colorCoTColl.PropertyCounts_Dialogs}{RGB}{235,242,241}\n", + "\\definecolor{colorCodeAlpacaPropertyCounts_Dialogs}{RGB}{245,237,216}\n", + "\\definecolor{colorGPT-4-AlpacaPropertyCounts_Dialogs}{RGB}{245,239,224}\n", + "\\definecolor{colorGPTeacherPropertyCounts_Dialogs}{RGB}{245,241,230}\n", + "\\definecolor{colorGorillaPropertyCounts_Dialogs}{RGB}{245,237,214}\n", + "\\definecolor{colorHC3PropertyCounts_Dialogs}{RGB}{245,239,222}\n", + "\\definecolor{colorJokeExpl.PropertyCounts_Dialogs}{RGB}{239,221,175}\n", + "\\definecolor{colorLIMAPropertyCounts_Dialogs}{RGB}{244,229,189}\n", + "\\definecolor{colorLongformPropertyCounts_Dialogs}{RGB}{245,237,216}\n", + "\\definecolor{colorGPT4AllJPropertyCounts_Dialogs}{RGB}{242,244,244}\n", + "\\definecolor{colorOpenOrcaPropertyCounts_Dialogs}{RGB}{229,241,239}\n", + "\\definecolor{colorTool-LlamaPropertyCounts_Dialogs}{RGB}{245,239,222}\n", + "\\definecolor{colorUltraChatPropertyCounts_Dialogs}{RGB}{236,243,242}\n", + "\\definecolor{colorUnnaturalInstr.PropertyCounts_Dialogs}{RGB}{245,240,226}\n", + "\\definecolor{colorEvol-Instr.PropertyCounts_Dialogs}{RGB}{245,242,236}\n", + "\\definecolor{colorStarCoderPropertyCounts_Dialogs}{RGB}{243,227,186}\n", + "\\definecolor{colorTinyStoriesPropertyCounts_Dialogs}{RGB}{245,236,212}\n", + 
"\\definecolor{colorStackExchangePropertyCounts_Dialogs}{RGB}{222,239,237}\n", + "\\definecolor{colorTasksourceSTPropertyCounts_Dialogs}{RGB}{245,243,240}\n", + "\\definecolor{colorCommitPackFTPropertyCounts_Dialogs}{RGB}{244,244,244}\n", + "\\definecolor{colorOpAsstOctoPackPropertyCounts_Dialogs}{RGB}{245,236,210}\n", + "\\definecolor{colorAnthropicHHPropertyCounts_Tasks}{RGB}{245,240,228}\n", + "\\definecolor{colorDolly15kPropertyCounts_Tasks}{RGB}{240,243,243}\n", + "\\definecolor{colorOpenAssistantPropertyCounts_Tasks}{RGB}{245,244,242}\n", + "\\definecolor{colorFlanCollectionPropertyCounts_Tasks}{RGB}{179,226,219}\n", + "\\definecolor{colorxP3xPropertyCounts_Tasks}{RGB}{240,243,243}\n", + "\\definecolor{colorTasksourceIns.PropertyCounts_Tasks}{RGB}{202,234,230}\n", + "\\definecolor{colorLAIONOIGPropertyCounts_Tasks}{RGB}{204,235,230}\n", + "\\definecolor{colorSHPPropertyCounts_Tasks}{RGB}{233,242,240}\n", + "\\definecolor{colorShareGPTPropertyCounts_Tasks}{RGB}{217,238,235}\n", + "\\definecolor{colorSelf-InstructPropertyCounts_Tasks}{RGB}{233,242,240}\n", + "\\definecolor{colorWebGPTPropertyCounts_Tasks}{RGB}{245,244,242}\n", + "\\definecolor{colorOpenAISumm.PropertyCounts_Tasks}{RGB}{240,243,243}\n", + "\\definecolor{colorAiroborosPropertyCounts_Tasks}{RGB}{240,243,243}\n", + "\\definecolor{colorAlpacaPropertyCounts_Tasks}{RGB}{220,239,236}\n", + "\\definecolor{colorBaizeChatPropertyCounts_Tasks}{RGB}{204,235,230}\n", + "\\definecolor{colorBookSumPropertyCounts_Tasks}{RGB}{245,244,242}\n", + "\\definecolor{colorCamelAISci.PropertyCounts_Tasks}{RGB}{245,236,212}\n", + "\\definecolor{colorCoTColl.PropertyCounts_Tasks}{RGB}{204,235,230}\n", + "\\definecolor{colorCodeAlpacaPropertyCounts_Tasks}{RGB}{245,240,228}\n", + "\\definecolor{colorGPT-4-AlpacaPropertyCounts_Tasks}{RGB}{226,240,238}\n", + "\\definecolor{colorGPTeacherPropertyCounts_Tasks}{RGB}{220,239,236}\n", + "\\definecolor{colorGorillaPropertyCounts_Tasks}{RGB}{245,244,242}\n", + 
"\\definecolor{colorHC3PropertyCounts_Tasks}{RGB}{233,242,240}\n", + "\\definecolor{colorJokeExpl.PropertyCounts_Tasks}{RGB}{245,236,212}\n", + "\\definecolor{colorLIMAPropertyCounts_Tasks}{RGB}{211,237,233}\n", + "\\definecolor{colorLongformPropertyCounts_Tasks}{RGB}{208,236,232}\n", + "\\definecolor{colorGPT4AllJPropertyCounts_Tasks}{RGB}{211,237,233}\n", + "\\definecolor{colorOpenOrcaPropertyCounts_Tasks}{RGB}{208,236,232}\n", + "\\definecolor{colorTool-LlamaPropertyCounts_Tasks}{RGB}{245,236,212}\n", + "\\definecolor{colorUltraChatPropertyCounts_Tasks}{RGB}{226,240,238}\n", + "\\definecolor{colorUnnaturalInstr.PropertyCounts_Tasks}{RGB}{245,244,242}\n", + "\\definecolor{colorEvol-Instr.PropertyCounts_Tasks}{RGB}{208,236,232}\n", + "\\definecolor{colorStarCoderPropertyCounts_Tasks}{RGB}{239,221,175}\n", + "\\definecolor{colorTinyStoriesPropertyCounts_Tasks}{RGB}{245,244,242}\n", + "\\definecolor{colorStackExchangePropertyCounts_Tasks}{RGB}{239,221,175}\n", + "\\definecolor{colorTasksourceSTPropertyCounts_Tasks}{RGB}{196,232,227}\n", + "\\definecolor{colorCommitPackFTPropertyCounts_Tasks}{RGB}{239,221,175}\n", + "\\definecolor{colorOpAsstOctoPackPropertyCounts_Tasks}{RGB}{245,240,228}\n", + "\\definecolor{colorAnthropicHHPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorDolly15kPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorOpenAssistantPropertyCounts_Langs}{RGB}{240,243,243}\n", + "\\definecolor{colorFlanCollectionPropertyCounts_Langs}{RGB}{227,240,239}\n", + "\\definecolor{colorxP3xPropertyCounts_Langs}{RGB}{185,228,221}\n", + "\\definecolor{colorTasksourceIns.PropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorLAIONOIGPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorSHPPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorShareGPTPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorSelf-InstructPropertyCounts_Langs}{RGB}{245,232,196}\n", + 
"\\definecolor{colorWebGPTPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorOpenAISumm.PropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorAiroborosPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorAlpacaPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorBaizeChatPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorBookSumPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorCamelAISci.PropertyCounts_Langs}{RGB}{245,242,236}\n", + "\\definecolor{colorCoTColl.PropertyCounts_Langs}{RGB}{245,240,226}\n", + "\\definecolor{colorCodeAlpacaPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorGPT-4-AlpacaPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorGPTeacherPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorGorillaPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorHC3PropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorJokeExpl.PropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorLIMAPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorLongformPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorGPT4AllJPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorOpenOrcaPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorTool-LlamaPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorUltraChatPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorUnnaturalInstr.PropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorEvol-Instr.PropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorStarCoderPropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorTinyStoriesPropertyCounts_Langs}{RGB}{240,223,178}\n", + "\\definecolor{colorStackExchangePropertyCounts_Langs}{RGB}{245,232,196}\n", + "\\definecolor{colorTasksourceSTPropertyCounts_Langs}{RGB}{240,223,178}\n", + 
"\\definecolor{colorCommitPackFTPropertyCounts_Langs}{RGB}{179,226,219}\n", + "\\definecolor{colorOpAsstOctoPackPropertyCounts_Langs}{RGB}{240,243,243}\n", + "\\definecolor{colorAnthropicHHPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorDolly15kPropertyCounts_Topics}{RGB}{245,238,218}\n", + "\\definecolor{colorOpenAssistantPropertyCounts_Topics}{RGB}{245,244,244}\n", + "\\definecolor{colorFlanCollectionPropertyCounts_Topics}{RGB}{179,226,219}\n", + "\\definecolor{colorxP3xPropertyCounts_Topics}{RGB}{235,242,241}\n", + "\\definecolor{colorTasksourceIns.PropertyCounts_Topics}{RGB}{202,234,230}\n", + "\\definecolor{colorLAIONOIGPropertyCounts_Topics}{RGB}{233,242,240}\n", + "\\definecolor{colorSHPPropertyCounts_Topics}{RGB}{235,242,241}\n", + "\\definecolor{colorShareGPTPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorSelf-InstructPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorWebGPTPropertyCounts_Topics}{RGB}{245,237,216}\n", + "\\definecolor{colorOpenAISumm.PropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorAiroborosPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorAlpacaPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorBaizeChatPropertyCounts_Topics}{RGB}{245,237,216}\n", + "\\definecolor{colorBookSumPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorCamelAISci.PropertyCounts_Topics}{RGB}{245,240,226}\n", + "\\definecolor{colorCoTColl.PropertyCounts_Topics}{RGB}{245,236,210}\n", + "\\definecolor{colorCodeAlpacaPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorGPT-4-AlpacaPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorGPTeacherPropertyCounts_Topics}{RGB}{245,237,214}\n", + "\\definecolor{colorGorillaPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorHC3PropertyCounts_Topics}{RGB}{245,244,244}\n", + "\\definecolor{colorJokeExpl.PropertyCounts_Topics}{RGB}{240,223,178}\n", + 
"\\definecolor{colorLIMAPropertyCounts_Topics}{RGB}{245,238,220}\n", + "\\definecolor{colorLongformPropertyCounts_Topics}{RGB}{245,241,232}\n", + "\\definecolor{colorGPT4AllJPropertyCounts_Topics}{RGB}{245,240,228}\n", + "\\definecolor{colorOpenOrcaPropertyCounts_Topics}{RGB}{245,236,210}\n", + "\\definecolor{colorTool-LlamaPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorUltraChatPropertyCounts_Topics}{RGB}{241,224,181}\n", + "\\definecolor{colorUnnaturalInstr.PropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorEvol-Instr.PropertyCounts_Topics}{RGB}{245,232,196}\n", + "\\definecolor{colorStarCoderPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorTinyStoriesPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorStackExchangePropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorTasksourceSTPropertyCounts_Topics}{RGB}{208,236,232}\n", + "\\definecolor{colorCommitPackFTPropertyCounts_Topics}{RGB}{196,232,227}\n", + "\\definecolor{colorOpAsstOctoPackPropertyCounts_Topics}{RGB}{240,223,178}\n", + "\\definecolor{colorAnthropicHHPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorDolly15kPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorOpenAssistantPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorFlanCollectionPropertyCounts_Domains}{RGB}{179,226,219}\n", + "\\definecolor{colorxP3xPropertyCounts_Domains}{RGB}{206,235,231}\n", + "\\definecolor{colorTasksourceIns.PropertyCounts_Domains}{RGB}{187,229,223}\n", + "\\definecolor{colorLAIONOIGPropertyCounts_Domains}{RGB}{215,237,234}\n", + "\\definecolor{colorSHPPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorShareGPTPropertyCounts_Domains}{RGB}{245,236,210}\n", + "\\definecolor{colorSelf-InstructPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorWebGPTPropertyCounts_Domains}{RGB}{245,240,226}\n", + 
"\\definecolor{colorOpenAISumm.PropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorAiroborosPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorAlpacaPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorBaizeChatPropertyCounts_Domains}{RGB}{245,240,226}\n", + "\\definecolor{colorBookSumPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorCamelAISci.PropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorCoTColl.PropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorCodeAlpacaPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorGPT-4-AlpacaPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorGPTeacherPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorGorillaPropertyCounts_Domains}{RGB}{245,236,210}\n", + "\\definecolor{colorHC3PropertyCounts_Domains}{RGB}{236,243,242}\n", + "\\definecolor{colorJokeExpl.PropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorLIMAPropertyCounts_Domains}{RGB}{236,243,242}\n", + "\\definecolor{colorLongformPropertyCounts_Domains}{RGB}{245,243,238}\n", + "\\definecolor{colorGPT4AllJPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorOpenOrcaPropertyCounts_Domains}{RGB}{179,226,219}\n", + "\\definecolor{colorTool-LlamaPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorUltraChatPropertyCounts_Domains}{RGB}{245,236,210}\n", + "\\definecolor{colorUnnaturalInstr.PropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorEvol-Instr.PropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorStarCoderPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorTinyStoriesPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorStackExchangePropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorTasksourceSTPropertyCounts_Domains}{RGB}{196,232,227}\n", + 
"\\definecolor{colorCommitPackFTPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorOpAsstOctoPackPropertyCounts_Domains}{RGB}{240,223,178}\n", + "\\definecolor{colorAnthropicHHPropertyCounts_Downs}{RGB}{206,235,231}\n", + "\\definecolor{colorDolly15kPropertyCounts_Downs}{RGB}{179,226,219}\n", + "\\definecolor{colorOpenAssistantPropertyCounts_Downs}{RGB}{213,237,234}\n", + "\\definecolor{colorFlanCollectionPropertyCounts_Downs}{RGB}{211,237,233}\n", + "\\definecolor{colorxP3xPropertyCounts_Downs}{RGB}{222,239,237}\n", + "\\definecolor{colorTasksourceIns.PropertyCounts_Downs}{RGB}{240,243,243}\n", + "\\definecolor{colorLAIONOIGPropertyCounts_Downs}{RGB}{226,240,238}\n", + "\\definecolor{colorSHPPropertyCounts_Downs}{RGB}{217,238,235}\n", + "\\definecolor{colorShareGPTPropertyCounts_Downs}{RGB}{224,240,237}\n", + "\\definecolor{colorSelf-InstructPropertyCounts_Downs}{RGB}{218,238,235}\n", + "\\definecolor{colorWebGPTPropertyCounts_Downs}{RGB}{222,239,237}\n", + "\\definecolor{colorOpenAISumm.PropertyCounts_Downs}{RGB}{213,237,234}\n", + "\\definecolor{colorAiroborosPropertyCounts_Downs}{RGB}{222,239,237}\n", + "\\definecolor{colorAlpacaPropertyCounts_Downs}{RGB}{204,235,230}\n", + "\\definecolor{colorBaizeChatPropertyCounts_Downs}{RGB}{224,240,237}\n", + "\\definecolor{colorBookSumPropertyCounts_Downs}{RGB}{224,240,237}\n", + "\\definecolor{colorCamelAISci.PropertyCounts_Downs}{RGB}{233,242,240}\n", + "\\definecolor{colorCoTColl.PropertyCounts_Downs}{RGB}{226,240,238}\n", + "\\definecolor{colorCodeAlpacaPropertyCounts_Downs}{RGB}{217,238,235}\n", + "\\definecolor{colorGPT-4-AlpacaPropertyCounts_Downs}{RGB}{222,239,237}\n", + "\\definecolor{colorGPTeacherPropertyCounts_Downs}{RGB}{231,241,240}\n", + "\\definecolor{colorGorillaPropertyCounts_Downs}{RGB}{231,241,240}\n", + "\\definecolor{colorHC3PropertyCounts_Downs}{RGB}{220,239,236}\n", + "\\definecolor{colorJokeExpl.PropertyCounts_Downs}{RGB}{235,242,241}\n", + 
"\\definecolor{colorLIMAPropertyCounts_Downs}{RGB}{218,238,235}\n", + "\\definecolor{colorLongformPropertyCounts_Downs}{RGB}{218,238,235}\n", + "\\definecolor{colorGPT4AllJPropertyCounts_Downs}{RGB}{224,240,237}\n", + "\\definecolor{colorOpenOrcaPropertyCounts_Downs}{RGB}{209,236,232}\n", + "\\definecolor{colorTool-LlamaPropertyCounts_Downs}{RGB}{240,223,178}\n", + "\\definecolor{colorUltraChatPropertyCounts_Downs}{RGB}{220,239,236}\n", + "\\definecolor{colorUnnaturalInstr.PropertyCounts_Downs}{RGB}{238,243,242}\n", + "\\definecolor{colorEvol-Instr.PropertyCounts_Downs}{RGB}{220,239,236}\n", + "\\definecolor{colorStarCoderPropertyCounts_Downs}{RGB}{227,240,239}\n", + "\\definecolor{colorTinyStoriesPropertyCounts_Downs}{RGB}{213,237,234}\n", + "\\definecolor{colorStackExchangePropertyCounts_Downs}{RGB}{224,240,237}\n", + "\\definecolor{colorTasksourceSTPropertyCounts_Downs}{RGB}{229,241,239}\n", + "\\definecolor{colorCommitPackFTPropertyCounts_Downs}{RGB}{217,238,235}\n", + "\\definecolor{colorOpAsstOctoPackPropertyCounts_Downs}{RGB}{235,242,241}\n", + "\\definecolor{colorAnthropicHHTextLens_Inpt}{RGB}{240,223,178}\n", + "\\definecolor{colorDolly15kTextLens_Inpt}{RGB}{245,239,224}\n", + "\\definecolor{colorOpenAssistantTextLens_Inpt}{RGB}{246,232,195}\n", + "\\definecolor{colorFlanCollectionTextLens_Inpt}{RGB}{233,242,240}\n", + "\\definecolor{colorxP3xTextLens_Inpt}{RGB}{245,241,232}\n", + "\\definecolor{colorTasksourceIns.TextLens_Inpt}{RGB}{245,241,230}\n", + "\\definecolor{colorLAIONOIGTextLens_Inpt}{RGB}{245,238,220}\n", + "\\definecolor{colorSHPTextLens_Inpt}{RGB}{245,244,242}\n", + "\\definecolor{colorShareGPTTextLens_Inpt}{RGB}{245,237,216}\n", + "\\definecolor{colorSelf-InstructTextLens_Inpt}{RGB}{245,233,198}\n", + "\\definecolor{colorWebGPTTextLens_Inpt}{RGB}{245,243,238}\n", + "\\definecolor{colorOpenAISumm.TextLens_Inpt}{RGB}{238,243,242}\n", + "\\definecolor{colorAiroborosTextLens_Inpt}{RGB}{245,237,216}\n", + 
"\\definecolor{colorAlpacaTextLens_Inpt}{RGB}{245,241,230}\n", + "\\definecolor{colorBaizeChatTextLens_Inpt}{RGB}{240,223,178}\n", + "\\definecolor{colorBookSumTextLens_Inpt}{RGB}{179,226,219}\n", + "\\definecolor{colorCamelAISci.TextLens_Inpt}{RGB}{245,233,200}\n", + "\\definecolor{colorCoTColl.TextLens_Inpt}{RGB}{245,243,238}\n", + "\\definecolor{colorCodeAlpacaTextLens_Inpt}{RGB}{244,229,189}\n", + "\\definecolor{colorGPT-4-AlpacaTextLens_Inpt}{RGB}{245,232,196}\n", + "\\definecolor{colorGPTeacherTextLens_Inpt}{RGB}{245,236,210}\n", + "\\definecolor{colorGorillaTextLens_Inpt}{RGB}{246,232,195}\n", + "\\definecolor{colorHC3TextLens_Inpt}{RGB}{246,232,195}\n", + "\\definecolor{colorJokeExpl.TextLens_Inpt}{RGB}{243,227,186}\n", + "\\definecolor{colorLIMATextLens_Inpt}{RGB}{245,236,210}\n", + "\\definecolor{colorLongformTextLens_Inpt}{RGB}{245,243,240}\n", + "\\definecolor{colorGPT4AllJTextLens_Inpt}{RGB}{245,244,242}\n", + "\\definecolor{colorOpenOrcaTextLens_Inpt}{RGB}{240,243,243}\n", + "\\definecolor{colorTool-LlamaTextLens_Inpt}{RGB}{200,234,229}\n", + "\\definecolor{colorUltraChatTextLens_Inpt}{RGB}{245,237,216}\n", + "\\definecolor{colorUnnaturalInstr.TextLens_Inpt}{RGB}{245,238,218}\n", + "\\definecolor{colorEvol-Instr.TextLens_Inpt}{RGB}{245,241,232}\n", + "\\definecolor{colorStarCoderTextLens_Inpt}{RGB}{245,235,206}\n", + "\\definecolor{colorTinyStoriesTextLens_Inpt}{RGB}{245,241,230}\n", + "\\definecolor{colorStackExchangeTextLens_Inpt}{RGB}{240,243,243}\n", + "\\definecolor{colorTasksourceSTTextLens_Inpt}{RGB}{224,240,237}\n", + "\\definecolor{colorCommitPackFTTextLens_Inpt}{RGB}{245,242,236}\n", + "\\definecolor{colorOpAsstOctoPackTextLens_Inpt}{RGB}{246,232,195}\n", + "\\definecolor{colorAnthropicHHTextLens_Tgt}{RGB}{245,241,230}\n", + "\\definecolor{colorDolly15kTextLens_Tgt}{RGB}{245,241,232}\n", + "\\definecolor{colorOpenAssistantTextLens_Tgt}{RGB}{245,243,240}\n", + "\\definecolor{colorFlanCollectionTextLens_Tgt}{RGB}{245,238,218}\n", + 
"\\definecolor{colorxP3xTextLens_Tgt}{RGB}{245,242,234}\n", + "\\definecolor{colorTasksourceIns.TextLens_Tgt}{RGB}{246,232,195}\n", + "\\definecolor{colorLAIONOIGTextLens_Tgt}{RGB}{245,243,238}\n", + "\\definecolor{colorSHPTextLens_Tgt}{RGB}{245,242,236}\n", + "\\definecolor{colorShareGPTTextLens_Tgt}{RGB}{244,244,244}\n", + "\\definecolor{colorSelf-InstructTextLens_Tgt}{RGB}{245,237,216}\n", + "\\definecolor{colorWebGPTTextLens_Tgt}{RGB}{245,243,240}\n", + "\\definecolor{colorOpenAISumm.TextLens_Tgt}{RGB}{245,238,220}\n", + "\\definecolor{colorAiroborosTextLens_Tgt}{RGB}{245,243,238}\n", + "\\definecolor{colorAlpacaTextLens_Tgt}{RGB}{245,240,228}\n", + "\\definecolor{colorBaizeChatTextLens_Tgt}{RGB}{245,240,226}\n", + "\\definecolor{colorBookSumTextLens_Tgt}{RGB}{240,243,243}\n", + "\\definecolor{colorCamelAISci.TextLens_Tgt}{RGB}{245,243,240}\n", + "\\definecolor{colorCoTColl.TextLens_Tgt}{RGB}{245,240,228}\n", + "\\definecolor{colorCodeAlpacaTextLens_Tgt}{RGB}{245,239,224}\n", + "\\definecolor{colorGPT-4-AlpacaTextLens_Tgt}{RGB}{245,242,236}\n", + "\\definecolor{colorGPTeacherTextLens_Tgt}{RGB}{245,241,232}\n", + "\\definecolor{colorGorillaTextLens_Tgt}{RGB}{245,236,212}\n", + "\\definecolor{colorHC3TextLens_Tgt}{RGB}{245,243,238}\n", + "\\definecolor{colorJokeExpl.TextLens_Tgt}{RGB}{245,242,236}\n", + "\\definecolor{colorLIMATextLens_Tgt}{RGB}{235,242,241}\n", + "\\definecolor{colorLongformTextLens_Tgt}{RGB}{238,243,242}\n", + "\\definecolor{colorGPT4AllJTextLens_Tgt}{RGB}{242,244,244}\n", + "\\definecolor{colorOpenOrcaTextLens_Tgt}{RGB}{245,242,236}\n", + "\\definecolor{colorTool-LlamaTextLens_Tgt}{RGB}{244,244,244}\n", + "\\definecolor{colorUltraChatTextLens_Tgt}{RGB}{242,244,244}\n", + "\\definecolor{colorUnnaturalInstr.TextLens_Tgt}{RGB}{245,236,210}\n", + "\\definecolor{colorEvol-Instr.TextLens_Tgt}{RGB}{240,243,243}\n", + "\\definecolor{colorStarCoderTextLens_Tgt}{RGB}{245,242,236}\n", + "\\definecolor{colorTinyStoriesTextLens_Tgt}{RGB}{179,226,219}\n", + 
"\\definecolor{colorStackExchangeTextLens_Tgt}{RGB}{245,244,242}\n", + "\\definecolor{colorTasksourceSTTextLens_Tgt}{RGB}{240,223,178}\n", + "\\definecolor{colorCommitPackFTTextLens_Tgt}{RGB}{245,244,242}\n", + "\\definecolor{colorOpAsstOctoPackTextLens_Tgt}{RGB}{245,244,242}\n", + "\n", + "\\begin{table*}\n", + "\\centering\n", + "\\caption{\\textbf{Alignment tuning collections and their characteristics}. Properties of the collections include numbers of datasets, dialogs, tasks, languages, topics, Huggingface downloads (\"Downs\"), text domains, the average length of input and target text. The \\textsc{Source} column indicates whether a collection consists of human-generated web text (\\emoji{globe-with-meridians}), was generated with the help of a language model (\\emoji{robot}) or is a mix of both (\\emoji{globe-with-meridians}\\emoji{robot}). We indicate the type of dialogs present in several columns, with some collections have more than one: zero-shot (Z), few-shot (F), response ranking (R), chain-of-thought (C), and multi-turn dialog (M). Finally, the \\textsc{Use} column indicates whether a collection includes data freely usable even for commercial purposes (\\protect\\CommercialDataCircle), data usable only for noncommercial purposes or academic research (\\protect\\NCDataCircle) and data whose license status is not specified precisely enough to allow us to determine commercial use permissions (\\protect\\UnspecifiedDataCircle). 
Note that each collection may have different datasets with one, two, or all three of these statuses.}\n", + "\\label{tab:collections}\n", + "\\begin{tabular}{l|ccccccc|rr|cp{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}c}\n", + "\\toprule\n", + " & \\multicolumn{7}{c}{\\textsc{Property Counts}} & \\multicolumn{2}{c}{\\textsc{Text Lens}} & \\multicolumn{7}{c}{\\textsc{Dataset Types}} \\\\\n", + " & \\textsc{\\thead{Datasets}} & \\textsc{\\thead{Dialogs}} & \\textsc{\\thead{Tasks}} & \\textsc{\\thead{Langs}} & \\textsc{\\thead{Topics}} & \\textsc{\\thead{Domains}} & \\textsc{\\thead{Downs}} & \\textsc{\\thead{Inpt}} & \\textsc{\\thead{Tgt}} & \\textsc{\\thead{Source}} & \\textsc{\\thead{Z}} & \\textsc{\\thead{F}} & \\textsc{\\thead{C}} & \\textsc{\\thead{R}} & \\textsc{\\thead{M}} & \\textsc{\\thead{Use}} \\\\\n", + "\\textsc{Collection} & & & & & & & & & & & & & & & & \\\\\n", + "\\midrule\n", + "Airoboros & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorAiroborosPropertyCounts_Dialogs}{59k} & \\cellcolor{colorAiroborosPropertyCounts_Tasks}{5} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorAiroborosPropertyCounts_Downs}{1k} & \\cellcolor{colorAiroborosTextLens_Inpt}{300} & \\cellcolor{colorAiroborosTextLens_Tgt}{600} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Alpaca & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorAlpacaPropertyCounts_Dialogs}{52k} & \\cellcolor{colorGPTeacherPropertyCounts_Tasks}{8} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorAlpacaPropertyCounts_Downs}{100k} & 
\\cellcolor{colorAlpacaTextLens_Inpt}{505} & \\cellcolor{colorAlpacaTextLens_Tgt}{270} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Anthropic HH & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorAnthropicHHPropertyCounts_Dialogs}{161k} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Tasks}{3} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorAnthropicHHPropertyCounts_Downs}{82k} & \\cellcolor{colorAnthropicHHTextLens_Inpt}{69} & \\cellcolor{colorAnthropicHHTextLens_Tgt}{311} & \\emojiblank\\emoji{robot} & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "BaizeChat & \\cellcolor{colorOpenOrcaPropertyCounts_Datasets}{4} & \\cellcolor{colorBaizeChatPropertyCounts_Dialogs}{210k} & \\cellcolor{colorCoTColl.PropertyCounts_Tasks}{12} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorBaizeChatPropertyCounts_Topics}{37} & \\cellcolor{colorBaizeChatPropertyCounts_Domains}{3} & \\cellcolor{colorBaizeChatPropertyCounts_Downs}{<1k} & \\cellcolor{colorBaizeChatTextLens_Inpt}{74} & \\cellcolor{colorBaizeChatTextLens_Tgt}{234} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "BookSum & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorBookSumPropertyCounts_Dialogs}{7k} & \\cellcolor{colorTinyStoriesPropertyCounts_Tasks}{4} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & 
\\cellcolor{colorBookSumPropertyCounts_Downs}{<1k} & \\cellcolor{colorBookSumTextLens_Inpt}{14k} & \\cellcolor{colorBookSumTextLens_Tgt}{2k} & \\emoji{globe-with-meridians}\\emojiblank & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "CamelAI Sci. & \\cellcolor{colorCamelAISci.PropertyCounts_Datasets}{15} & \\cellcolor{colorCamelAISci.PropertyCounts_Dialogs}{660k} & \\cellcolor{colorTool-LlamaPropertyCounts_Tasks}{2} & \\cellcolor{colorCamelAISci.PropertyCounts_Langs}{11} & \\cellcolor{colorCamelAISci.PropertyCounts_Topics}{53} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorCamelAISci.PropertyCounts_Downs}{<1k} & \\cellcolor{colorCamelAISci.TextLens_Inpt}{150} & \\cellcolor{colorCamelAISci.TextLens_Tgt}{687} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "CoT Coll. 
& \\cellcolor{colorCoTColl.PropertyCounts_Datasets}{6} & \\cellcolor{colorCoTColl.PropertyCounts_Dialogs}{2,183k} & \\cellcolor{colorCoTColl.PropertyCounts_Tasks}{12} & \\cellcolor{colorCoTColl.PropertyCounts_Langs}{7} & \\cellcolor{colorCoTColl.PropertyCounts_Topics}{29} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorCoTColl.PropertyCounts_Downs}{<1k} & \\cellcolor{colorCoTColl.TextLens_Inpt}{728} & \\cellcolor{colorCoTColl.TextLens_Tgt}{265} & \\emojiblank\\emoji{robot} & \\emojiblank & \\emojiblank & \\greencheck & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Code Alpaca & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorCodeAlpacaPropertyCounts_Dialogs}{20k} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Tasks}{3} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorCodeAlpacaPropertyCounts_Downs}{5k} & \\cellcolor{colorCodeAlpacaTextLens_Inpt}{97} & \\cellcolor{colorCodeAlpacaTextLens_Tgt}{196} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "CommitPackFT & \\cellcolor{colorCommitPackFTPropertyCounts_Datasets}{277} & \\cellcolor{colorCommitPackFTPropertyCounts_Dialogs}{702k} & \\cellcolor{colorCommitPackFTPropertyCounts_Tasks}{1} & \\cellcolor{colorCommitPackFTPropertyCounts_Langs}{278} & \\cellcolor{colorCommitPackFTPropertyCounts_Topics}{751} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorCommitPackFTPropertyCounts_Downs}{4k} & \\cellcolor{colorCommitPackFTTextLens_Inpt}{645} & \\cellcolor{colorCommitPackFTTextLens_Tgt}{784} & \\emoji{globe-with-meridians}\\emojiblank & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & 
\\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Dolly 15k & \\cellcolor{colorGPT4AllJPropertyCounts_Datasets}{7} & \\cellcolor{colorDolly15kPropertyCounts_Dialogs}{15k} & \\cellcolor{colorAiroborosPropertyCounts_Tasks}{5} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorDolly15kPropertyCounts_Topics}{38} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorDolly15kPropertyCounts_Downs}{10,116k} & \\cellcolor{colorDolly15kTextLens_Inpt}{423} & \\cellcolor{colorDolly15kTextLens_Tgt}{357} & \\emoji{globe-with-meridians}\\emojiblank & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Evol-Instr. & \\cellcolor{colorEvol-Instr.PropertyCounts_Datasets}{2} & \\cellcolor{colorEvol-Instr.PropertyCounts_Dialogs}{213k} & \\cellcolor{colorEvol-Instr.PropertyCounts_Tasks}{11} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorEvol-Instr.PropertyCounts_Topics}{17} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorEvol-Instr.PropertyCounts_Downs}{2k} & \\cellcolor{colorEvol-Instr.TextLens_Inpt}{570} & \\cellcolor{colorEvol-Instr.TextLens_Tgt}{2k} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Flan Collection & \\cellcolor{colorFlanCollectionPropertyCounts_Datasets}{450} & \\cellcolor{colorFlanCollectionPropertyCounts_Dialogs}{9,813k} & \\cellcolor{colorFlanCollectionPropertyCounts_Tasks}{19} & \\cellcolor{colorFlanCollectionPropertyCounts_Langs}{39} & \\cellcolor{colorFlanCollectionPropertyCounts_Topics}{1k} & \\cellcolor{colorOpenOrcaPropertyCounts_Domains}{23} & \\cellcolor{colorFlanCollectionPropertyCounts_Downs}{19k} & \\cellcolor{colorFlanCollectionTextLens_Inpt}{2k} & \\cellcolor{colorFlanCollectionTextLens_Tgt}{128} & 
\\emoji{globe-with-meridians}\\emoji{robot} & \\greencheck & \\greencheck & \\greencheck & \\emojiblank & \\emojiblank & \\TransparentCircle \\UnspecifiedDataCircle \\TransparentCircle \\\\\n", + "GPT-4-Alpaca & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorGPT-4-AlpacaPropertyCounts_Dialogs}{55k} & \\cellcolor{colorUltraChatPropertyCounts_Tasks}{7} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorGPT-4-AlpacaPropertyCounts_Downs}{1k} & \\cellcolor{colorGPT-4-AlpacaTextLens_Inpt}{130} & \\cellcolor{colorGPT-4-AlpacaTextLens_Tgt}{543} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "GPT4AllJ & \\cellcolor{colorGPT4AllJPropertyCounts_Datasets}{7} & \\cellcolor{colorGPT4AllJPropertyCounts_Dialogs}{809k} & \\cellcolor{colorGPT4AllJPropertyCounts_Tasks}{10} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorGPT4AllJPropertyCounts_Topics}{56} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorGPT4AllJPropertyCounts_Downs}{<1k} & \\cellcolor{colorGPT4AllJTextLens_Inpt}{883} & \\cellcolor{colorGPT4AllJTextLens_Tgt}{1k} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "GPTeacher & \\cellcolor{colorOpenOrcaPropertyCounts_Datasets}{4} & \\cellcolor{colorGPTeacherPropertyCounts_Dialogs}{103k} & \\cellcolor{colorGPTeacherPropertyCounts_Tasks}{8} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorGPTeacherPropertyCounts_Topics}{33} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorGPTeacherPropertyCounts_Downs}{<1k} & 
\\cellcolor{colorGPTeacherTextLens_Inpt}{227} & \\cellcolor{colorGPTeacherTextLens_Tgt}{360} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Gorilla & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorGorillaPropertyCounts_Dialogs}{15k} & \\cellcolor{colorTinyStoriesPropertyCounts_Tasks}{4} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorUltraChatPropertyCounts_Domains}{2} & \\cellcolor{colorGorillaPropertyCounts_Downs}{<1k} & \\cellcolor{colorHC3TextLens_Inpt}{119} & \\cellcolor{colorGorillaTextLens_Tgt}{76} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "HC3 & \\cellcolor{colorHC3PropertyCounts_Datasets}{12} & \\cellcolor{colorHC3PropertyCounts_Dialogs}{37k} & \\cellcolor{colorHC3PropertyCounts_Tasks}{6} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorHC3PropertyCounts_Topics}{102} & \\cellcolor{colorLIMAPropertyCounts_Domains}{6} & \\cellcolor{colorHC3PropertyCounts_Downs}{2k} & \\cellcolor{colorHC3TextLens_Inpt}{119} & \\cellcolor{colorHC3TextLens_Tgt}{652} & \\emojiblank\\emoji{robot} & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Joke Expl. 
& \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorJokeExpl.PropertyCounts_Dialogs}{<1k} & \\cellcolor{colorTool-LlamaPropertyCounts_Tasks}{2} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorJokeExpl.PropertyCounts_Downs}{<1k} & \\cellcolor{colorJokeExpl.TextLens_Inpt}{96} & \\cellcolor{colorJokeExpl.TextLens_Tgt}{547} & \\emoji{globe-with-meridians}\\emojiblank & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "LAION OIG & \\cellcolor{colorLAIONOIGPropertyCounts_Datasets}{26} & \\cellcolor{colorLAIONOIGPropertyCounts_Dialogs}{9,211k} & \\cellcolor{colorCoTColl.PropertyCounts_Tasks}{12} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorLAIONOIGPropertyCounts_Topics}{171} & \\cellcolor{colorLAIONOIGPropertyCounts_Domains}{11} & \\cellcolor{colorLAIONOIGPropertyCounts_Downs}{<1k} & \\cellcolor{colorLAIONOIGTextLens_Inpt}{343} & \\cellcolor{colorLAIONOIGTextLens_Tgt}{595} & \\emoji{globe-with-meridians}\\emoji{robot} & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\TransparentCircle \\UnspecifiedDataCircle \\TransparentCircle \\\\\n", + "LIMA & \\cellcolor{colorLIMAPropertyCounts_Datasets}{5} & \\cellcolor{colorLIMAPropertyCounts_Dialogs}{1k} & \\cellcolor{colorGPT4AllJPropertyCounts_Tasks}{10} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorLIMAPropertyCounts_Topics}{43} & \\cellcolor{colorLIMAPropertyCounts_Domains}{6} & \\cellcolor{colorLIMAPropertyCounts_Downs}{3k} & \\cellcolor{colorLIMATextLens_Inpt}{228} & \\cellcolor{colorLIMATextLens_Tgt}{3k} & \\emoji{globe-with-meridians}\\emojiblank & \\greencheck & \\greencheck & \\emojiblank & \\emojiblank & \\greencheck & \\TransparentCircle \\TransparentCircle 
\\TransparentCircle \\\\\n", + "Longform & \\cellcolor{colorGPT4AllJPropertyCounts_Datasets}{7} & \\cellcolor{colorLongformPropertyCounts_Dialogs}{23k} & \\cellcolor{colorEvol-Instr.PropertyCounts_Tasks}{11} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorLongformPropertyCounts_Topics}{63} & \\cellcolor{colorLongformPropertyCounts_Domains}{4} & \\cellcolor{colorLongformPropertyCounts_Downs}{3k} & \\cellcolor{colorLongformTextLens_Inpt}{810} & \\cellcolor{colorLongformTextLens_Tgt}{2k} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "OpAsst OctoPack & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Dialogs}{10k} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Tasks}{3} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Langs}{20} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Downs}{<1k} & \\cellcolor{colorOpAsstOctoPackTextLens_Inpt}{118} & \\cellcolor{colorOpAsstOctoPackTextLens_Tgt}{884} & \\emoji{globe-with-meridians}\\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "OpenAI Summ. 
& \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorOpenAISumm.PropertyCounts_Dialogs}{93k} & \\cellcolor{colorAiroborosPropertyCounts_Tasks}{5} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorOpenAISumm.PropertyCounts_Downs}{14k} & \\cellcolor{colorOpenAISumm.TextLens_Inpt}{1k} & \\cellcolor{colorOpenAISumm.TextLens_Tgt}{134} & \\emojiblank\\emoji{robot} & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "OpenAssistant & \\cellcolor{colorOpenAssistantPropertyCounts_Datasets}{19} & \\cellcolor{colorOpenAssistantPropertyCounts_Dialogs}{10k} & \\cellcolor{colorTinyStoriesPropertyCounts_Tasks}{4} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Langs}{20} & \\cellcolor{colorOpenAssistantPropertyCounts_Topics}{99} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorOpenAssistantPropertyCounts_Downs}{14k} & \\cellcolor{colorOpAsstOctoPackTextLens_Inpt}{118} & \\cellcolor{colorOpenAssistantTextLens_Tgt}{711} & \\emoji{globe-with-meridians}\\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "OpenOrca & \\cellcolor{colorOpenOrcaPropertyCounts_Datasets}{4} & \\cellcolor{colorOpenOrcaPropertyCounts_Dialogs}{4,234k} & \\cellcolor{colorEvol-Instr.PropertyCounts_Tasks}{11} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpenOrcaPropertyCounts_Topics}{30} & \\cellcolor{colorOpenOrcaPropertyCounts_Domains}{23} & \\cellcolor{colorOpenOrcaPropertyCounts_Downs}{28k} & \\cellcolor{colorOpenOrcaTextLens_Inpt}{1k} & \\cellcolor{colorOpenOrcaTextLens_Tgt}{492} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & 
\\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "SHP & \\cellcolor{colorSHPPropertyCounts_Datasets}{18} & \\cellcolor{colorSHPPropertyCounts_Dialogs}{349k} & \\cellcolor{colorHC3PropertyCounts_Tasks}{6} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorSHPPropertyCounts_Topics}{151} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorSHPPropertyCounts_Downs}{4k} & \\cellcolor{colorSHPTextLens_Inpt}{824} & \\cellcolor{colorSHPTextLens_Tgt}{496} & \\emoji{globe-with-meridians}\\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Self-Instruct & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorSelf-InstructPropertyCounts_Dialogs}{83k} & \\cellcolor{colorHC3PropertyCounts_Tasks}{6} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorSelf-InstructPropertyCounts_Downs}{3k} & \\cellcolor{colorSelf-InstructTextLens_Inpt}{134} & \\cellcolor{colorSelf-InstructTextLens_Tgt}{104} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "ShareGPT & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorShareGPTPropertyCounts_Dialogs}{77k} & \\cellcolor{colorShareGPTPropertyCounts_Tasks}{9} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorUltraChatPropertyCounts_Domains}{2} & \\cellcolor{colorShareGPTPropertyCounts_Downs}{<1k} & \\cellcolor{colorShareGPTTextLens_Inpt}{303} & \\cellcolor{colorShareGPTTextLens_Tgt}{1k} & \\emojiblank\\emoji{robot} & \\emojiblank & \\emojiblank & \\emojiblank 
& \\emojiblank & \\greencheck & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "StackExchange & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorStackExchangePropertyCounts_Dialogs}{10,607k} & \\cellcolor{colorCommitPackFTPropertyCounts_Tasks}{1} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorStackExchangePropertyCounts_Downs}{<1k} & \\cellcolor{colorStackExchangeTextLens_Inpt}{1k} & \\cellcolor{colorStackExchangeTextLens_Tgt}{901} & \\emoji{globe-with-meridians}\\emojiblank & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "StarCoder & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorStarCoderPropertyCounts_Dialogs}{<1k} & \\cellcolor{colorCommitPackFTPropertyCounts_Tasks}{1} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorStarCoderPropertyCounts_Downs}{<1k} & \\cellcolor{colorStarCoderTextLens_Inpt}{195} & \\cellcolor{colorStarCoderTextLens_Tgt}{504} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Tasksource Ins. 
& \\cellcolor{colorTasksourceIns.PropertyCounts_Datasets}{288} & \\cellcolor{colorTasksourceIns.PropertyCounts_Dialogs}{3,397k} & \\cellcolor{colorTasksourceIns.PropertyCounts_Tasks}{13} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorTasksourceIns.PropertyCounts_Topics}{582} & \\cellcolor{colorTasksourceIns.PropertyCounts_Domains}{20} & \\cellcolor{colorTasksourceIns.PropertyCounts_Downs}{<1k} & \\cellcolor{colorTasksourceIns.TextLens_Inpt}{518} & \\cellcolor{colorTasksourceIns.TextLens_Tgt}{18} & \\emoji{globe-with-meridians}\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\UnspecifiedDataCircle \\TransparentCircle \\\\\n", + "Tasksource ST & \\cellcolor{colorTasksourceSTPropertyCounts_Datasets}{229} & \\cellcolor{colorTasksourceSTPropertyCounts_Dialogs}{338k} & \\cellcolor{colorTasksourceSTPropertyCounts_Tasks}{15} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorTasksourceSTPropertyCounts_Topics}{477} & \\cellcolor{colorTasksourceSTPropertyCounts_Domains}{18} & \\cellcolor{colorTasksourceSTPropertyCounts_Downs}{<1k} & \\cellcolor{colorTasksourceSTTextLens_Inpt}{3k} & \\cellcolor{colorTasksourceSTTextLens_Tgt}{6} & \\emoji{globe-with-meridians}\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\UnspecifiedDataCircle \\TransparentCircle \\\\\n", + "TinyStories & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorTinyStoriesPropertyCounts_Dialogs}{14k} & \\cellcolor{colorTinyStoriesPropertyCounts_Tasks}{4} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorTinyStoriesPropertyCounts_Downs}{12k} & \\cellcolor{colorTinyStoriesTextLens_Inpt}{517} & \\cellcolor{colorTinyStoriesTextLens_Tgt}{194k} & \\emojiblank\\emoji{robot} 
& \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Tool-Llama & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorTool-LlamaPropertyCounts_Dialogs}{37k} & \\cellcolor{colorTool-LlamaPropertyCounts_Tasks}{2} & \\cellcolor{colorStackExchangePropertyCounts_Langs}{2} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & - & \\cellcolor{colorTool-LlamaTextLens_Inpt}{7k} & \\cellcolor{colorTool-LlamaTextLens_Tgt}{1k} & \\emojiblank\\emoji{robot} & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "UltraChat & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorUltraChatPropertyCounts_Dialogs}{1,468k} & \\cellcolor{colorUltraChatPropertyCounts_Tasks}{7} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorUltraChatPropertyCounts_Topics}{11} & \\cellcolor{colorUltraChatPropertyCounts_Domains}{2} & \\cellcolor{colorUltraChatPropertyCounts_Downs}{2k} & \\cellcolor{colorUltraChatTextLens_Inpt}{282} & \\cellcolor{colorUltraChatTextLens_Tgt}{1k} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "Unnatural Instr. 
& \\cellcolor{colorOpAsstOctoPackPropertyCounts_Datasets}{1} & \\cellcolor{colorUnnaturalInstr.PropertyCounts_Dialogs}{66k} & \\cellcolor{colorTinyStoriesPropertyCounts_Tasks}{4} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Topics}{10} & \\cellcolor{colorOpAsstOctoPackPropertyCounts_Domains}{1} & \\cellcolor{colorUnnaturalInstr.PropertyCounts_Downs}{<1k} & \\cellcolor{colorUnnaturalInstr.TextLens_Inpt}{331} & \\cellcolor{colorUnnaturalInstr.TextLens_Tgt}{68} & \\emojiblank\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "WebGPT & \\cellcolor{colorLIMAPropertyCounts_Datasets}{5} & \\cellcolor{colorWebGPTPropertyCounts_Dialogs}{20k} & \\cellcolor{colorTinyStoriesPropertyCounts_Tasks}{4} & \\cellcolor{colorTasksourceSTPropertyCounts_Langs}{1} & \\cellcolor{colorWebGPTPropertyCounts_Topics}{35} & \\cellcolor{colorBaizeChatPropertyCounts_Domains}{3} & \\cellcolor{colorWebGPTPropertyCounts_Downs}{1k} & \\cellcolor{colorWebGPTTextLens_Inpt}{737} & \\cellcolor{colorWebGPTTextLens_Tgt}{743} & \\emojiblank\\emoji{robot} & \\emojiblank & \\emojiblank & \\emojiblank & \\greencheck & \\emojiblank & \\TransparentCircle \\TransparentCircle \\TransparentCircle \\\\\n", + "xP3x & \\cellcolor{colorxP3xPropertyCounts_Datasets}{467} & \\cellcolor{colorxP3xPropertyCounts_Dialogs}{886,240k} & \\cellcolor{colorAiroborosPropertyCounts_Tasks}{5} & \\cellcolor{colorxP3xPropertyCounts_Langs}{245} & \\cellcolor{colorSHPPropertyCounts_Topics}{151} & \\cellcolor{colorxP3xPropertyCounts_Domains}{14} & \\cellcolor{colorxP3xPropertyCounts_Downs}{<1k} & \\cellcolor{colorxP3xTextLens_Inpt}{589} & \\cellcolor{colorxP3xTextLens_Tgt}{441} & \\emoji{globe-with-meridians}\\emoji{robot} & \\greencheck & \\emojiblank & \\emojiblank & \\emojiblank & \\emojiblank & \\TransparentCircle \\UnspecifiedDataCircle \\TransparentCircle \\\\\n", + 
"\\bottomrule\n", + "\\end{tabular}\n", + "\\end{table*}\n", + "\n" + ] + } + ], + "source": [ + "kwargs = {\n", + " 'environment': 'table*',\n", + " 'label': 'tab:collections',\n", + " 'position_float': 'centering',\n", + " 'column_format': 'l|ccccccc|rr|cp{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}p{0.3cm}c',\n", + " 'multicol_align': 'c',\n", + " \n", + " 'caption': r'''\n", + " \\textbf{Alignment tuning collections and their characteristics}. Properties of the collections include numbers of datasets, dialogs, tasks, languages, topics, Huggingface downloads (\"Downs\"), text domains, and the average length of input and target text. The \\textsc{Source} column indicates whether a collection consists of human-generated web text (\\emoji{globe-with-meridians}), was generated with the help of a language model (\\emoji{robot}) or is a mix of both (\\emoji{globe-with-meridians}\\emoji{robot}). We indicate the type of dialogs present in several columns, with some collections having more than one: zero-shot (Z), few-shot (F), response ranking (R), chain-of-thought (C), and multi-turn dialog (M). Finally, the \\textsc{Use} column indicates whether a collection includes data freely usable even for commercial purposes (\\protect\\CommercialDataCircle), data usable only for noncommercial purposes or academic research (\\protect\\NCDataCircle) and data whose license status is not specified precisely enough to allow us to determine commercial use permissions (\\protect\\UnspecifiedDataCircle). 
Note that each collection may have different datasets with one, two, or all three of these statuses.\n", + " '''.strip(),\n", + " \n", + " 'hrules': True,\n", + " 'convert_css': True,\n", + "}\n", + "\n", + "latex = dat \\\n", + " .sort_index() \\\n", + " .style \\\n", + " .format(formatter=formatters) \\\n", + " .to_latex(**kwargs)\n", + "\n", + "print('\\n'.join([\n", + " r'\\setlength{\\tabcolsep}{1.9pt}',\n", + " color_def,\n", + " latex,\n", + "]))" + ] + }, + { + "cell_type": "markdown", + "id": "1335a6b5-9fb1-4c83-9954-80320d9257b7", + "metadata": {}, + "source": [ + "# Appendix license/cite table" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "8f6ec891-8345-4649-9d1a-b375c818a72b", + "metadata": {}, + "outputs": [], + "source": [ + "license_table['OpenAI NC'] = license_table['License'] \\\n", + " .apply(lambda s: 'OpenAI' in s) \\\n", + " .replace(True, r'\\redcross') \\\n", + " .replace(False, r'\\emojiblank')\n", + "\n", + "license_table['License'] = license_table['License'].apply(lambda s: [v for v in s if v != 'OpenAI'])\n", + "license_table['License'] = license_table['License'].apply(lambda s: [v if v != 'Academic Research Purposes Only' else 'Academic Only' for v in s])\n", + "license_table['License'] = license_table['License'].apply(lambda s: s if 'Various' not in s else ['Various'])\n", + "license_table['License'] = license_table['License'].apply(lambda s: s if len(s) > 0 else ['Unspecified'])\n", + "license_table['License'] = license_table['License'].apply(lambda s: s if len(s) <= 3 else ['Various'])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "3acd044b-a57c-445d-a3c5-83ca52edb804", + "metadata": {}, + "outputs": [], + "source": [ + "license_table = license_table[['OpenAI NC', 'Cite', 'License']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "49f58c9d-c813-4e2e-a301-57c89a4e0fc1", + "metadata": {}, + "outputs": [], + "source": [ + "license_table['Cite'] = 
license_table['Cite'].apply(lambda s: r'\\citet{' + s + '}')\n", + "license_table.loc[license_table['Cite'] == r'\\citet{}', 'Cite'] = '--'\n", + "\n", + "license_table['License'] = license_table['License'].str.join(', ')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "b5a5568e-ffb2-45e3-81a1-55a3680b149a", + "metadata": {}, + "outputs": [], + "source": [ + "license_table.rename({'License': 'Licenses'}, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3abd2b9d-b83f-4c68-a24b-3787f13caec6", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{table*}\n", + "\\centering\n", + "\\caption{\\textbf{Licenses and citations} for the dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ''Various`` licenses, and we refer readers to our raw data for the full details. Datasets which were generated with the use of OpenAI APIs and implicate those APIs' noncompetition restrictions are marked with a red X (\\redcross) in the ``OpenAI NC'' column.}\n", + "\\label{tab:licenses}\n", + "\\begin{tabular}{l|clp{5.5cm}}\n", + "\\toprule\n", + " & OpenAI NC & Cite & Licenses \\\\\n", + "Collection & & & \\\\\n", + "\\midrule\n", + "Airoboros & \\redcross & \\citet{Durbin2023Airoboros} & Various \\\\\n", + "Alpaca & \\redcross & \\citet{alpaca} & CC BY-NC 4.0 \\\\\n", + "Anthropic HH & \\emojiblank & \\citet{bai2022training, gangulired} & MIT License \\\\\n", + "BaizeChat & \\redcross & \\citet{xu2023baize} & CC BY-NC 4.0 \\\\\n", + "BookSum & \\emojiblank & \\citet{kryscinski2022booksum} & Academic Only \\\\\n", + "CamelAI Sci. & \\redcross & \\citet{li2023camel} & CC BY-NC 4.0 \\\\\n", + "CoT Coll. 
& \\redcross & \\citet{kim2023cot} & Non Commercial \\\\\n", + "Code Alpaca & \\redcross & -- & Unspecified \\\\\n", + "CommitPackFT & \\emojiblank & \\citet{muennighoff2023octopack} & Various \\\\\n", + "Dolly 15k & \\emojiblank & \\citet{dolly15k_2023} & CC BY-SA 3.0 \\\\\n", + "Evol-Instr. & \\redcross & \\citet{xu2023wizardlm} & Academic Only \\\\\n", + "Flan Collection & \\emojiblank & \\citet{longpre2023flan} & Various \\\\\n", + "GPT-4-Alpaca & \\redcross & \\citet{peng2023instruction} & CC BY-NC 4.0 \\\\\n", + "GPT4AllJ & \\redcross & \\citet{gpt4all} & Various \\\\\n", + "GPTeacher & \\redcross & -- & MIT License \\\\\n", + "Gorilla & \\redcross & \\citet{patil2023gorilla} & Apache License 2.0 \\\\\n", + "HC3 & \\redcross & \\citet{guo2023close} & Various \\\\\n", + "Joke Expl. & \\emojiblank & -- & MIT License \\\\\n", + "LAION OIG & \\redcross & \\citet{oig2023} & Various \\\\\n", + "LIMA & \\emojiblank & \\citet{zhou2023lima} & CC BY-NC-SA 4.0 \\\\\n", + "Longform & \\redcross & \\citet{koksal2023longform} & CC BY-SA 4.0, Unspecified, CC BY-SA 3.0 \\\\\n", + "OpAsst OctoPack & \\emojiblank & \\citet{muennighoff2023octopack} & CC BY 4.0 \\\\\n", + "OpenAI Summ. & \\redcross & \\citet{stienon2020learning} & CC BY 4.0 \\\\\n", + "OpenAssistant & \\emojiblank & \\citet{kopf2023openassistant} & CC BY 4.0 \\\\\n", + "OpenOrca & \\redcross & \\citet{mukherjee2023orca} & Various \\\\\n", + "SHP & \\emojiblank & \\citet{SHP} & Unspecified \\\\\n", + "Self-Instruct & \\redcross & \\citet{selfinstruct2022} & Apache License 2.0 \\\\\n", + "ShareGPT & \\redcross & \\citet{sharegpt} & Unspecified \\\\\n", + "StackExchange & \\emojiblank & -- & Unspecified \\\\\n", + "StarCoder & \\emojiblank & \\citet{li2023starcoder} & BigScience OpenRAIL-M \\\\\n", + "Tasksource Ins. 
& \\redcross & \\citet{sileo2023tasksource} & Various \\\\\n", + "Tasksource ST & \\emojiblank & \\citet{weston2015aicomplete} & Various \\\\\n", + "TinyStories & \\redcross & \\citet{eldan2023tinystories} & CDLA Sharing 1.0 \\\\\n", + "Tool-Llama & \\redcross & \\citet{qin2023toolllm} & CC BY-NC 4.0 \\\\\n", + "UltraChat & \\redcross & \\citet{ding2023enhancing} & CC BY-NC 4.0 \\\\\n", + "Unnatural Instr. & \\redcross & \\citet{honovich2022unnatural} & MIT License \\\\\n", + "WebGPT & \\redcross & \\citet{nakano2021webgpt} & Apache License 2.0, CC BY-SA 4.0 \\\\\n", + "xP3x & \\emojiblank & \\citet{muennighoff2022crosslingual} & Various \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\\end{table*}\n", + "\n" + ] + } + ], + "source": [ + "kwargs = {\n", + " 'environment': 'table*',\n", + " 'label': 'tab:licenses',\n", + " 'position_float': 'centering',\n", + " 'column_format': 'l|clp{5.5cm}',\n", + " \n", + " 'caption': r'''\n", + " \\textbf{Licenses and citations} for the dataset collections presented in this paper. Collections containing material under more than three distinct licenses are marked as having ``Various'' licenses, and we refer readers to our raw data for the full details. 
Datasets which were generated with the use of OpenAI APIs and implicate those APIs' noncompetition restrictions are marked with a red X (\\redcross) in the ``OpenAI NC'' column.\n", + " '''.strip(),\n", + " \n", + " 'hrules': True,\n", + " 'convert_css': True,\n", + "}\n", + "\n", + "latex = license_table \\\n", + " .sort_index() \\\n", + " .style \\\n", + " .to_latex(**kwargs)\n", + "\n", + "print(latex)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa945fa4-75d7-4b50-aeb5-52e126e219e7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "DPC", + "language": "python", + "name": "pdc" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/summary-tables/hf_downloads.csv b/src/summary-tables/hf_downloads.csv new file mode 100644 index 00000000..bfc9a7b1 --- /dev/null +++ b/src/summary-tables/hf_downloads.csv @@ -0,0 +1,44 @@ +Collection sum HF Downloads (October 2023) +Airoboros 1134 +Alpaca 99653 +Anthropic HH-RLHF 82242 +Baize Chat Data 868 +Book Summaries 603 +Camel-AI Science 59 +CoT Collection 374 +Code Alpaca 4575 +CommitPackFT 4085 +Dolly 15k 10115614 +Flan Collection (Chain-of-Thought) 1230 +Flan Collection (Dialog) 235 +Flan Collection (Flan 2021) 704 +Flan Collection (P3) 14492 +Flan Collection (Super-NaturalInstructions) 2120 +GPT-4-Alpaca 1052 +GPTeacher 132 +Gorilla 101 +HC3 (Chinese) 89 +HC3 (English) 2014 +Joke Explanation 52 +LIMA 3035 +Longform 3454 +NomicAI GPT4AllJ 635 +OIG 395 +Open Assistant 13518 +Open Assistant OctoPack 36 +Open Orca 27963 +OpenAI (Summarize from 
Feedback) 13501 +OpenAI (WebGPT) 1380 +Self-Instruct 3478 +ShareGPT Vicuna 616 +Stack Exchange Instruction 547 +Stanford Human Preferences 4309 +StarCoder Self-Instruct 304 +Tasksource Instruct 11 +Tasksource Symbol-Tuning 197 +Tiny Stories 12260 +UltraChat 2031 +Unnatural Instructions 16 +WizardLM Evol-Instruct 1434 +WizardLM Evol-Instruct V2 631 +xP3x 965