diff --git a/code/README.md b/code/README.md index 9ddec4f..d13fa44 100644 --- a/code/README.md +++ b/code/README.md @@ -4,4 +4,5 @@ This folder contains notebooks to replicate analyses and figures (`_rep` denotes 2. `results(_rep).ipynb` - code to run all non-statistical analyses and generate all data figures 3. `r_code(_rep).ipynb` - code to run statistical tests in R 4. `stats_results(_rep).ipynb` - code to generate all statistical results -5. `meta-analysis.ipynb` - code to carry out the meta analysis +5. `meta-analysis.ipynb` - code to carry out the large scale analysis +6. `metafor.ipynb` - code to carry out the meta-analysis in R diff --git a/code/meta-analysis.ipynb b/code/meta-analysis.ipynb index 144c1b1..d6be9b3 100644 --- a/code/meta-analysis.ipynb +++ b/code/meta-analysis.ipynb @@ -50,6 +50,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -83,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -214,11 +215,12 @@ " sentence_dfs[file_name] = df\n", "\n", " df_results = pd.DataFrame(results_dict).T\n", - "\n", + " \n", " return df_results, sentence_dfs" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -235,7 +237,7 @@ "text/markdown": [ "**Past keywords and phrases:**\n", "\n", - "elapsed | yesterday | had | last semester | made | previously | did | last quarter | bygone | once upon a time | last year | heretofore | recently | terminated | used to be | then | earlier | since | final | hitherto | back when | last season | used to | before | historically | ago | thus far | antiquity | last week | concluded | formerly | last month | yesteryear | olden days | said | so far | were | to date | up to now | was | ceased | already | once | last time | wrote | in the past | long ago | expired | in those days | last night" + "final | so far | used to be | to date | had | made | last night | long ago | already | 
last season | concluded | were | once | previously | last month | ceased | earlier | in the past | before | said | up to now | heretofore | last year | wrote | terminated | last semester | yesteryear | was | antiquity | last time | since | in those days | did | thus far | back when | last quarter | ago | formerly | elapsed | olden days | yesterday | recently | once upon a time | then | expired | hitherto | used to | historically | last week | bygone" ], "text/plain": [ "" @@ -252,6 +254,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -268,7 +271,7 @@ "text/markdown": [ "**Future keywords and phrases:**\n", "\n", - "later | next season | next quarter | next week | forthcoming | eventually | succeeding | might | upcoming | in the works | predicted | imminently | prospective | to be | futuristic | next year | could | after | tomorrow | in the future | shall | next time | soon | intend to | in time | shortly | on the horizon | subsequent | next month | anticipated | going to | can | some day | may | looming | will | impending | later on | eventual | down the line | scheduled to | next semester | plan to | in the cards | hereafter" + "to be | prospective | futuristic | next time | tomorrow | on the horizon | imminently | next quarter | forthcoming | soon | next year | next season | could | subsequent | impending | can | down the line | in time | eventual | later on | going to | predicted | may | in the future | some day | might | succeeding | anticipated | shall | next week | looming | scheduled to | later | in the cards | intend to | eventually | hereafter | upcoming | after | will | next semester | shortly | in the works | next month | plan to" ], "text/plain": [ "" @@ -285,6 +288,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -592,6 +596,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -629,7 +634,7 @@ }, { "cell_type": "code", - 
"execution_count": 8, + "execution_count": 115, "metadata": {}, "outputs": [ { @@ -654,11 +659,13 @@ "source": [ "datadir = Path.cwd().parent.joinpath('data')\n", "results = []\n", + "sentence = []\n", "\n", "# should we just download the already-completed results or compute them from scratch?\n", "force_rerun = False\n", "\n", "for i, row in data.iterrows():\n", + "# for i, row in data[9:10].iterrows():\n", " print('Processing dataset: ' + row['Dataset'])\n", " results_fname = datadir.joinpath(row['Short name'].lower() + '_results.pkl')\n", " if not results_fname.exists():\n", @@ -679,14 +686,15 @@ " f.write(x.content)\n", " \n", " with open(results_fname, 'rb') as f:\n", - " next_results, _ = pickle.load(f)\n", + " next_results, sentence_dfs = pickle.load(f)\n", " \n", - " next_results = next_results.reset_index().rename(columns={\"index\": \"filename\"}).melt(id_vars=[\"filename\"], var_name=\"tense\", value_name=\"count\")\n", + " next_results = next_results.reset_index().rename(columns={\"index\": \"filename\"}).melt(id_vars=[\"filename\"], \n", + " var_name=\"tense\", value_name=\"count\")\n", " next_results['proportion'] = next_results['count'] / next_results.groupby('filename')['count'].transform('sum')\n", " next_results['Dataset'] = row['Short name']\n", "\n", " results.append(next_results)\n", - "\n", + " sentence.append(sentence_dfs)\n", "results = pd.concat(results)" ] }, @@ -694,184 +702,247 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Load in manual counts for *The Chair*" + "## the Chair" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "d = 0\n", + "\n", + "for key in sentence[d].keys():\n", + " res = {}\n", + " # res['dataset'] = data['Short name'][d]\n", + " # res['n_files'] = len(sentence[d])\n", + " res['episode'] = key\n", + " res['n_sentences'] = len(sentence[d][key])\n", + " res['n_past_refs_c'] = sentence[d][key]['past'].astype(bool).sum()\n", 
+ " res['n_future_refs_c'] = sentence[d][key]['future'].astype(bool).sum()\n", + " print(res)\n", + " results.append(res)\n", + " \n", + "# pd.DataFrame(results).to_csv(\"../data/the_chair/the_chair_auto_reference_counts.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## all datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = []\n", + "\n", + "for d in range(len(sentence)):\n", + " res = {}\n", + " res['dataset'] = data_filter['Short name'][d]\n", + " res['n_files'] = len(sentence[d])\n", + " res['n_sentences'] = sum([len(sentence[d][key]) for key in sentence[d].keys()])\n", + " res['n_past_refs_c'] = sum([sentence[d][key]['past'].astype(bool).sum() for key in sentence[d].keys()])\n", + " res['n_future_refs_c'] = sum([sentence[d][key]['future'].astype(bool).sum() for key in sentence[d].keys()])\n", + " results.append(res)\n", + " \n", + "# pd.DataFrame(results).to_csv(\"../data/ref_counts_summary.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# figure S12" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bokeh.palettes import Category20c" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "auto = pd.read_csv(\"../data/the_chair/the_chair_auto_reference_counts.csv\")\n", + "manual = pd.read_csv(\"../data/the_chair/the_chair_manual_reference_counts.csv\")\n", + "\n", + "auto['auto_ratio'] = auto['Past']/auto['Future']\n", + "manual['manual_ratio'] = manual['Past']/manual['Future']\n", + "auto_long = auto[['Episode','Past','Future']].melt(var_name='Direction', value_name='auto_count', id_vars=['Episode'])\n", + "manual_long = manual[['Episode','Past','Future']].melt(var_name='Direction', value_name='manual_count', id_vars=['Episode'])\n", + 
"auto_long['auto_proportion'] = auto_long['auto_count'] / auto_long.groupby('Episode')['auto_count'].transform('sum')\n", + "manual_long['manual_proportion'] = manual_long['manual_count'] / manual_long.groupby('Episode')['manual_count'].transform('sum')\n", + "\n", + "count_all_long = manual_long.merge(auto_long, on=['Episode','Direction'])\n", + "count_all = manual.merge(auto, on=['Episode'])\n", + "count_all['manual_prop'] = count_all['Past_x']/(count_all['Past_x']+count_all['Future_x'])\n", + "count_all['auto_prop'] = count_all['Past_y']/(count_all['Past_y']+count_all['Future_y'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Episodetensecountproportion
01Past600.769231
11Future180.230769
22Past300.681818
32Future140.318182
43Past430.565789
53Future330.434211
64Past310.596154
74Future210.403846
85Past360.765957
95Future110.234043
106Past270.692308
116Future120.307692
\n", - "
" + "\n", + " function displayChart(vegaEmbed) {\n", + " vegaEmbed(outputDiv, spec, embedOpt)\n", + " .catch(err => showError(`Javascript Error: ${err.message}
This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n", + " }\n", + "\n", + " if(typeof define === \"function\" && define.amd) {\n", + " requirejs.config({paths});\n", + " require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n", + " } else {\n", + " maybeLoadScript(\"vega\", \"5\")\n", + " .then(() => maybeLoadScript(\"vega-lite\", \"4.17.0\"))\n", + " .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n", + " .catch(showError)\n", + " .then(() => displayChart(vegaEmbed));\n", + " }\n", + " })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}, \"axis\": {\"labelFontSize\": 14, \"labelFontWeight\": \"normal\", \"titleFontSize\": 14, \"titleFontWeight\": \"normal\"}, \"concat\": {\"spacing\": 50}, \"legend\": {}, \"title\": {\"anchor\": \"start\", \"fontSize\": 20}}, \"hconcat\": [{\"data\": {\"name\": \"data-8f38ffeb03340ce2de25c70f02a2cafa\"}, \"facet\": {\"column\": {\"field\": \"Direction\", \"header\": {\"labelFontSize\": 14, \"labelOrient\": \"top\", \"titleFontSize\": 14, \"titleFontWeight\": \"normal\", \"titlePadding\": 0}, \"sort\": \"descending\", \"title\": \"\", \"type\": \"nominal\"}}, \"spec\": {\"layer\": [{\"mark\": {\"type\": \"bar\", \"color\": \"#bdbdbd\"}, \"encoding\": {\"x\": {\"field\": \"Episode\", \"type\": \"ordinal\"}, \"y\": {\"field\": \"auto_count\", \"title\": \"Number of references\", \"type\": \"quantitative\"}}, \"width\": {\"step\": 30}}, {\"mark\": {\"type\": \"tick\", \"color\": \"#8C6238\", \"thickness\": 2}, \"encoding\": {\"x\": {\"field\": \"Episode\", \"type\": \"ordinal\"}, \"y\": {\"field\": \"manual_count\", \"type\": \"quantitative\"}}}]}, \"title\": \"A\"}, {\"layer\": [{\"mark\": {\"type\": \"bar\", \"color\": \"#bdbdbd\"}, \"encoding\": {\"x\": {\"field\": \"Episode\", \"type\": \"ordinal\"}, \"y\": {\"axis\": {\"format\": \"%\"}, \"field\": \"auto_prop\", \"scale\": {\"domain\": 
[0, 1]}, \"title\": \"Past / (Past + Future) %\", \"type\": \"quantitative\"}}, \"width\": {\"step\": 30}}, {\"mark\": {\"type\": \"tick\", \"color\": \"#8C6238\", \"thickness\": 2}, \"encoding\": {\"x\": {\"field\": \"Episode\", \"type\": \"ordinal\"}, \"y\": {\"axis\": {\"format\": \"%\"}, \"field\": \"manual_prop\", \"scale\": {\"domain\": [0, 1]}, \"type\": \"quantitative\"}}}], \"data\": {\"name\": \"data-723fe09044526ebf24326bdcf66425ba\"}, \"title\": \"B\"}, {\"layer\": [{\"mark\": {\"type\": \"bar\", \"color\": \"#bdbdbd\"}, \"encoding\": {\"x\": {\"field\": \"Episode\", \"type\": \"ordinal\"}, \"y\": {\"field\": \"auto_ratio\", \"scale\": {\"base\": 2, \"domain\": [1, 6], \"type\": \"log\"}, \"title\": \"Past / Future Ratio (log scale)\", \"type\": \"quantitative\"}}, \"width\": {\"step\": 30}}, {\"mark\": {\"type\": \"tick\", \"color\": \"#8C6238\", \"thickness\": 2}, \"encoding\": {\"x\": {\"field\": \"Episode\", \"type\": \"ordinal\"}, \"y\": {\"field\": \"manual_ratio\", \"scale\": {\"base\": 2, \"domain\": [1, 6], \"type\": \"log\"}, \"type\": \"quantitative\"}}}], \"data\": {\"name\": \"data-723fe09044526ebf24326bdcf66425ba\"}, \"title\": \"C\"}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.17.0.json\", \"datasets\": {\"data-8f38ffeb03340ce2de25c70f02a2cafa\": [{\"Episode\": 1, \"Direction\": \"Past\", \"manual_count\": 60, \"manual_proportion\": 0.759493670886076, \"auto_count\": 112, \"auto_proportion\": 0.6021505376344086}, {\"Episode\": 2, \"Direction\": \"Past\", \"manual_count\": 29, \"manual_proportion\": 0.6744186046511628, \"auto_count\": 80, \"auto_proportion\": 0.5555555555555556}, {\"Episode\": 3, \"Direction\": \"Past\", \"manual_count\": 43, \"manual_proportion\": 0.5657894736842105, \"auto_count\": 123, \"auto_proportion\": 0.6}, {\"Episode\": 4, \"Direction\": \"Past\", \"manual_count\": 30, \"manual_proportion\": 0.6, \"auto_count\": 108, \"auto_proportion\": 0.5901639344262295}, {\"Episode\": 5, \"Direction\": 
\"Past\", \"manual_count\": 37, \"manual_proportion\": 0.7872340425531915, \"auto_count\": 116, \"auto_proportion\": 0.6373626373626373}, {\"Episode\": 6, \"Direction\": \"Past\", \"manual_count\": 27, \"manual_proportion\": 0.6923076923076923, \"auto_count\": 121, \"auto_proportion\": 0.55}, {\"Episode\": 1, \"Direction\": \"Future\", \"manual_count\": 19, \"manual_proportion\": 0.24050632911392406, \"auto_count\": 74, \"auto_proportion\": 0.3978494623655914}, {\"Episode\": 2, \"Direction\": \"Future\", \"manual_count\": 14, \"manual_proportion\": 0.32558139534883723, \"auto_count\": 64, \"auto_proportion\": 0.4444444444444444}, {\"Episode\": 3, \"Direction\": \"Future\", \"manual_count\": 33, \"manual_proportion\": 0.4342105263157895, \"auto_count\": 82, \"auto_proportion\": 0.4}, {\"Episode\": 4, \"Direction\": \"Future\", \"manual_count\": 20, \"manual_proportion\": 0.4, \"auto_count\": 75, \"auto_proportion\": 0.4098360655737705}, {\"Episode\": 5, \"Direction\": \"Future\", \"manual_count\": 10, \"manual_proportion\": 0.2127659574468085, \"auto_count\": 66, \"auto_proportion\": 0.3626373626373626}, {\"Episode\": 6, \"Direction\": \"Future\", \"manual_count\": 12, \"manual_proportion\": 0.3076923076923077, \"auto_count\": 99, \"auto_proportion\": 0.45}], \"data-723fe09044526ebf24326bdcf66425ba\": [{\"Episode\": 1, \"Past_x\": 60, \"Future_x\": 19, \"manual_ratio\": 3.1578947368421053, \"Total\": 457, \"Past_y\": 112, \"Future_y\": 74, \"auto_ratio\": 1.5135135135135136, \"manual_prop\": 0.759493670886076, \"auto_prop\": 0.6021505376344086}, {\"Episode\": 2, \"Past_x\": 29, \"Future_x\": 14, \"manual_ratio\": 2.0714285714285716, \"Total\": 501, \"Past_y\": 80, \"Future_y\": 64, \"auto_ratio\": 1.25, \"manual_prop\": 0.6744186046511628, \"auto_prop\": 0.5555555555555556}, {\"Episode\": 3, \"Past_x\": 43, \"Future_x\": 33, \"manual_ratio\": 1.303030303030303, \"Total\": 518, \"Past_y\": 123, \"Future_y\": 82, \"auto_ratio\": 1.5, \"manual_prop\": 
0.5657894736842105, \"auto_prop\": 0.6}, {\"Episode\": 4, \"Past_x\": 30, \"Future_x\": 20, \"manual_ratio\": 1.5, \"Total\": 442, \"Past_y\": 108, \"Future_y\": 75, \"auto_ratio\": 1.44, \"manual_prop\": 0.6, \"auto_prop\": 0.5901639344262295}, {\"Episode\": 5, \"Past_x\": 37, \"Future_x\": 10, \"manual_ratio\": 3.7, \"Total\": 508, \"Past_y\": 116, \"Future_y\": 66, \"auto_ratio\": 1.7575757575757576, \"manual_prop\": 0.7872340425531915, \"auto_prop\": 0.6373626373626373}, {\"Episode\": 6, \"Past_x\": 27, \"Future_x\": 12, \"manual_ratio\": 2.25, \"Total\": 474, \"Past_y\": 121, \"Future_y\": 99, \"auto_ratio\": 1.2222222222222223, \"manual_prop\": 0.6923076923076923, \"auto_prop\": 0.55}]}}, {\"mode\": \"vega-lite\"});\n", + "" ], "text/plain": [ - " Episode tense count proportion\n", - "0 1 Past 60 0.769231\n", - "1 1 Future 18 0.230769\n", - "2 2 Past 30 0.681818\n", - "3 2 Future 14 0.318182\n", - "4 3 Past 43 0.565789\n", - "5 3 Future 33 0.434211\n", - "6 4 Past 31 0.596154\n", - "7 4 Future 21 0.403846\n", - "8 5 Past 36 0.765957\n", - "9 5 Future 11 0.234043\n", - "10 6 Past 27 0.692308\n", - "11 6 Future 12 0.307692" + "alt.HConcatChart(...)" ] }, - "execution_count": 9, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "# make sure the chair dataset has been downloaded\n", - "chair_url = data.query('Dataset == \"The Chair\"')['Data URL'].values[0]\n", - "chair_datadir = datadir.joinpath(get_folder_name(chair_url))\n", - "if not (chair_datadir.exists() and len(lsdir(str(chair_datadir.joinpath('*.txt')))) >= 5):\n", - " download_dataset(chair_url, datadir)\n", - "\n", - "# fill in proportions for manual reference counts\n", - "ref_fname = str(Path.cwd().parent.joinpath('data', 'the_chair', 'the_chair_manual_reference_counts.csv'))\n", - "manual = pd.read_csv(ref_fname)\n", - "manual['Total'] = manual['Past'] + manual['Future']\n", - "\n", - "manual.reset_index(inplace=True)\n", - "manual['Episode'] = 
manual['index'] + 1\n", - "manual.drop(['index', 'Total'], axis=1, inplace=True)\n", - "\n", - "manual = manual.melt(var_name='tense', value_name='count', id_vars=['Episode'])\n", - "manual.sort_values(['Episode'], inplace=True)\n", - "manual.reset_index(inplace=True, drop=True)\n", - "manual['proportion'] = manual['count'] / manual.groupby('Episode')['count'].transform('sum')\n", - "manual" + "bar_count = alt.Chart().mark_bar(color=Category20c[20][18]).encode(\n", + " x='Episode:O',\n", + " y=alt.Y('auto_count', title=\"Number of references\"),\n", + " # column='direction'\n", + ").properties(\n", + " width=alt.Step(30) # controls width of bar.\n", + ")\n", + "tick_count = alt.Chart().mark_tick(color='#8C6238', thickness=2,).encode(\n", + " x='Episode:O',\n", + " y='manual_count',\n", + " # column='Direction',\n", + ")\n", + "count_plot = alt.layer(bar, tick, data=count_all_long).facet(column=alt.Column('Direction', sort=\"descending\", title='', header=alt.Header(labelOrient='top', titleFontSize=14, labelFontSize=14, titleFontWeight='normal', titlePadding=0))).properties(title='A')\n", + "\n", + "bar_prop = alt.Chart(count_all).mark_bar(color=Category20c[20][18]).encode(\n", + " x='Episode:O',\n", + " y=alt.Y('auto_prop', scale=alt.Scale(domain=[0,1]), axis=alt.Axis(format='%'), title=\"Past / (Past + Future) %\"),\n", + ").properties(\n", + " width=alt.Step(30) # controls width of bar.\n", + ")\n", + "\n", + "tick_prop = alt.Chart(count_all).mark_tick(color='#8C6238', thickness=2,).encode(\n", + " x='Episode:O',\n", + " y=alt.Y('manual_prop', scale=alt.Scale(domain=[0,1]), axis=alt.Axis(format='%')),\n", + ")\n", + "prop_plot = (bar_prop+tick_prop).properties(title='B')\n", + "bar_ratio = alt.Chart(count_all).mark_bar(color=Category20c[20][18]).encode(\n", + " x='Episode:O',\n", + " y=alt.Y('auto_ratio', scale=alt.Scale(domain=[1,6], type=\"log\", base=2), title=\"Past / Future Ratio (log scale)\"),\n", + ").properties(\n", + " width=alt.Step(30) # controls 
width of bar.\n", + ")\n", + "\n", + "tick_ratio = alt.Chart(count_all).mark_tick(color='#8C6238', thickness=2,).encode(\n", + " x='Episode:O',\n", + " y=alt.Y('manual_ratio', scale=alt.Scale(domain=[1,6], type=\"log\", base=2)),\n", + ")\n", + "\n", + "ratio_plot = (bar_ratio+tick_ratio).properties(title='C')\n", + "\n", + "(count_plot | prop_plot | ratio_plot\n", + ").configure_legend(\n", + "\n", + ").configure_axis(\n", + " titleFontSize=14,\n", + " labelFontSize=14,\n", + " titleFontWeight='normal',\n", + " labelFontWeight='normal',\n", + ").configure_concat(\n", + " spacing=50\n", + ").configure_title(\n", + " fontSize=20,\n", + " anchor='start',\n", + "# offset=20\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Load automatically identified counts from *The Chair*" + "# stats" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.stats import chi2_contingency" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [ { @@ -895,602 +966,462 @@ " \n", " \n", " \n", - " tense\n", - " count\n", - " Episode\n", - " proportion\n", + " dataset\n", + " type\n", + " source\n", + " full\n", + " non-empty\n", + " is_equal\n", + " past\n", + " future\n", + " total\n", + " corrected_past\n", + " corrected_future\n", + " past_prop\n", + " future_prop\n", + " RR\n", + " non_past\n", + " non_future\n", " \n", " \n", " \n", " \n", " 0\n", - " Past\n", - " 152\n", - " 1\n", - " 0.575758\n", + " IMSDb\n", + " Scripted\n", + " NaN\n", + " 1091\n", + " 1091\n", + " True\n", + " 833026\n", + " 472519\n", + " 3080674\n", + " 657475\n", + " 316525\n", + " 0.213419\n", + " 0.102745\n", + " 2.077166\n", + " 2423199\n", + " 2764149\n", " \n", " \n", " 1\n", - " Future\n", - " 112\n", - " 1\n", - " 0.424242\n", + " Movies\n", + " Scripted\n", + " ConvoKit\n", + " 304713\n", + " 304446\n", + " False\n", + " 
179729\n", + " 129622\n", + " 516163\n", + " 127744\n", + " 85937\n", + " 0.247488\n", + " 0.166492\n", + " 1.486484\n", + " 388419\n", + " 430226\n", " \n", " \n", " 2\n", - " Past\n", - " 108\n", - " 2\n", - " 0.529412\n", + " Switchboard\n", + " Spontaneous\n", + " ConvoKit\n", + " 122646\n", + " 122646\n", + " True\n", + " 62464\n", + " 32372\n", + " 245461\n", + " 41488\n", + " 22079\n", + " 0.169021\n", + " 0.089949\n", + " 1.879071\n", + " 203973\n", + " 223382\n", " \n", " \n", " 3\n", - " Future\n", - " 96\n", - " 2\n", - " 0.470588\n", + " SCOTUS\n", + " Constrained\n", + " ConvoKit\n", + " 1700789\n", + " 1700789\n", + " True\n", + " 3089509\n", + " 1802239\n", + " 3880259\n", + " 1963578\n", + " 1207377\n", + " 0.506043\n", + " 0.311159\n", + " 1.626317\n", + " 1916681\n", + " 2672882\n", " \n", " \n", " 4\n", - " Past\n", - " 177\n", - " 3\n", - " 0.608247\n", + " Tennis\n", + " Constrained\n", + " ConvoKit\n", + " 163948\n", + " 163948\n", + " True\n", + " 448444\n", + " 193802\n", + " 599172\n", + " 281669\n", + " 134638\n", + " 0.470097\n", + " 0.224707\n", + " 2.092047\n", + " 317503\n", + " 464534\n", " \n", " \n", " 5\n", - " Future\n", - " 114\n", - " 3\n", - " 0.391753\n", + " PfG\n", + " Constrained\n", + " ConvoKit\n", + " 20932\n", + " 20932\n", + " True\n", + " 9695\n", + " 15520\n", + " 37184\n", + " 7408\n", + " 9771\n", + " 0.199225\n", + " 0.262774\n", + " 0.758162\n", + " 29776\n", + " 27413\n", " \n", " \n", " 6\n", - " Past\n", - " 148\n", - " 4\n", - " 0.594378\n", + " IQ2\n", + " Constrained\n", + " ConvoKit\n", + " 26562\n", + " 26317\n", + " False\n", + " 67626\n", + " 51780\n", + " 122925\n", + " 46630\n", + " 34811\n", + " 0.379337\n", + " 0.283189\n", + " 1.339519\n", + " 76295\n", + " 88114\n", " \n", " \n", " 7\n", - " Future\n", - " 101\n", - " 4\n", - " 0.405622\n", + " GAP\n", + " Constrained\n", + " ConvoKit\n", + " 8009\n", + " 8009\n", + " True\n", + " 2739\n", + " 1958\n", + " 8009\n", + " 1800\n", + " 1338\n", + " 
0.224747\n", + " 0.167062\n", + " 1.345291\n", + " 6209\n", + " 6671\n", " \n", " \n", " 8\n", - " Past\n", - " 164\n", - " 5\n", - " 0.616541\n", + " Chair\n", + " Scripted\n", + " NaN\n", + " 6\n", + " 6\n", + " True\n", + " 909\n", + " 663\n", + " 2900\n", + " 660\n", + " 460\n", + " 0.227586\n", + " 0.158621\n", + " 1.434783\n", + " 2240\n", + " 2440\n", " \n", " \n", " 9\n", - " Future\n", - " 102\n", - " 5\n", - " 0.383459\n", + " Friends\n", + " Scripted\n", + " ConvoKit\n", + " 67373\n", + " 61310\n", + " False\n", + " 32105\n", + " 23931\n", + " 107082\n", + " 22067\n", + " 16356\n", + " 0.206076\n", + " 0.152743\n", + " 1.349169\n", + " 85015\n", + " 90726\n", " \n", " \n", " 10\n", - " Past\n", - " 160\n", - " 6\n", - " 0.536913\n", + " Gutenberg\n", + " Scripted\n", + " NaN\n", + " 14773741\n", + " 14773741\n", + " True\n", + " 14617983\n", + " 13714226\n", + " 29119393\n", + " 10234952\n", + " 8672030\n", + " 0.351482\n", + " 0.297809\n", + " 1.180226\n", + " 18884441\n", + " 20447363\n", " \n", " \n", " 11\n", - " Future\n", - " 138\n", - " 6\n", - " 0.463087\n", + " Reddit\n", + " Constrained\n", + " ConvoKit\n", + " 74468\n", + " 72985\n", + " False\n", + " 120512\n", + " 105127\n", + " 217924\n", + " 86513\n", + " 66700\n", + " 0.396987\n", + " 0.306070\n", + " 1.297046\n", + " 131411\n", + " 151224\n", " \n", " \n", "\n", "" ], "text/plain": [ - " tense count Episode proportion\n", - "0 Past 152 1 0.575758\n", - "1 Future 112 1 0.424242\n", - "2 Past 108 2 0.529412\n", - "3 Future 96 2 0.470588\n", - "4 Past 177 3 0.608247\n", - "5 Future 114 3 0.391753\n", - "6 Past 148 4 0.594378\n", - "7 Future 101 4 0.405622\n", - "8 Past 164 5 0.616541\n", - "9 Future 102 5 0.383459\n", - "10 Past 160 6 0.536913\n", - "11 Future 138 6 0.463087" + " dataset type source full non-empty is_equal \\\n", + "0 IMSDb Scripted NaN 1091 1091 True \n", + "1 Movies Scripted ConvoKit 304713 304446 False \n", + "2 Switchboard Spontaneous ConvoKit 122646 122646 True \n", + 
"3 SCOTUS Constrained ConvoKit 1700789 1700789 True \n", + "4 Tennis Constrained ConvoKit 163948 163948 True \n", + "5 PfG Constrained ConvoKit 20932 20932 True \n", + "6 IQ2 Constrained ConvoKit 26562 26317 False \n", + "7 GAP Constrained ConvoKit 8009 8009 True \n", + "8 Chair Scripted NaN 6 6 True \n", + "9 Friends Scripted ConvoKit 67373 61310 False \n", + "10 Gutenberg Scripted NaN 14773741 14773741 True \n", + "11 Reddit Constrained ConvoKit 74468 72985 False \n", + "\n", + " past future total corrected_past corrected_future past_prop \\\n", + "0 833026 472519 3080674 657475 316525 0.213419 \n", + "1 179729 129622 516163 127744 85937 0.247488 \n", + "2 62464 32372 245461 41488 22079 0.169021 \n", + "3 3089509 1802239 3880259 1963578 1207377 0.506043 \n", + "4 448444 193802 599172 281669 134638 0.470097 \n", + "5 9695 15520 37184 7408 9771 0.199225 \n", + "6 67626 51780 122925 46630 34811 0.379337 \n", + "7 2739 1958 8009 1800 1338 0.224747 \n", + "8 909 663 2900 660 460 0.227586 \n", + "9 32105 23931 107082 22067 16356 0.206076 \n", + "10 14617983 13714226 29119393 10234952 8672030 0.351482 \n", + "11 120512 105127 217924 86513 66700 0.396987 \n", + "\n", + " future_prop RR non_past non_future \n", + "0 0.102745 2.077166 2423199 2764149 \n", + "1 0.166492 1.486484 388419 430226 \n", + "2 0.089949 1.879071 203973 223382 \n", + "3 0.311159 1.626317 1916681 2672882 \n", + "4 0.224707 2.092047 317503 464534 \n", + "5 0.262774 0.758162 29776 27413 \n", + "6 0.283189 1.339519 76295 88114 \n", + "7 0.167062 1.345291 6209 6671 \n", + "8 0.158621 1.434783 2240 2440 \n", + "9 0.152743 1.349169 85015 90726 \n", + "10 0.297809 1.180226 18884441 20447363 \n", + "11 0.306070 1.297046 131411 151224 " ] }, - "execution_count": 10, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "chair_fname = str(Path.cwd().parent.joinpath('data', 'chair_results.pkl'))\n", - "with open(chair_fname, 'rb') as f:\n", - " chair_results, _ = 
pickle.load(f)\n", - "\n", - "auto = chair_results.reset_index().rename(columns={\"index\": \"filename\"}).melt(id_vars=[\"filename\"], var_name=\"tense\", value_name=\"count\")\n", - "auto['Episode'] = auto['filename'].apply(lambda filename: int(filename.split('_')[2][3]))\n", - "auto['proportion'] = auto['count'] / auto.groupby('Episode')['count'].transform('sum')\n", - "auto.sort_values(by=['Episode'], inplace=True)\n", - "auto.drop(columns=['filename'], inplace=True)\n", - "auto.reset_index(drop=True, inplace=True)\n", - "auto" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create meta-analysis figure\n", - "\n", - "- Panel A: Numbers of (manually and automatically detected) past and future events from each episode of *The Chair*, season 1.\n", - "- Panel B: Proportions of (manually and automatically detected) past and future events from each episode of *The Chair*, season 1.\n", - "- Panel C: Proportions of automatically detected past and future events from each dataset" + "df = pd.read_csv(\"ref_counts_summary.csv\")\n", + "df" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { "data": { - "image/png": "", "text/plain": [ - "
" + "24040006" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], "source": [ - "fig, axes = plt.subplots(figsize=(16, 4), ncols=3)\n", - "\n", - "# panel A\n", - "sns.barplot(data=manual, x='tense', y='count', hue='Episode', palette='viridis', ax=axes[0])\n", - "sns.barplot(data=auto, x='tense', y='count', hue='Episode', palette='viridis', alpha=0.5, ax=axes[0])\n", - "sns.barplot(data=auto, x='tense', y='count', hue='Episode', fill=False, edgecolor='k', linewidth=1.5, ax=axes[0])\n", - "axes[0].get_legend().remove()\n", - "\n", - "axes[0].set_xlabel('Tense', fontsize=14)\n", - "axes[0].set_ylabel('Number of events', fontsize=14)\n", - "sns.despine(top=True, right=True)\n", - "\n", - "\n", - "# panel B\n", - "sns.barplot(data=manual, x='tense', y='proportion', hue='Episode', palette='viridis', ax=axes[1])\n", - "sns.barplot(data=auto, x='tense', y='proportion', hue='Episode', palette='viridis', alpha=0.5, ax=axes[1])\n", - "sns.barplot(data=auto, x='tense', y='proportion', hue='Episode', fill=False, edgecolor='k', linewidth=1.5, ax=axes[1])\n", - "handles, labels = axes[1].get_legend_handles_labels()\n", - "axes[1].legend(loc='upper right', title='Episode', handles=handles[:6], labels=labels[:6], frameon=True, framealpha=0.75, ncol=2, fontsize=8)\n", - "\n", - "axes[1].set_xlabel('Tense', fontsize=14)\n", - "axes[1].set_ylabel('Proportion of events', fontsize=14)\n", - "sns.despine(top=True, right=True)\n", - "\n", - "\n", - "# panel C\n", - "sns.barplot(results, x='tense', y='proportion', hue='Dataset', palette='Spectral', ax=axes[2])\n", - "axes[2].set_xlabel('Tense', fontsize=14)\n", - "axes[2].set_ylabel('Proportion of events', fontsize=14)\n", - "axes[2].legend(loc='lower left', title='Dataset', frameon=True, ncol=3, fontsize=8, facecolor='white', framealpha=0.75)\n", - "axes[2].set_ylim(axes[1].get_ylim())\n", - "sns.despine(top=True, right=True)\n", - "\n", - "\n", - "plt.tight_layout()\n", - 
"plt.savefig('meta-analysis.pdf', bbox_inches='tight')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Include some stats (to report in the main text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Total number of observations and words across all datasets" + "total = df['corrected_past'].sum() + df['corrected_future'].sum()\n", + "total" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "There are a total of 17264278 observations across all datasets.\n", - "There are a total of 443756731 words across all datasets.\n" - ] + "data": { + "text/plain": [ + "13471984" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "x = data.sum()\n", - "print(f\"There are a total of {x['Number of observations']} observations across all datasets.\")\n", - "print(f\"There are a total of {x['Number of words']} words across all datasets.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Proportion of past versus future events, across all documents (and total number of past + future events)" + "df['corrected_past'].sum()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Past events: 19464741 (54.06%)\n", - "Future events: 16543759 (45.94%)\n", - "Total events: 36008500\n" - ] + "data": { + "text/plain": [ + "0.5603985290186699" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "# print the numbers and proportions of past and future events, across all documents\n", - "\n", - "n_past = results.query('tense == \"Past\"')['count'].sum()\n", - "n_future = results.query('tense == \"Future\"')['count'].sum()\n", - "n_total = n_past + n_future\n", - "\n", - "print(f\"Past events: {n_past} ({n_past / 
n_total:.2%})\")\n", - "print(f\"Future events: {n_future} ({n_future / n_total:.2%})\")\n", - "print(f\"Total events: {n_total}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "T-test comparing proportions of past vs. future events across datasets" + "df['corrected_past'].sum() / total" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Average proportion of past events: 58.99% ± 7.28%\n", - "t(11) = 4.28, p = 0.0013\n" - ] + "data": { + "text/plain": [ + "10568022" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "# average proportions of past events for each dataset\n", - "past_proportions = results.groupby('Dataset').apply(lambda x: x.query('tense == \"Past\"')['proportion'].mean()).sort_values(ascending=False)\n", - "x = ttest_1samp(past_proportions, 0.5)\n", - "print(f\"Average proportion of past events: {past_proportions.mean():.2%} ± {past_proportions.std():.2%}\")\n", - "print(f\"t({x.df}) = {x.statistic:.2f}, p = {x.pvalue:.4f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Effect sizes and confidence intervals (effect size = past refs / future refs)" + "df['corrected_future'].sum()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Average effect size: 1.45 ± 0.40\n", - "Median effect size: 1.44\n", - "95% CI: [0.82, 2.16]\n", - "Range: [0.69, 2.18]\n" - ] - }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tenseFuturePastEffect size
Dataset
Chair0.4231250.5768751.363366
Friends0.4169980.5830021.398092
GAP0.4153730.5846271.407473
Gutenberg0.4639620.5360381.155350
IMSDb0.3605410.6394591.773610
IQ20.4380840.5619161.282669
Movies0.4054140.5945861.466612
PfG0.5917780.4082220.689823
Reddit0.4036090.5963911.477646
SCOTUS0.3659240.6340761.732808
Switchboard0.3221300.6778702.104335
Tennis0.3145150.6854852.179502
\n", - "
" - ], "text/plain": [ - "tense Future Past Effect size\n", - "Dataset \n", - "Chair 0.423125 0.576875 1.363366\n", - "Friends 0.416998 0.583002 1.398092\n", - "GAP 0.415373 0.584627 1.407473\n", - "Gutenberg 0.463962 0.536038 1.155350\n", - "IMSDb 0.360541 0.639459 1.773610\n", - "IQ2 0.438084 0.561916 1.282669\n", - "Movies 0.405414 0.594586 1.466612\n", - "PfG 0.591778 0.408222 0.689823\n", - "Reddit 0.403609 0.596391 1.477646\n", - "SCOTUS 0.365924 0.634076 1.732808\n", - "Switchboard 0.322130 0.677870 2.104335\n", - "Tennis 0.314515 0.685485 2.179502" + "0.43960147098133" ] }, - "execution_count": 15, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "# Overall effect size\n", - "\n", - "# compute average numbers of past and future references per dataset\n", - "df = results.groupby(['Dataset', 'tense'])['proportion'].mean().unstack()\n", - "df['Effect size'] = df['Past'] / df['Future']\n", - "\n", - "# compute average of effect sizes\n", - "def average_effect_size(x):\n", - " return np.exp(np.mean(np.log(x)))\n", - "\n", - "def standard_error_effect_size(x):\n", - " return np.exp(np.std(np.log(x))) / np.sqrt(len(x) - 1)\n", - "\n", - "\n", - "# report average effect size, standard deviation, and 95% confidence interval across datasets\n", - "x = df['Effect size']\n", - "print(f\"Average effect size: {average_effect_size(x):.2f} ± {standard_error_effect_size(x):.2f}\")\n", - "print(f\"Median effect size: {x.median():.2f}\")\n", - "print(f\"95% CI: [{np.quantile(x, 0.025):.2f}, {np.quantile(x, 0.975):.2f}]\")\n", - "print(f\"Range: [{x.min():.2f}, {x.max():.2f}]\")\n", - "\n", - "df" + "df['corrected_future'].sum() / total" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "def print_effect_size_stats(x, n_bootstraps=1000):\n", - " effect_sizes = []\n", - "\n", - " for _ in range(n_bootstraps):\n", - " # take a random sample from the dataset (with 
replacement)\n", - " sample = x.sample(n=len(x), replace=True)\n", - "\n", - " # compute the effect size for the sample\n", - " n_past = sample.query('tense == \"Past\"')['count'].sum()\n", - " n_future = sample.query('tense == \"Future\"')['count'].sum()\n", - " effect_size = n_past / n_future\n", - "\n", - " effect_sizes.append(effect_size)\n", - " \n", - " effect_sizes = np.array(effect_sizes)\n", - " print(f'Average effect size: {average_effect_size(effect_sizes):.2f} ± {standard_error_effect_size(effect_sizes):.2f}')\n", - " print(f'Effect size boostrap-estimated 95% CI: [{np.percentile(effect_sizes, 2.5):.2f}, {np.percentile(effect_sizes, 97.5):.2f}]')\n", - " print(f'Estimated p-value: {2 * np.min([np.mean(effect_sizes > 1), np.mean(effect_sizes < 1)]):.4f}')" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "IMSDb: \n", - "Average effect size: 1.78 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.64, 1.92]\n", - "Estimated p-value: 0.0000\n", - "\n", - "Movies: \n", - "Average effect size: 1.39 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.37, 1.40]\n", - "Estimated p-value: 0.0000\n", - "\n", - "Switchboard: \n", - "Average effect size: 1.93 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.88, 1.96]\n", - "Estimated p-value: 0.0000\n", - "\n", - "SCOTUS: \n", - "Average effect size: 1.71 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.70, 1.72]\n", - "Estimated p-value: 0.0000\n", - "\n", - "Tennis: \n", - "Average effect size: 2.32 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [2.29, 2.34]\n", - "Estimated p-value: 0.0000\n", - "\n", - "PfG: \n", - "Average effect size: 0.63 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [0.61, 0.65]\n", - "Estimated p-value: 0.0000\n", - "\n", - "IQ2: \n", - "Average effect size: 1.31 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.26, 
1.37]\n", - "Estimated p-value: 0.0000\n", - "\n", - "GAP: \n", - "Average effect size: 1.40 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.29, 1.49]\n", - "Estimated p-value: 0.0000\n", - "\n", - "Chair: \n", - "Average effect size: 1.61 ± 0.18\n", - "Effect size boostrap-estimated 95% CI: [0.57, 4.21]\n", - "Estimated p-value: 0.4000\n", - "\n", - "Friends: \n", - "Average effect size: 1.34 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.31, 1.37]\n", - "Estimated p-value: 0.0000\n", - "\n", - "Gutenberg: \n", - "Average effect size: 1.07 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.06, 1.07]\n", - "Estimated p-value: 0.0000\n", - "\n", - "Reddit: \n", - "Average effect size: 1.15 ± 0.10\n", - "Effect size boostrap-estimated 95% CI: [1.12, 1.18]\n", - "Estimated p-value: 0.0000\n" + "IMSDb\n", + "0.0\n", + "Movies\n", + "0.0\n", + "Switchboard\n", + "0.0\n", + "SCOTUS\n", + "0.0\n", + "Tennis\n", + "0.0\n", + "PfG\n", + "7.651987244822489e-94\n", + "IQ2\n", + "0.0\n", + "GAP\n", + "4.415379561347529e-20\n", + "Chair\n", + "3.6004633077240275e-11\n", + "Friends\n", + "7.023120502011673e-227\n", + "Gutenberg\n", + "0.0\n", + "Reddit\n", + "0.0\n" ] } ], "source": [ - "debug = False # set to True to print out quicker estimates\n", - "if debug:\n", - " n_boostraps = 10\n", - "else:\n", - " n_boostraps = 100\n", - "\n", - "datasets = results['Dataset'].unique()\n", - "\n", - "for d in datasets:\n", - " past_proportions = results.query('Dataset == @d and tense == \"Past\"')['proportion']\n", - " future_proportions = results.query('Dataset == @d and tense == \"Future\"')['proportion']\n", - "\n", - " odds_ratios = past_proportions / future_proportions\n", - "\n", - " print(f\"\\n{d}: \")\n", - " print_effect_size_stats(results.query('Dataset == @d'), n_bootstraps=n_boostraps)" + "for i in df.iterrows():\n", + " print(i[1]['dataset'])\n", + " stat, p, dof, expected = chi2_contingency([[i[1]['corrected_past'], i[1]['non_past']], 
[i[1]['corrected_future'], i[1]['non_future']]])\n", + " print(p)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "prediction-retrodiction", + "display_name": "pr", "language": "python", - "name": "prediction-retrodiction" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1502,7 +1433,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/code/meta-analysis.pdf b/code/meta-analysis.pdf deleted file mode 100644 index 9794532..0000000 Binary files a/code/meta-analysis.pdf and /dev/null differ diff --git a/code/metafor.ipynb b/code/metafor.ipynb new file mode 100644 index 0000000..41604cb --- /dev/null +++ b/code/metafor.ipynb @@ -0,0 +1,430 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading required package: Matrix\n", + "\n", + "Loading required package: metadat\n", + "\n", + "Loading required package: numDeriv\n", + "\n", + "\n", + "Loading the 'metafor' package (version 4.4-0). 
For an\n", + "introduction to the package please type: help(metafor)\n", + "\n", + "\n", + "-- \u001b[1mAttaching core tidyverse packages\u001b[22m ------------------------ tidyverse 2.0.0 --\n", + "\u001b[32mv\u001b[39m \u001b[34mdplyr \u001b[39m 1.1.4 \u001b[32mv\u001b[39m \u001b[34mreadr \u001b[39m 2.1.4\n", + "\u001b[32mv\u001b[39m \u001b[34mforcats \u001b[39m 1.0.0 \u001b[32mv\u001b[39m \u001b[34mstringr \u001b[39m 1.5.1\n", + "\u001b[32mv\u001b[39m \u001b[34mggplot2 \u001b[39m 3.4.4 \u001b[32mv\u001b[39m \u001b[34mtibble \u001b[39m 3.2.1\n", + "\u001b[32mv\u001b[39m \u001b[34mlubridate\u001b[39m 1.9.3 \u001b[32mv\u001b[39m \u001b[34mtidyr \u001b[39m 1.3.0\n", + "\u001b[32mv\u001b[39m \u001b[34mpurrr \u001b[39m 1.0.2 \n", + "-- \u001b[1mConflicts\u001b[22m ------------------------------------------ tidyverse_conflicts() --\n", + "\u001b[31mx\u001b[39m \u001b[34mtidyr\u001b[39m::\u001b[32mexpand()\u001b[39m masks \u001b[34mMatrix\u001b[39m::expand()\n", + "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n", + "\u001b[31mx\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n", + "\u001b[31mx\u001b[39m \u001b[34mtidyr\u001b[39m::\u001b[32mpack()\u001b[39m masks \u001b[34mMatrix\u001b[39m::pack()\n", + "\u001b[31mx\u001b[39m \u001b[34mtidyr\u001b[39m::\u001b[32munpack()\u001b[39m masks \u001b[34mMatrix\u001b[39m::unpack()\n", + "\u001b[36mi\u001b[39m Use the conflicted package (\u001b[3m\u001b[34m\u001b[39m\u001b[23m) to force all conflicts to become errors\n" + ] + } + ], + "source": [ + "library(metafor)\n", + "library(tidyverse)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + 
"\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "
A data.frame: 12 x 16
X...datasettypesourcefullnon.emptyis_equalpastfuturetotalcorrected_pastcorrected_futurepast_propfuture_propRRnon_pastnon_future
<chr><chr><chr><int><int><lgl><int><int><int><int><int><dbl><dbl><dbl><int><int>
IMSDb Scripted 1091 1091 TRUE 833026 472519 3080674 657475 3165250.21341920.102745372.0771661 2423199 2764149
Movies Scripted ConvoKit 304713 304446FALSE 179729 129622 516163 127744 859370.24748770.166491981.4864843 388419 430226
SwitchboardSpontaneousConvoKit 122646 122646 TRUE 62464 32372 245461 41488 220790.16902070.089949121.8790706 203973 223382
SCOTUS ConstrainedConvoKit 1700789 1700789 TRUE 3089509 1802239 3880259 196357812073770.50604300.311158871.6263172 1916681 2672882
Tennis ConstrainedConvoKit 163948 163948 TRUE 448444 193802 599172 281669 1346380.47009710.224706762.0920468 317503 464534
PfG ConstrainedConvoKit 20932 20932 TRUE 9695 15520 37184 7408 97710.19922550.262774310.7581619 29776 27413
IQ2 ConstrainedConvoKit 26562 26317FALSE 67626 51780 122925 46630 348110.37933700.283188941.3395191 76295 88114
GAP ConstrainedConvoKit 8009 8009 TRUE 2739 1958 8009 1800 13380.22474720.167062051.3452915 6209 6671
Chair Scripted 6 6 TRUE 909 663 2900 660 4600.22758620.158620691.4347826 2240 2440
Friends Scripted ConvoKit 67373 61310FALSE 32105 23931 107082 22067 163560.20607570.152742761.3491685 85015 90726
Gutenberg Scripted 1477374114773741 TRUE1461798313714226291193931023495286720300.35148230.297809441.18022561888444120447363
Reddit ConstrainedConvoKit 74468 72985FALSE 120512 105127 217924 86513 667000.39698700.306070011.2970465 131411 151224
\n" + ], + "text/latex": [ + "A data.frame: 12 x 16\n", + "\\begin{tabular}{llllllllllllllll}\n", + " X...dataset & type & source & full & non.empty & is\\_equal & past & future & total & corrected\\_past & corrected\\_future & past\\_prop & future\\_prop & RR & non\\_past & non\\_future\\\\\n", + " & & & & & & & & & & & & & & & \\\\\n", + "\\hline\n", + "\t IMSDb & Scripted & & 1091 & 1091 & TRUE & 833026 & 472519 & 3080674 & 657475 & 316525 & 0.2134192 & 0.10274537 & 2.0771661 & 2423199 & 2764149\\\\\n", + "\t Movies & Scripted & ConvoKit & 304713 & 304446 & FALSE & 179729 & 129622 & 516163 & 127744 & 85937 & 0.2474877 & 0.16649198 & 1.4864843 & 388419 & 430226\\\\\n", + "\t Switchboard & Spontaneous & ConvoKit & 122646 & 122646 & TRUE & 62464 & 32372 & 245461 & 41488 & 22079 & 0.1690207 & 0.08994912 & 1.8790706 & 203973 & 223382\\\\\n", + "\t SCOTUS & Constrained & ConvoKit & 1700789 & 1700789 & TRUE & 3089509 & 1802239 & 3880259 & 1963578 & 1207377 & 0.5060430 & 0.31115887 & 1.6263172 & 1916681 & 2672882\\\\\n", + "\t Tennis & Constrained & ConvoKit & 163948 & 163948 & TRUE & 448444 & 193802 & 599172 & 281669 & 134638 & 0.4700971 & 0.22470676 & 2.0920468 & 317503 & 464534\\\\\n", + "\t PfG & Constrained & ConvoKit & 20932 & 20932 & TRUE & 9695 & 15520 & 37184 & 7408 & 9771 & 0.1992255 & 0.26277431 & 0.7581619 & 29776 & 27413\\\\\n", + "\t IQ2 & Constrained & ConvoKit & 26562 & 26317 & FALSE & 67626 & 51780 & 122925 & 46630 & 34811 & 0.3793370 & 0.28318894 & 1.3395191 & 76295 & 88114\\\\\n", + "\t GAP & Constrained & ConvoKit & 8009 & 8009 & TRUE & 2739 & 1958 & 8009 & 1800 & 1338 & 0.2247472 & 0.16706205 & 1.3452915 & 6209 & 6671\\\\\n", + "\t Chair & Scripted & & 6 & 6 & TRUE & 909 & 663 & 2900 & 660 & 460 & 0.2275862 & 0.15862069 & 1.4347826 & 2240 & 2440\\\\\n", + "\t Friends & Scripted & ConvoKit & 67373 & 61310 & FALSE & 32105 & 23931 & 107082 & 22067 & 16356 & 0.2060757 & 0.15274276 & 1.3491685 & 85015 & 90726\\\\\n", + "\t Gutenberg & Scripted & & 
14773741 & 14773741 & TRUE & 14617983 & 13714226 & 29119393 & 10234952 & 8672030 & 0.3514823 & 0.29780944 & 1.1802256 & 18884441 & 20447363\\\\\n", + "\t Reddit & Constrained & ConvoKit & 74468 & 72985 & FALSE & 120512 & 105127 & 217924 & 86513 & 66700 & 0.3969870 & 0.30607001 & 1.2970465 & 131411 & 151224\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 12 x 16\n", + "\n", + "| X...dataset <chr> | type <chr> | source <chr> | full <int> | non.empty <int> | is_equal <lgl> | past <int> | future <int> | total <int> | corrected_past <int> | corrected_future <int> | past_prop <dbl> | future_prop <dbl> | RR <dbl> | non_past <int> | non_future <int> |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "| IMSDb | Scripted | | 1091 | 1091 | TRUE | 833026 | 472519 | 3080674 | 657475 | 316525 | 0.2134192 | 0.10274537 | 2.0771661 | 2423199 | 2764149 |\n", + "| Movies | Scripted | ConvoKit | 304713 | 304446 | FALSE | 179729 | 129622 | 516163 | 127744 | 85937 | 0.2474877 | 0.16649198 | 1.4864843 | 388419 | 430226 |\n", + "| Switchboard | Spontaneous | ConvoKit | 122646 | 122646 | TRUE | 62464 | 32372 | 245461 | 41488 | 22079 | 0.1690207 | 0.08994912 | 1.8790706 | 203973 | 223382 |\n", + "| SCOTUS | Constrained | ConvoKit | 1700789 | 1700789 | TRUE | 3089509 | 1802239 | 3880259 | 1963578 | 1207377 | 0.5060430 | 0.31115887 | 1.6263172 | 1916681 | 2672882 |\n", + "| Tennis | Constrained | ConvoKit | 163948 | 163948 | TRUE | 448444 | 193802 | 599172 | 281669 | 134638 | 0.4700971 | 0.22470676 | 2.0920468 | 317503 | 464534 |\n", + "| PfG | Constrained | ConvoKit | 20932 | 20932 | TRUE | 9695 | 15520 | 37184 | 7408 | 9771 | 0.1992255 | 0.26277431 | 0.7581619 | 29776 | 27413 |\n", + "| IQ2 | Constrained | ConvoKit | 26562 | 26317 | FALSE | 67626 | 51780 | 122925 | 46630 | 34811 | 0.3793370 | 0.28318894 | 1.3395191 | 76295 | 88114 |\n", + "| GAP | Constrained | ConvoKit | 8009 | 8009 | TRUE | 2739 | 1958 | 8009 | 1800 | 1338 | 
0.2247472 | 0.16706205 | 1.3452915 | 6209 | 6671 |\n", + "| Chair | Scripted | | 6 | 6 | TRUE | 909 | 663 | 2900 | 660 | 460 | 0.2275862 | 0.15862069 | 1.4347826 | 2240 | 2440 |\n", + "| Friends | Scripted | ConvoKit | 67373 | 61310 | FALSE | 32105 | 23931 | 107082 | 22067 | 16356 | 0.2060757 | 0.15274276 | 1.3491685 | 85015 | 90726 |\n", + "| Gutenberg | Scripted | | 14773741 | 14773741 | TRUE | 14617983 | 13714226 | 29119393 | 10234952 | 8672030 | 0.3514823 | 0.29780944 | 1.1802256 | 18884441 | 20447363 |\n", + "| Reddit | Constrained | ConvoKit | 74468 | 72985 | FALSE | 120512 | 105127 | 217924 | 86513 | 66700 | 0.3969870 | 0.30607001 | 1.2970465 | 131411 | 151224 |\n", + "\n" + ], + "text/plain": [ + " X...dataset type source full non.empty is_equal past \n", + "1 IMSDb Scripted 1091 1091 TRUE 833026\n", + "2 Movies Scripted ConvoKit 304713 304446 FALSE 179729\n", + "3 Switchboard Spontaneous ConvoKit 122646 122646 TRUE 62464\n", + "4 SCOTUS Constrained ConvoKit 1700789 1700789 TRUE 3089509\n", + "5 Tennis Constrained ConvoKit 163948 163948 TRUE 448444\n", + "6 PfG Constrained ConvoKit 20932 20932 TRUE 9695\n", + "7 IQ2 Constrained ConvoKit 26562 26317 FALSE 67626\n", + "8 GAP Constrained ConvoKit 8009 8009 TRUE 2739\n", + "9 Chair Scripted 6 6 TRUE 909\n", + "10 Friends Scripted ConvoKit 67373 61310 FALSE 32105\n", + "11 Gutenberg Scripted 14773741 14773741 TRUE 14617983\n", + "12 Reddit Constrained ConvoKit 74468 72985 FALSE 120512\n", + " future total corrected_past corrected_future past_prop future_prop\n", + "1 472519 3080674 657475 316525 0.2134192 0.10274537 \n", + "2 129622 516163 127744 85937 0.2474877 0.16649198 \n", + "3 32372 245461 41488 22079 0.1690207 0.08994912 \n", + "4 1802239 3880259 1963578 1207377 0.5060430 0.31115887 \n", + "5 193802 599172 281669 134638 0.4700971 0.22470676 \n", + "6 15520 37184 7408 9771 0.1992255 0.26277431 \n", + "7 51780 122925 46630 34811 0.3793370 0.28318894 \n", + "8 1958 8009 1800 1338 0.2247472 0.16706205 \n", + 
"9 663 2900 660 460 0.2275862 0.15862069 \n", + "10 23931 107082 22067 16356 0.2060757 0.15274276 \n", + "11 13714226 29119393 10234952 8672030 0.3514823 0.29780944 \n", + "12 105127 217924 86513 66700 0.3969870 0.30607001 \n", + " RR non_past non_future\n", + "1 2.0771661 2423199 2764149 \n", + "2 1.4864843 388419 430226 \n", + "3 1.8790706 203973 223382 \n", + "4 1.6263172 1916681 2672882 \n", + "5 2.0920468 317503 464534 \n", + "6 0.7581619 29776 27413 \n", + "7 1.3395191 76295 88114 \n", + "8 1.3452915 6209 6671 \n", + "9 1.4347826 2240 2440 \n", + "10 1.3491685 85015 90726 \n", + "11 1.1802256 18884441 20447363 \n", + "12 1.2970465 131411 151224 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df <- read.csv(file = 'ref_counts_summary.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "
A escalc: 12 x 18
X...datasettypesourcefullnon.emptyis_equalpastfuturetotalcorrected_pastcorrected_futurepast_propfuture_propRRnon_pastnon_futureyivi
<chr><chr><chr><int><int><lgl><int><int><int><int><int><dbl><dbl><dbl><int><int><dbl><dbl>
IMSDb Scripted 1091 1091 TRUE 833026 472519 3080674 657475 3165250.21341920.102745372.0771661 2423199 2764149 0.73100454.031070e-06
Movies Scripted ConvoKit 304713 304446FALSE 179729 129622 516163 127744 859370.24748770.166491981.4864843 388419 430226 0.39641381.558984e-05
SwitchboardSpontaneousConvoKit 122646 122646 TRUE 62464 32372 245461 41488 220790.16902070.089949121.8790706 203973 223382 0.63077736.124733e-05
SCOTUS ConstrainedConvoKit 1700789 1700789 TRUE 3089509 1802239 3880259 196357812073770.50604300.311158871.6263172 1916681 2672882 0.48631818.220866e-07
Tennis ConstrainedConvoKit 163948 163948 TRUE 448444 193802 599172 281669 1346380.47009710.224706762.0920468 317503 464534 0.73814297.639650e-06
PfG ConstrainedConvoKit 20932 20932 TRUE 9695 15520 37184 7408 97710.19922550.262774310.7581619 29776 27413-0.27685831.835463e-04
IQ2 ConstrainedConvoKit 26562 26317FALSE 67626 51780 122925 46630 348110.37933700.283188941.3395191 76295 88114 0.29231073.390189e-05
GAP ConstrainedConvoKit 8009 8009 TRUE 2739 1958 8009 1800 13380.22474720.167062051.3452915 6209 6671 0.29661071.053221e-03
Chair Scripted 6 6 TRUE 909 663 2900 660 4600.22758620.158620691.4347826 2240 2440 0.36101332.999409e-03
Friends Scripted ConvoKit 67373 61310FALSE 32105 23931 107082 22067 163560.20607570.152742761.3491685 85015 90726 0.29948858.777890e-05
Gutenberg Scripted 1477374114773741 TRUE1461798313714226291193931023495286720300.35148230.297809441.18022561888444120447363 0.16570561.443349e-07
Reddit ConstrainedConvoKit 74468 72985FALSE 120512 105127 217924 86513 667000.39698700.306070011.2970465 131411 151224 0.26008971.737395e-05
\n" + ], + "text/latex": [ + "A escalc: 12 x 18\n", + "\\begin{tabular}{llllllllllllllllll}\n", + " X...dataset & type & source & full & non.empty & is\\_equal & past & future & total & corrected\\_past & corrected\\_future & past\\_prop & future\\_prop & RR & non\\_past & non\\_future & yi & vi\\\\\n", + " & & & & & & & & & & & & & & & & & \\\\\n", + "\\hline\n", + "\t IMSDb & Scripted & & 1091 & 1091 & TRUE & 833026 & 472519 & 3080674 & 657475 & 316525 & 0.2134192 & 0.10274537 & 2.0771661 & 2423199 & 2764149 & 0.7310045 & 4.031070e-06\\\\\n", + "\t Movies & Scripted & ConvoKit & 304713 & 304446 & FALSE & 179729 & 129622 & 516163 & 127744 & 85937 & 0.2474877 & 0.16649198 & 1.4864843 & 388419 & 430226 & 0.3964138 & 1.558984e-05\\\\\n", + "\t Switchboard & Spontaneous & ConvoKit & 122646 & 122646 & TRUE & 62464 & 32372 & 245461 & 41488 & 22079 & 0.1690207 & 0.08994912 & 1.8790706 & 203973 & 223382 & 0.6307773 & 6.124733e-05\\\\\n", + "\t SCOTUS & Constrained & ConvoKit & 1700789 & 1700789 & TRUE & 3089509 & 1802239 & 3880259 & 1963578 & 1207377 & 0.5060430 & 0.31115887 & 1.6263172 & 1916681 & 2672882 & 0.4863181 & 8.220866e-07\\\\\n", + "\t Tennis & Constrained & ConvoKit & 163948 & 163948 & TRUE & 448444 & 193802 & 599172 & 281669 & 134638 & 0.4700971 & 0.22470676 & 2.0920468 & 317503 & 464534 & 0.7381429 & 7.639650e-06\\\\\n", + "\t PfG & Constrained & ConvoKit & 20932 & 20932 & TRUE & 9695 & 15520 & 37184 & 7408 & 9771 & 0.1992255 & 0.26277431 & 0.7581619 & 29776 & 27413 & -0.2768583 & 1.835463e-04\\\\\n", + "\t IQ2 & Constrained & ConvoKit & 26562 & 26317 & FALSE & 67626 & 51780 & 122925 & 46630 & 34811 & 0.3793370 & 0.28318894 & 1.3395191 & 76295 & 88114 & 0.2923107 & 3.390189e-05\\\\\n", + "\t GAP & Constrained & ConvoKit & 8009 & 8009 & TRUE & 2739 & 1958 & 8009 & 1800 & 1338 & 0.2247472 & 0.16706205 & 1.3452915 & 6209 & 6671 & 0.2966107 & 1.053221e-03\\\\\n", + "\t Chair & Scripted & & 6 & 6 & TRUE & 909 & 663 & 2900 & 660 & 460 & 0.2275862 & 0.15862069 & 
1.4347826 & 2240 & 2440 & 0.3610133 & 2.999409e-03\\\\\n", + "\t Friends & Scripted & ConvoKit & 67373 & 61310 & FALSE & 32105 & 23931 & 107082 & 22067 & 16356 & 0.2060757 & 0.15274276 & 1.3491685 & 85015 & 90726 & 0.2994885 & 8.777890e-05\\\\\n", + "\t Gutenberg & Scripted & & 14773741 & 14773741 & TRUE & 14617983 & 13714226 & 29119393 & 10234952 & 8672030 & 0.3514823 & 0.29780944 & 1.1802256 & 18884441 & 20447363 & 0.1657056 & 1.443349e-07\\\\\n", + "\t Reddit & Constrained & ConvoKit & 74468 & 72985 & FALSE & 120512 & 105127 & 217924 & 86513 & 66700 & 0.3969870 & 0.30607001 & 1.2970465 & 131411 & 151224 & 0.2600897 & 1.737395e-05\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A escalc: 12 x 18\n", + "\n", + "| X...dataset <chr> | type <chr> | source <chr> | full <int> | non.empty <int> | is_equal <lgl> | past <int> | future <int> | total <int> | corrected_past <int> | corrected_future <int> | past_prop <dbl> | future_prop <dbl> | RR <dbl> | non_past <int> | non_future <int> | yi <dbl> | vi <dbl> |\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "| IMSDb | Scripted | | 1091 | 1091 | TRUE | 833026 | 472519 | 3080674 | 657475 | 316525 | 0.2134192 | 0.10274537 | 2.0771661 | 2423199 | 2764149 | 0.7310045 | 4.031070e-06 |\n", + "| Movies | Scripted | ConvoKit | 304713 | 304446 | FALSE | 179729 | 129622 | 516163 | 127744 | 85937 | 0.2474877 | 0.16649198 | 1.4864843 | 388419 | 430226 | 0.3964138 | 1.558984e-05 |\n", + "| Switchboard | Spontaneous | ConvoKit | 122646 | 122646 | TRUE | 62464 | 32372 | 245461 | 41488 | 22079 | 0.1690207 | 0.08994912 | 1.8790706 | 203973 | 223382 | 0.6307773 | 6.124733e-05 |\n", + "| SCOTUS | Constrained | ConvoKit | 1700789 | 1700789 | TRUE | 3089509 | 1802239 | 3880259 | 1963578 | 1207377 | 0.5060430 | 0.31115887 | 1.6263172 | 1916681 | 2672882 | 0.4863181 | 8.220866e-07 |\n", + "| Tennis | Constrained | ConvoKit | 163948 | 163948 | TRUE | 448444 | 193802 | 599172 | 281669 | 134638 
| 0.4700971 | 0.22470676 | 2.0920468 | 317503 | 464534 | 0.7381429 | 7.639650e-06 |\n", + "| PfG | Constrained | ConvoKit | 20932 | 20932 | TRUE | 9695 | 15520 | 37184 | 7408 | 9771 | 0.1992255 | 0.26277431 | 0.7581619 | 29776 | 27413 | -0.2768583 | 1.835463e-04 |\n", + "| IQ2 | Constrained | ConvoKit | 26562 | 26317 | FALSE | 67626 | 51780 | 122925 | 46630 | 34811 | 0.3793370 | 0.28318894 | 1.3395191 | 76295 | 88114 | 0.2923107 | 3.390189e-05 |\n", + "| GAP | Constrained | ConvoKit | 8009 | 8009 | TRUE | 2739 | 1958 | 8009 | 1800 | 1338 | 0.2247472 | 0.16706205 | 1.3452915 | 6209 | 6671 | 0.2966107 | 1.053221e-03 |\n", + "| Chair | Scripted | | 6 | 6 | TRUE | 909 | 663 | 2900 | 660 | 460 | 0.2275862 | 0.15862069 | 1.4347826 | 2240 | 2440 | 0.3610133 | 2.999409e-03 |\n", + "| Friends | Scripted | ConvoKit | 67373 | 61310 | FALSE | 32105 | 23931 | 107082 | 22067 | 16356 | 0.2060757 | 0.15274276 | 1.3491685 | 85015 | 90726 | 0.2994885 | 8.777890e-05 |\n", + "| Gutenberg | Scripted | | 14773741 | 14773741 | TRUE | 14617983 | 13714226 | 29119393 | 10234952 | 8672030 | 0.3514823 | 0.29780944 | 1.1802256 | 18884441 | 20447363 | 0.1657056 | 1.443349e-07 |\n", + "| Reddit | Constrained | ConvoKit | 74468 | 72985 | FALSE | 120512 | 105127 | 217924 | 86513 | 66700 | 0.3969870 | 0.30607001 | 1.2970465 | 131411 | 151224 | 0.2600897 | 1.737395e-05 |\n", + "\n" + ], + "text/plain": [ + " X...dataset type source full non.empty is_equal past \n", + "1 IMSDb Scripted 1091 1091 TRUE 833026\n", + "2 Movies Scripted ConvoKit 304713 304446 FALSE 179729\n", + "3 Switchboard Spontaneous ConvoKit 122646 122646 TRUE 62464\n", + "4 SCOTUS Constrained ConvoKit 1700789 1700789 TRUE 3089509\n", + "5 Tennis Constrained ConvoKit 163948 163948 TRUE 448444\n", + "6 PfG Constrained ConvoKit 20932 20932 TRUE 9695\n", + "7 IQ2 Constrained ConvoKit 26562 26317 FALSE 67626\n", + "8 GAP Constrained ConvoKit 8009 8009 TRUE 2739\n", + "9 Chair Scripted 6 6 TRUE 909\n", + "10 Friends Scripted ConvoKit 
67373 61310 FALSE 32105\n", + "11 Gutenberg Scripted 14773741 14773741 TRUE 14617983\n", + "12 Reddit Constrained ConvoKit 74468 72985 FALSE 120512\n", + " future total corrected_past corrected_future past_prop future_prop\n", + "1 472519 3080674 657475 316525 0.2134192 0.10274537 \n", + "2 129622 516163 127744 85937 0.2474877 0.16649198 \n", + "3 32372 245461 41488 22079 0.1690207 0.08994912 \n", + "4 1802239 3880259 1963578 1207377 0.5060430 0.31115887 \n", + "5 193802 599172 281669 134638 0.4700971 0.22470676 \n", + "6 15520 37184 7408 9771 0.1992255 0.26277431 \n", + "7 51780 122925 46630 34811 0.3793370 0.28318894 \n", + "8 1958 8009 1800 1338 0.2247472 0.16706205 \n", + "9 663 2900 660 460 0.2275862 0.15862069 \n", + "10 23931 107082 22067 16356 0.2060757 0.15274276 \n", + "11 13714226 29119393 10234952 8672030 0.3514823 0.29780944 \n", + "12 105127 217924 86513 66700 0.3969870 0.30607001 \n", + " RR non_past non_future yi vi \n", + "1 2.0771661 2423199 2764149 0.7310045 4.031070e-06\n", + "2 1.4864843 388419 430226 0.3964138 1.558984e-05\n", + "3 1.8790706 203973 223382 0.6307773 6.124733e-05\n", + "4 1.6263172 1916681 2672882 0.4863181 8.220866e-07\n", + "5 2.0920468 317503 464534 0.7381429 7.639650e-06\n", + "6 0.7581619 29776 27413 -0.2768583 1.835463e-04\n", + "7 1.3395191 76295 88114 0.2923107 3.390189e-05\n", + "8 1.3452915 6209 6671 0.2966107 1.053221e-03\n", + "9 1.4347826 2240 2440 0.3610133 2.999409e-03\n", + "10 1.3491685 85015 90726 0.2994885 8.777890e-05\n", + "11 1.1802256 18884441 20447363 0.1657056 1.443349e-07\n", + "12 1.2970465 131411 151224 0.2600897 1.737395e-05" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dat <- escalc(measure=\"RR\", ai=corrected_past, bi=non_past, ci=corrected_future, di=non_future, data=df,\n", + " slab=X...dataset)\n", + "dat" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { 
+ "text/plain": [ + "\n", + "Random-Effects Model (k = 12; tau^2 estimator: REML)\n", + "\n", + "tau^2 (estimated amount of total heterogeneity): 0.0762 (SE = 0.0326)\n", + "tau (square root of estimated tau^2 value): 0.2760\n", + "I^2 (total heterogeneity / total variability): 100.00%\n", + "H^2 (total variability / sampling variability): 20907.29\n", + "\n", + "Test for Heterogeneity:\n", + "Q(df = 11) = 209989.5995, p-val < .0001\n", + "\n", + "Model Results:\n", + "\n", + "estimate se zval pval ci.lb ci.ub \n", + " 0.3653 0.0799 4.5732 <.0001 0.2087 0.5218 *** \n", + "\n", + "---\n", + "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Random-effects model fitted to the log risk ratios (yi) and their variances (vi).\n", + "res1 <- rma(yi, vi, data=dat)\n", + "res1" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\n", + " pred ci.lb ci.ub pi.lb pi.ub \n", + " 1.441 1.232 1.685 0.820 2.531 \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Pooled estimate with its interval bounds, back-transformed from the log scale via exp().\n", + "predict(res1, transf=exp, digits=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "plot without title" + ] + }, + "metadata": { + "image/png": { + "height": 420, + "width": 420 + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Uncomment pdf() here and dev.off() at the end of this cell to write the figure to a PDF.\n", + "# pdf(file='forestplot.pdf')\n", + "\n", + "# x positions shared by the annotation columns and the headings drawn above them,\n", + "# kept in one place so the two cannot drift apart.\n", + "ilab_xpos <- c(-2.05, -1.5, -0.95, -0.4)\n", + "\n", + "# Forest plot of res1; atransf=exp labels the log-scale axis with back-transformed ratios.\n", + "forest(res1, header=c(\"Dataset\", \"Ratio [95% CI]\"),\n", + "       atransf=exp,\n", + "       shade=TRUE,\n", + "       at=log(c(0.67, 1, 1.5, 2.25)),\n", + "       ilab=cbind(type, total, corrected_past, corrected_future),\n", + "       ilab.xpos=ilab_xpos,\n", + "       ilab.pos=2,\n", + "       cex=0.75,\n", + "       xlim=c(-3.6, 1.8),\n", + "       # alim=c(0.5, 4),\n", + "       xlab=\"Ratio (log scale)\")\n", + "\n", + "# Bold (font=2) headings for the four annotation columns, drawn at y = k + 2.05;\n", + "# the previous graphics settings are restored afterwards.\n", + "op <- par(cex=0.8, font=2)\n", +
"text(ilab_xpos, res1$k+2.05, c(\"Type\", \"Total\", \"Past\", \"Future\"), pos=2)\n", + "par(op)\n", + "\n", + "# dev.off() # Turn the PDF device off" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.3.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/code/results_rep.ipynb b/code/results_rep.ipynb index 6e25039..2387605 100755 --- a/code/results_rep.ipynb +++ b/code/results_rep.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2021-11-05T18:10:25.867414Z", @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2021-11-05T18:10:35.374315Z", @@ -100,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2021-11-05T18:10:35.388571Z", @@ -2107,6 +2107,39 @@ "### read reference files" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the 'references_full' sheet; the per-episode reference counts are derived from it.\n", + "df_refer_full = pd.read_excel('../data/rep/TheChair.xlsx', sheet_name='references_full')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Count references per episode and direction, then pivot to one row per episode so\n", + "# the file keeps the Episode,Past,Future layout of the committed CSV (a plain\n", + "# reset_index() would write long-format episode/direction/time rows instead).\n", + "manual_counts = (\n", + "    df_refer_full\n", + "    .groupby(['episode', 'direction'])['time']\n", + "    .count()\n", + "    .unstack('direction', fill_value=0)\n", + "    # assumes `direction` is coded as past/future in some letter case -- TODO confirm\n", + "    .rename(columns=lambda label: str(label).strip().capitalize())\n", + "    .rename_axis('Episode')\n", + "    .loc[:, ['Past', 'Future']]\n", + "    .reset_index()\n", + ")\n", + "manual_counts.to_csv(\"../data/the_chair/the_chair_manual_reference_counts.csv\", index=False)" + ] + }, { "cell_type": "code", "execution_count": 53, diff --git a/data/metaanalysis-datasets.xlsx b/data/metaanalysis-datasets.xlsx index 7c67d91..3e0a700 100644 Binary files a/data/metaanalysis-datasets.xlsx and b/data/metaanalysis-datasets.xlsx differ diff --git
a/data/ref_counts_summary.csv b/data/ref_counts_summary.csv new file mode 100644 index 0000000..62199ea --- /dev/null +++ b/data/ref_counts_summary.csv @@ -0,0 +1,13 @@ +dataset,type,total,corrected_past,corrected_future,non_past,non_future +IMSDb,Scripted,3080674,657475,316525,2423199,2764149 +Movies,Scripted,516163,127744,85937,388419,430226 +Switchboard,Spontaneous,245461,41488,22079,203973,223382 +SCOTUS,Constrained,3880259,1963578,1207377,1916681,2672882 +Tennis,Constrained,599172,281669,134638,317503,464534 +PfG,Constrained,37184,7408,9771,29776,27413 +IQ2,Constrained,122925,46630,34811,76295,88114 +GAP,Constrained,8009,1800,1338,6209,6671 +Chair,Scripted,2900,660,460,2240,2440 +Friends,Scripted,107082,22067,16356,85015,90726 +Gutenberg,Scripted,29119393,10234952,8672030,18884441,20447363 +Reddit,Constrained,217924,86513,66700,131411,151224 \ No newline at end of file diff --git a/data/rep/TheChair.xlsx b/data/rep/TheChair.xlsx index c5f3039..bb478f1 100644 Binary files a/data/rep/TheChair.xlsx and b/data/rep/TheChair.xlsx differ diff --git a/data/the_chair/the_chair_auto_reference_counts.csv b/data/the_chair/the_chair_auto_reference_counts.csv new file mode 100644 index 0000000..f261e7a --- /dev/null +++ b/data/the_chair/the_chair_auto_reference_counts.csv @@ -0,0 +1,7 @@ +Episode,Total,Past,Future +1,457,112,74 +2,501,80,64 +3,518,123,82 +4,442,108,75 +5,508,116,66 +6,474,121,99 \ No newline at end of file diff --git a/data/the_chair/the_chair_manual_reference_counts.csv b/data/the_chair/the_chair_manual_reference_counts.csv index 76ef062..8427c43 100644 --- a/data/the_chair/the_chair_manual_reference_counts.csv +++ b/data/the_chair/the_chair_manual_reference_counts.csv @@ -1,7 +1,7 @@ -Past,Future -60,18 -30,14 -43,33 -31,21 -36,11 -27,12 +Episode,Past,Future +1,60,19 +2,29,14 +3,43,33 +4,30,20 +5,37,10 +6,27,12 \ No newline at end of file