Skip to content

Commit

Permalink
Merge pull request #70 from anopheles-genomic-surveillance/fix-dodgy-…
Browse files Browse the repository at this point in the history
…signals-alimanfoo-2023-07-12
  • Loading branch information
sanjaynagi authored Jul 15, 2023
2 parents 47b7082 + 1218072 commit 0589b15
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 60 deletions.
5 changes: 5 additions & 0 deletions workflow/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ cohorts_analysis: "20230223"
# Select the contig to use for H12 window size calibration.
h12_calibration_contig: "3L"

# Configuration for H12 signal detection.
h12_signal_detection_min_delta_aic: 1000
h12_signal_detection_min_stat_max: 0.1
h12_signal_detection_gflanks: [6]

# Set this value to False when running on datalab, because it's
# better to read data directly from GCS.
use_gcs_cache: False
Expand Down
38 changes: 26 additions & 12 deletions workflow/notebooks/cohort-page.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
"source": [
"# Notebook parameters. Values here are for development only and \n",
"# will be overridden when running via snakemake and papermill.\n",
"cohort_id = 'ML-2_Kati_colu_2014_Q3'\n",
"cohort_id = \"CD-NU_Gbadolite_gamb_2015_Q3\"\n",
"# cohort_id = 'ML-2_Kati_colu_2014_Q3'\n",
"# cohort_id = 'CI-LG_Agneby-Tiassa_colu_2012'\n",
"cohorts_analysis = \"20230223\"\n",
"contigs = ['3RL']\n",
"contigs = ['2RL', '3RL', 'X']\n",
"sample_sets = \"3.0\"\n",
"min_cohort_size = 20\n",
"max_cohort_size = 50\n",
Expand Down Expand Up @@ -277,22 +278,31 @@
"id": "7b10b224",
"metadata": {
"tags": [
"remove-input"
"remove-input",
"remove-output"
]
},
"outputs": [],
"source": [
"# load signals to overlay on H12 plots\n",
"import pandas as pd\n",
"\n",
"try:\n",
" df_signals = [\n",
" pd.read_csv(here() / \"build/h12-signal-detection/\" / f\"{cohort_id}_{contig}.csv\")\n",
" for contig in contigs\n",
" ]\n",
" df_signals = pd.concat(df_signals).assign(bottom=0, top=1)\n",
"except pd.errors.EmptyDataError:\n",
" df_signals = pd.DataFrame()"
"df_signals_contigs = []\n",
"for contig in contigs:\n",
"\n",
" try:\n",
" df_signals_contig = pd.read_csv(here() / \"build/h12-signal-detection/\" / f\"{cohort_id}_{contig}.csv\")\n",
" except pd.errors.EmptyDataError:\n",
" df_signals_contig = pd.DataFrame()\n",
" df_signals_contigs.append(df_signals_contig) \n",
" \n",
"df_signals = pd.concat(df_signals_contigs)\n",
"\n",
"# Add extra columns to help with overlaying signals on plots.\n",
"df_signals[\"bottom\"] = 0\n",
"df_signals[\"top\"] = 1\n",
"\n",
"df_signals"
]
},
{
Expand Down Expand Up @@ -329,7 +339,8 @@
" sizing_mode='stretch_width', \n",
" show=False, \n",
" width=800, \n",
" genes_height=100\n",
" track_height=150,\n",
" genes_height=90,\n",
" ):\n",
"\n",
" sample_query = cohort[\"sample_query\"]\n",
Expand All @@ -345,6 +356,7 @@
" sizing_mode=sizing_mode,\n",
" show=show,\n",
" width=width,\n",
" height=track_height,\n",
" )\n",
" fig1.xaxis.visible = False\n",
" \n",
Expand Down Expand Up @@ -391,6 +403,7 @@
" max_cohort_size=max_cohort_size,\n",
" sizing_mode=sizing_mode,\n",
" width=width,\n",
" height=track_height,\n",
" show=show,\n",
" title=\"\",\n",
" x_range=fig1.x_range,\n",
Expand All @@ -407,6 +420,7 @@
" max_cohort_size=max_cohort_size,\n",
" sizing_mode=sizing_mode,\n",
" width=width,\n",
" height=track_height,\n",
" show=show,\n",
" title=\"\",\n",
" x_range=fig1.x_range,\n",
Expand Down
33 changes: 23 additions & 10 deletions workflow/notebooks/h12-signal-detection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,18 @@
"source": [
"# Notebook parameters. Values here are for development only and \n",
"# will be overridden when running via snakemake and papermill.\n",
"cohort_id = 'BF-09_Houet_colu_2012_Q3'\n",
"# cohort_id = 'CD-NU_Gbadolite_gamb_2015_Q3'\n",
"cohort_id = 'ML-2_Kati_gamb_2014_Q3'\n",
"cohorts_analysis = \"20230223\"\n",
"contig = '2L'\n",
"contig = '2RL'\n",
"sample_sets = \"3.0\"\n",
"min_cohort_size = 20\n",
"max_cohort_size = 50\n",
"use_gcs_cache = False\n",
"dask_scheduler = \"threads\""
"dask_scheduler = \"threads\"\n",
"h12_signal_detection_min_delta_aic = 1000\n",
"h12_signal_detection_min_stat_max = 0.1\n",
"h12_signal_detection_gflanks = [6]"
]
},
{
Expand Down Expand Up @@ -207,9 +211,7 @@
"# set parameters for signal detection\n",
"filter_size = 20 # hampel filter parameter\n",
"filter_t = 2 # hampel filter parameter\n",
"gflanks = (4, 8) # sizes of flanks in cM\n",
"scan_interval = 1 # step in cM\n",
"min_delta_aic = 500 # minimum evidence to emit a signal\n",
"min_baseline = 0\n",
"max_baseline_percentile = 95\n",
"min_amplitude = 0.03\n",
Expand Down Expand Up @@ -334,14 +336,16 @@
"\n",
" # before filtering\n",
" fig, ax = plt.subplots(figsize=(8, 2))\n",
" ax.plot(gpos, h12, marker='o', linestyle=' ', mfc='none', markersize=2)\n",
" ax.plot(gpos, h12, marker='o', linestyle=' ', mfc='none', markersize=1, mew=.5, color='k')\n",
" ax.set_title('Unfiltered')\n",
" ax.set_ylim(0, 1)\n",
" fig.tight_layout()\n",
"\n",
" # after filtering\n",
" fig, ax = plt.subplots(figsize=(8, 2))\n",
" ax.plot(gpos, h12_filtered, marker='o', linestyle=' ', mfc='none', markersize=2)\n",
" ax.plot(gpos, h12_filtered, marker='o', linestyle=' ', mfc='none', markersize=1, mew=.5, color='k')\n",
" ax.set_title('Filtered')\n",
" ax.set_ylim(0, 1)\n",
" fig.tight_layout()\n"
]
},
Expand Down Expand Up @@ -385,9 +389,9 @@
"# main loop, iterate along the genome\n",
"for gcenter in np.arange(scan_start, scan_stop, scan_interval):\n",
"\n",
" for gflank in gflanks:\n",
" for gflank in h12_signal_detection_gflanks:\n",
"\n",
" print('center', gcenter, 'flank size', gflank)\n",
" # print('center', gcenter, 'flank size', gflank)\n",
"\n",
" result = fit_exponential_peak(\n",
" ppos=ppos, \n",
Expand All @@ -407,7 +411,8 @@
" init_baseline=init_baseline,\n",
" min_baseline=min_baseline,\n",
" max_baseline=max_baseline,\n",
" min_delta_aic=min_delta_aic,\n",
" min_delta_aic=h12_signal_detection_min_delta_aic,\n",
" min_stat_max=h12_signal_detection_min_stat_max,\n",
" debug=debug,\n",
" )\n",
"\n",
Expand Down Expand Up @@ -486,6 +491,14 @@
"with open(here() / outdir / f\"{cohort_id}_{contig}.csv\", mode=\"w\") as output_file:\n",
" df_signals_dedup.to_csv(output_file, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fe1c1b8-d620-48ca-8940-a7124f2653d4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
96 changes: 58 additions & 38 deletions workflow/notebooks/peak-utils.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@
" min_baseline,\n",
" max_baseline,\n",
" min_delta_aic,\n",
" min_stat_max,\n",
" debug=False,\n",
"):\n",
"\n",
Expand Down Expand Up @@ -326,61 +327,80 @@
"\n",
" peak_in_scan_interval = ((gcenter - scan_interval) < fit_gcenter < (gcenter + scan_interval))\n",
"\n",
" if peak_delta_i > min_delta_aic and peak_in_scan_interval:\n",
"\n",
" fit_params = peak_result.params\n",
" fit_skew = fit_params['skew'].value\n",
" fit_decay = fit_params['decay'].value\n",
" decay_right = 2**(-fit_skew) * fit_decay\n",
" decay_left = 2**fit_skew * fit_decay\n",
" focus_gstart = fit_gcenter - .25 * decay_left\n",
" focus_gstop = fit_gcenter + .25 * decay_right\n",
" span1_gstart = fit_gcenter - 1*decay_left\n",
" span1_gstop = fit_gcenter + 1*decay_right\n",
" span2_gstart = fit_gcenter - 2*decay_left\n",
" span2_gstop = fit_gcenter + 2*decay_right\n",
"\n",
" # Determine chromosome physical position.\n",
" fit_pcenter = ag_g2p(contig, fit_gcenter)\n",
" focus_pstart = ag_g2p(contig, focus_gstart)\n",
" focus_pstop = ag_g2p(contig, focus_gstop)\n",
" span1_pstart = ag_g2p(contig, span1_gstart)\n",
" span1_pstop = ag_g2p(contig, span1_gstop)\n",
" span2_pstart = ag_g2p(contig, span2_gstart)\n",
" span2_pstop = ag_g2p(contig, span2_gstop)\n",
"\n",
" # Determine max value, pos max value (genetic, physical).\n",
" loc_peak = slice(bisect_left(x, span2_gstart), \n",
" bisect_right(x, span2_gstop))\n",
" if loc_peak.stop == loc_peak.start:\n",
" # In some rare cases, there are no data points within this\n",
" # peak region. This is probably pathological, so don't output\n",
" # a signal in this case.\n",
" return\n",
" x_peak = x[loc_peak]\n",
" y_peak = y[loc_peak]\n",
" loc_max = np.argmax(y_peak)\n",
" gpos_max = x_peak[loc_max]\n",
" ppos_max = ag_g2p(contig, gpos_max)\n",
" stat_max = y_peak[loc_max]\n",
" fit_params = peak_result.params\n",
" fit_skew = fit_params['skew'].value\n",
" fit_decay = fit_params['decay'].value\n",
" decay_right = 2**(-fit_skew) * fit_decay\n",
" decay_left = 2**fit_skew * fit_decay\n",
" focus_gstart = fit_gcenter - .25 * decay_left\n",
" focus_gstop = fit_gcenter + .25 * decay_right\n",
" span1_gstart = fit_gcenter - 1*decay_left\n",
" span1_gstop = fit_gcenter + 1*decay_right\n",
" span2_gstart = fit_gcenter - 2*decay_left\n",
" span2_gstop = fit_gcenter + 2*decay_right\n",
"\n",
" # Determine chromosome physical position.\n",
" fit_pcenter = ag_g2p(contig, fit_gcenter)\n",
" focus_pstart = ag_g2p(contig, focus_gstart)\n",
" focus_pstop = ag_g2p(contig, focus_gstop)\n",
" span1_pstart = ag_g2p(contig, span1_gstart)\n",
" span1_pstop = ag_g2p(contig, span1_gstop)\n",
" span2_pstart = ag_g2p(contig, span2_gstart)\n",
" span2_pstop = ag_g2p(contig, span2_gstop)\n",
"\n",
" # Determine max value, pos max value (genetic, physical).\n",
" loc_peak = slice(bisect_left(x, span2_gstart), \n",
" bisect_right(x, span2_gstop))\n",
" if loc_peak.stop == loc_peak.start:\n",
" # In some rare cases, there are no data points within this\n",
" # peak region. This is probably pathological, so don't output\n",
" # a signal in this case.\n",
" return\n",
" x_peak = x[loc_peak]\n",
" y_peak = y[loc_peak]\n",
" loc_max = np.argmax(y_peak)\n",
" gpos_max = x_peak[loc_max]\n",
" ppos_max = ag_g2p(contig, gpos_max)\n",
" stat_max = y_peak[loc_max]\n",
"\n",
" if peak_delta_i > min_delta_aic and stat_max > min_stat_max and peak_in_scan_interval:\n",
"\n",
" if debug:\n",
"\n",
" x_fitted = np.linspace(x[0], x[-1], 1000)\n",
" y_fitted = skewed_exponential_peak(\n",
" x=x_fitted, \n",
" center=peak_result.params['center'].value, \n",
" amplitude=peak_result.params['amplitude'].value, \n",
" decay=peak_result.params['decay'].value, \n",
" skew=peak_result.params['skew'].value, \n",
" baseline=peak_result.params['baseline'].value, \n",
" floor=peak_result.params['floor'].value, \n",
" ceiling=peak_result.params['ceiling'].value,\n",
" )\n",
" delta_i = null_result.aic - peak_result.aic\n",
" fig, ax = plt.subplots(facecolor='w', figsize=(6, 4))\n",
" ax.plot(x, y, marker='o', linestyle=' ', mfc='none', markersize=2);\n",
" ax.plot(x_fitted, y_fitted, marker=None, linestyle='--', color='k');\n",
" ax.axvspan(fit_gcenter - decay_left, fit_gcenter + decay_right, zorder=0, color='red', alpha=.2)\n",
" ax.axvspan(fit_gcenter - 2*decay_left, fit_gcenter + 2*decay_right, zorder=0, color='red', alpha=.2)\n",
" ax.axvline(fit_gcenter, color='red', lw=2, zorder=0)\n",
" ax.axhline(min_stat_max, color='k', lw=1, linestyle='--', zorder=0)\n",
" ax.annotate(\n",
" f'$n={x.shape[0]}$\\n' + \n",
" f'$AIC={peak_result.aic:.0f}$\\n' +\n",
" f'$BIC={peak_result.bic:.0f}$\\n' +\n",
" f'$\\\\chi^{2}={peak_result.chisqr:.3f}$\\n' +\n",
" f'$\\\\Delta_{{i}}={null_result.aic - peak_result.aic:.0f}$',\n",
" f'$\\\\Delta_{{i}}={delta_i:.0f}$\\n' + \n",
" f'$\\\\Delta_{{i}} / n = {delta_i/x.shape[0]:.2f}$\\n',\n",
" xy=(0, 1), xycoords='axes fraction',\n",
" xytext=(5, -5), textcoords='offset points',\n",
" va='top', ha='left', fontsize=8,\n",
" )\n",
" ax.set_xlim(gcenter - gflank, gcenter + gflank)\n",
" ax.set_ylim(0, 1)\n",
" ax.set_ylabel(\"H12\")\n",
" ax.set_xlabel(f\"Contig {contig} position (cM)\")\n",
" ax.set_title(f\"center {gcenter}, flank {gflank}\")\n",
" fig.tight_layout()\n",
" plt.show()\n",
Expand Down

0 comments on commit 0589b15

Please sign in to comment.