From eb72f0dbd829d922bdb4ea79ba49087acf0b0e18 Mon Sep 17 00:00:00 2001 From: Christopher Cole Date: Thu, 18 Apr 2019 17:47:08 +0000 Subject: [PATCH] Added smcsmc to the analysis pipeline. --- README.md | 6 +++ n_t/Snakefile | 89 ++++++++++++++++++++++++++++++++-- n_t/plots.py | 132 ++++++++++++++++++++++++++++++++++---------------- n_t/readme.md | 32 ++++++------ 4 files changed, 200 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index d8c9e7f..b05f4b7 100644 --- a/README.md +++ b/README.md @@ -35,4 +35,10 @@ cat msmc_makefile_stdpopsim_patch > msmc/Makefile && cd msmc && make cd ../../ ``` +`smcsmc` can be [installed manually](https://github.com/luntergroup/smcsmc) or through `conda` on linux. + +```sh +conda install -c luntergroup smcsmc +``` + Further instructions can be currently found in each task directory diff --git a/n_t/Snakefile b/n_t/Snakefile index 6d210d2..7706b1e 100644 --- a/n_t/Snakefile +++ b/n_t/Snakefile @@ -24,13 +24,14 @@ import smc import msmc import simulations import plots +import smcsmc +import smcsmc.popsim # ############################################################################### # KNOBS - # ############################################################################### - # A seed to replicate results # TODO mutation rates @@ -49,11 +50,26 @@ num_sampled_genomes_per_replicate = config["num_sampled_genomes_per_replicate"] # Here is a list of sample sizes to run msmc on. # Each element counts as its own analysis # so there will be "replicates" runs for each size +# +# and a similar setup for SMCSMC num_sampled_genomes_msmc = [int(x) for x in config["num_sampled_genomes_msmc"].split(",")] +num_sampled_genomes_smcsmc = [int(x) for x in config["num_sampled_genomes_smcsmc"].split(",")] # The number of msmc Baumwelch(?) iterations to run, # typically 20 +# +# And again, similar for SMCSMC. Number of stochastic EM iterations. +# 15 is typical, but more is good too. Assess for convergence based on +# rainbow plots. num_msmc_iterations = config["num_msmc_iterations"] +num_smcsmc_iterations = config["num_smcsmc_iterations"] + +# Number of particles to use for SMCSMC +# +# A good starting point is 50k, and see if reducing +# significantly impacts the estimates that you are +# recieveing. +num_smcsmc_particles = config["num_smcsmc_particles"] # The number of replicates of each analysis you would like to run replicates = config["replicates"] @@ -299,6 +315,72 @@ rule compound_msmc: run: plots.plot_compound_msmc(model, input, output[0]) +# ############################################################################### +# SMCSMC +# ############################################################################### + +rule ts_to_seg: + input: rules.simulation.output + output: output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg" + run: smcsmc.utils.ts_to_seg(input[0], num_sampled_genomes_smcsmc) + +rule run_smcsmc: + input: + expand(output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg", + chrms=chrm_list, seeds=seed_array, samps=num_sampled_genomes_smcsmc) + output: + output_dir + "/Intermediate/{seeds}/{samps}.run/result.out" + run: + inputs = expand(output_dir+"/Intermediate/{seeds}/{samps}.{chrms}.trees.seg", + seeds=wildcards.seeds, samps=wildcards.samps, chrms=chrm_list) + + input_file_string = " ".join(inputs) + args = { + 'EM': str(num_smcsmc_iterations), + 'Np': str(num_smcsmc_particles), + # Submission Parameters + 'chunks': '100', + 'no_infer_recomb': '', + # Other inference parameters + 'mu': str(species.genome.mean_mutation_rate), + 'N0': '14312', + 'rho': '3e-9', + 'calibrate_lag': '1.0', + 'tmax': '3.5', + 'alpha': '0', + 'apf': '2', + 'P': '133 133016 31*1', + 'VB': '', + 'nsam': str(wildcards.samps), + # This should be in the conda bin + 'smcsmcpath': os.path.expandvars('${CONDA_PREFIX}/bin/smcsmc') + } + args['o'] = output_dir + f"/Intermediate/{wildcards.seeds}/{wildcards.samps}.run" + args['segs'] = input_file_string + + smcsmc.run_smcsmc(args) + +rule convert_smcsmc: + input: rules.run_smcsmc.output + output: output_dir + "/Results/{seeds}/{samps}.run/results.out.csv" + run: smcsmc.popsim.convert_smcsmc_output(input[0], output[0], generation_time, num_smcsmc_iterations) + + +def ne_files_smcsmc(wildcards): + return expand(output_dir + "/Results/{seeds}/{samps}.run/results.out.csv", + seeds=seed_array, samps=num_sampled_genomes_smcsmc) + +rule plot_by_sample: + input: expand(output_dir + "/Results/{seeds}/{{samps}}.run/results.out.csv", seeds=seed_array) + output: output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png" + run: + plots.plot_compound_smcsmc_with_guide(input, output[0], 30, 1, nhaps ={wildcards.samps}, model = model) + +rule compound_smcsmc: + input: expand(output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png", samps = num_sampled_genomes_smcsmc) + + + # ############################################################################### # # ############################################################################### @@ -309,10 +391,11 @@ rule all_plot: f1 = ne_files, f2 = ne_files_smcpp, f3 = ne_files_msmc, + f4 = ne_files_smcsmc, output: output_dir + "/Results/all_estimated_Ne.pdf" - run: - plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, output[0], + run: + plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, input.f4, output[0], model=model, n_samp=num_sampled_genomes_per_replicate, generation_time=generation_time, species=config["species"], pop_id=population_id) diff --git a/n_t/plots.py b/n_t/plots.py index 8ead5f6..cf3ea8f 100644 --- a/n_t/plots.py +++ b/n_t/plots.py @@ -8,6 +8,7 @@ import os import matplotlib.patches as mpatches from matplotlib import pyplot as plt +import stdpopsim import numpy as np # Force matplotlib to not use any Xwindows backend. matplotlib.use('Agg') @@ -61,52 +62,99 @@ def plot_compound_msmc(infiles, outfile): ax.plot(nt['x'], nt['y'], c="red") f.savefig(outfile, bbox_inches='tight') +def plot_compound_smcsmc_with_guide(infiles, outfile, generation_time, pop_id = 0, nhaps = 1, model = None, steps = None): + f, ax = plt.subplots(figsize=(7, 7)) + ax.set(xscale="log", yscale="log") -def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile, - model, n_samp, generation_time, species, - pop_id=0, steps=None): + if model is not None: + ddb = msprime.DemographyDebugger(**model.asdict()) + if steps is None: + end_time = ddb.epochs[-2].end_time + 10000 + steps = np.exp(np.linspace(1,np.log(end_time),31)) + num_samples = [0 for _ in range(ddb.num_populations)] + num_samples[pop_id] = 20 + coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps, + num_samples=num_samples, double_step_validation=False) + steps = steps * generation_time + ax.plot(steps, 1/(2*coal_rate), c="black", drawstyle = 'steps-pre') - ddb = msprime.DemographyDebugger(**model.asdict()) - if steps is None: - end_time = ddb.epochs[-2].end_time + 10000 - steps = np.linspace(1, end_time, end_time+1) - num_samples = [0 for _ in range(ddb.num_populations)] - num_samples[pop_id] = n_samp - coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps, - num_samples=num_samples, - double_step_validation=False) - steps = steps * generation_time - num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles]) - num_msmc = sorted([int(x) for x in num_msmc]) - f, ax = plt.subplots(1, 2+len(num_msmc), sharex=True, sharey=True, figsize=(14, 7)) - for infile in smcpp_infiles: + + for infile in infiles: nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) - line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8) - ax[0].plot(steps, 1/(2*coal_rate), c="black") - ax[0].set_title("smc++") - for infile in sp_infiles: - nt = pandas.read_csv(infile, sep="\t", skiprows=5) - line2, = ax[1].plot(nt['year'], nt['Ne_median'], alpha=0.8) - ax[1].plot(steps, 1/(2*coal_rate), c="black") - ax[1].set_title("stairwayplot") - for i, sample_size in enumerate(num_msmc): - for infile in msmc_infiles: - fn = os.path.basename(infile) - samp = fn.split(".")[0] - if(int(samp) == sample_size): - nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) - line3, = ax[2+i].plot(nt['x'], nt['y'], alpha=0.8) - ax[2+i].plot(steps, 1/(2*coal_rate), c="black") - ax[2+i].set_title(f"msmc, ({sample_size} samples)") - plt.suptitle(f"{species}, population id {pop_id}", fontsize=16) - for i in range(2+len(num_msmc)): - ax[i].set(xscale="log", yscale="log") - ax[i].set_xlabel("time (years ago)") - red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne') - ax[0].legend(frameon=False, fontsize=10, handles=[red_patch]) - ax[0].set_ylabel("population size") - f.savefig(outfile, bbox_inches='tight', alpha=0.8) + ax.step(nt['x'], nt['y'], c="red") + + ax.set_ylim([1e3,1e6]) + ax.set_xlabel('Years before present') + ax.set_ylabel('Effective population size') + h_string = "".join(nhaps) + ax.set_title(f"SMCSMC Estimated Ne ({h_string} samples)") + + f.savefig(outfile, bbox_inches='tight') + +def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, smcsmc_infiles, outfile, + model, n_samp, generation_time, species, + pop_id = 0, steps=None): + + ddb = msprime.DemographyDebugger(**model.asdict()) + if steps is None: + end_time = ddb.epochs[-2].end_time + 10000 + steps = np.linspace(1,end_time,end_time+1) + num_samples = [0 for _ in range(ddb.num_populations)] + num_samples[pop_id] = n_samp + coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps, + num_samples=num_samples, double_step_validation=False) + steps = steps * generation_time + + num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles]) + num_smcsmc = set([infile.split("/")[-2].split(".")[0] for infile in smcsmc_infiles]) + + num_msmc = sorted([int(x) for x in num_msmc]) + num_smcsmc = sorted([int(x) for x in num_smcsmc]) + + f, ax = plt.subplots(1,2+len(num_msmc) + len(num_smcsmc), sharex=True,sharey=True,figsize=(14, 7)) + for infile in smcpp_infiles: + nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) + line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8) + ax[0].plot(steps, 1/(2*coal_rate), c="black") + ax[0].set_title("smc++") + for infile in sp_infiles: + nt = pandas.read_csv(infile, sep="\t", skiprows=5) + line2, = ax[1].plot(nt['year'], nt['Ne_median'],alpha=0.8) + ax[1].plot(steps, 1/(2*coal_rate), c="black") + ax[1].set_title("stairwayplot") + + plot_counter=2 + for i,sample_size in enumerate(num_msmc): + for infile in msmc_infiles: + fn = os.path.basename(infile) + samp = fn.split(".")[0] + if(int(samp) == sample_size): + nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) + line3, = ax[plot_counter].plot(nt['x'], nt['y'],alpha=0.8) + ax[plot_counter].plot(steps, 1/(2*coal_rate), c="black") + ax[plot_counter].set_title(f"msmc, ({sample_size} samples)") + plot_counter+=1 + + for i,sample_size in enumerate(num_smcsmc): + for infile in smcsmc_infiles: + samp = infile.split("/")[-2].split(".")[0] + if(int(samp) == sample_size): + nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) + line3, = ax[plot_counter].plot(nt['x'], nt['y'],alpha=0.8) + ax[plot_counter].plot(steps, 1/(2*coal_rate), c="black") + ax[plot_counter].set_title(f"smcsmc, ({sample_size} samples)") + plot_counter+=1 + plt.suptitle(f"{species}, population id {pop_id}", fontsize = 16) + for i in range(2+len(num_msmc)+len(num_smcsmc)): + ax[i].set(xscale="log", yscale="log") + ax[i].set_xlabel("time (years ago)") + + + red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne') + ax[0].legend(frameon=False, fontsize=10, handles=[red_patch]) + ax[0].set_ylabel("population size") + f.savefig(outfile, bbox_inches='tight', alpha=0.8) def plot_stairwayplot_coalrate(sp_infiles, outfile, model, n_samp, generation_time, species, diff --git a/n_t/readme.md b/n_t/readme.md index 2d512f0..6223c8c 100644 --- a/n_t/readme.md +++ b/n_t/readme.md @@ -18,15 +18,16 @@ moment. ## Workflow -The analysis includes three programs for predicting effective population +The analysis includes four programs for predicting effective population size through time(`n_t`): [msmc](https://github.com/stschiff/msmc/issues/23), -[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), and +[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), [smcsmc](https://github.com/luntergroup/smcsmc), and [smc++](https://github.com/popgenmethods/smcpp). -There are four target rules that can be executed with the given parameters: +There are five target rules that can be executed with the given parameters: `compound_msmc`, `compound_smcpp`, `compound_stairwayplot`, +`compound_smcsmc`, or you can run all three on the same simulated data with rule `all`. To run an analysis, create a directory (wherever you want) @@ -37,18 +38,21 @@ might look like this: ```json { - "seed" : 12345, - "population_id" : 0, - "num_sampled_genomes_per_replicate" : 20, - "num_sampled_genomes_msmc" : "2,8", - "num_msmc_iterations" : 20, - "replicates" : 10, - "species" : "homo_sapiens", - "model" : "GutenkunstThreePopOutOfAfrica", - "genetic_map" : "HapmapII_GRCh37", - "chrm_list" : "chr22,chrX", - "mask_file" : "masks/HapmapII_GRCh37.mask.bed", + "seed" : 12345, + "population_id" : 0, + "num_sampled_genomes_per_replicate" : 20, + "num_sampled_genomes_msmc" : "2 8", + "num_sampled_genomes_smcsmc" : "4", + "num_smcsmc_particles": 10000, + "num_msmc_iterations" : 20, + "num_smcsmc_iterations": 15, + "replicates" : 1, + "species" : "homo_sapiens", + "model" : "GutenkunstThreePopOutOfAfrica", + "genetic_map" : "HapmapII_GRCh37", + "chrm_list" : "chr22,chrX" } + ``` Once you have creates a directory which contains the config file