From c0b35c06952ccd165b381008385dafda69dde422 Mon Sep 17 00:00:00 2001 From: Christopher Cole Date: Thu, 18 Apr 2019 17:47:08 +0000 Subject: [PATCH] Added smcsmc to the analysis pipeline. --- README.md | 6 ++++ n_t/Snakefile | 89 +++++++++++++++++++++++++++++++++++++++++++++++++-- n_t/plots.py | 59 ++++++++++++++++++++++++++++++++-- n_t/readme.md | 34 ++++++++++++-------- 4 files changed, 169 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index d8c9e7f..b05f4b7 100644 --- a/README.md +++ b/README.md @@ -35,4 +35,10 @@ cat msmc_makefile_stdpopsim_patch > msmc/Makefile && cd msmc && make cd ../../ ``` +`smcsmc` can be [installed manually](https://github.com/luntergroup/smcsmc) or through `conda` on linux. + +```sh +conda install -c luntergroup smcsmc +``` + Further instructions can be currently found in each task directory diff --git a/n_t/Snakefile b/n_t/Snakefile index f06874d..b5dc9af 100644 --- a/n_t/Snakefile +++ b/n_t/Snakefile @@ -24,13 +24,14 @@ import smc import msmc import simulations import plots +import smcsmc +import smcsmc.popsim # ############################################################################### # KNOBS - # ############################################################################### - # A seed to replicate results # TODO mutation rates @@ -49,11 +50,26 @@ num_sampled_genomes_per_replicate = config["num_sampled_genomes_per_replicate"] # Here is a list of sample sizes to run msmc on. # Each element counts as its own analysis # so there will be "replicates" runs for each size +# +# and a similar setup for SMCSMC num_sampled_genomes_msmc = [int(x) for x in config["num_sampled_genomes_msmc"].split(",")] +num_sampled_genomes_smcsmc = [int(x) for x in config["num_sampled_genomes_smcsmc"].split(",")] # The number of msmc Baumwelch(?) iterations to run, # typically 20 +# +# And again, similar for SMCSMC. Number of stochastic EM iterations. +# 15 is typical, but more is good too. Assess for convergence based on +# rainbow plots. num_msmc_iterations = config["num_msmc_iterations"] +num_smcsmc_iterations = config["num_smcsmc_iterations"] + +# Number of particles to use for SMCSMC +# +# A good starting point is 50k, and see if reducing +# significantly impacts the estimates that you are +# recieveing. +num_smcsmc_particles = config["num_smcsmc_particles"] # The number of replicates of each analysis you would like to run replicates = config["replicates"] @@ -297,6 +313,72 @@ rule compound_msmc: run: plots.plot_compound_msmc(model, input, output[0]) +# ############################################################################### +# SMCSMC +# ############################################################################### + +rule ts_to_seg: + input: rules.simulation.output + output: output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg" + run: smcsmc.utils.ts_to_seg(input[0], num_sampled_genomes_smcsmc) + +rule run_smcsmc: + input: + expand(output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg", + chrms=chrm_list, seeds=seed_array, samps=num_sampled_genomes_smcsmc) + output: + output_dir + "/Intermediate/{seeds}/{samps}.run/result.out" + run: + inputs = expand(output_dir+"/Intermediate/{seeds}/{samps}.{chrms}.trees.seg", + seeds=wildcards.seeds, samps=wildcards.samps, chrms=chrm_list) + + input_file_string = " ".join(inputs) + args = { + 'EM': str(num_smcsmc_iterations), + 'Np': str(num_smcsmc_particles), + # Submission Parameters + 'chunks': '100', + 'no_infer_recomb': '', + # Other inference parameters + 'mu': str(species.genome.mean_mutation_rate), + 'N0': '14312', + 'rho': '3e-9', + 'calibrate_lag': '1.0', + 'tmax': '3.5', + 'alpha': '0', + 'apf': '2', + 'P': '133 133016 31*1', + 'VB': '', + 'nsam': str(wildcards.samps), + # This should be in the conda bin + 'smcsmcpath': os.path.expandvars('${CONDA_PREFIX}/bin/smcsmc') + } + args['o'] = output_dir + f"/Intermediate/{wildcards.seeds}/{wildcards.samps}.run" + args['segs'] = input_file_string + + smcsmc.run_smcsmc(args) + +rule convert_smcsmc: + input: rules.run_smcsmc.output + output: output_dir + "/Results/{seeds}/{samps}.run/results.out.csv" + run: smcsmc.popsim.convert_smcsmc_output(input[0], output[0], generation_time, num_smcsmc_iterations) + + +def ne_files_smcsmc(wildcards): + return expand(output_dir + "/Results/{seeds}/{samps}.run/results.out.csv", + seeds=seed_array, samps=num_sampled_genomes_smcsmc) + +rule plot_by_sample: + input: expand(output_dir + "/Results/{seeds}/{{samps}}.run/results.out.csv", seeds=seed_array) + output: output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png" + run: + plots.plot_compound_smcsmc_with_guide(input, output[0], 30, 1, nhaps ={wildcards.samps}, model = model) + +rule compound_smcsmc: + input: expand(output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png", samps = num_sampled_genomes_smcsmc) + + + # ############################################################################### # # ############################################################################### @@ -307,10 +389,11 @@ rule all_plot: f1 = ne_files, f2 = ne_files_smcpp, f3 = ne_files_msmc, + f4 = ne_files_smcsmc, output: output_dir + "/Results/all_estimated_Ne.pdf" - run: - plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, output[0], + run: + plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, input.f4, output[0], model=model, n_samp=num_sampled_genomes_per_replicate, generation_time=generation_time, species=config["species"], pop_id=population_id) diff --git a/n_t/plots.py b/n_t/plots.py index b2a707f..2d1dc19 100644 --- a/n_t/plots.py +++ b/n_t/plots.py @@ -8,6 +8,7 @@ import os import matplotlib.patches as mpatches from matplotlib import pyplot as plt +import stdpopsim import numpy as np # Force matplotlib to not use any Xwindows backend. matplotlib.use('Agg') @@ -59,8 +60,40 @@ def plot_compound_msmc(infiles, outfile): ax.plot(nt['x'], nt['y'], c="red") f.savefig(outfile, bbox_inches='tight') +def plot_compound_smcsmc_with_guide(infiles, outfile, generation_time, pop_id = 0, nhaps = 1, model = None, steps = None): + f, ax = plt.subplots(figsize=(7, 7)) + ax.set(xscale="log", yscale="log") + + if model == "ooa": + model = getattr(stdpopsim.homo_sapiens,"GutenkunstThreePopOutOfAfrica")() + + if model is not None: + ddb = msprime.DemographyDebugger(**model.asdict()) + if steps is None: + end_time = ddb.epochs[-2].end_time + 10000 + steps = np.exp(np.linspace(1,np.log(end_time),31)) + num_samples = [0 for _ in range(ddb.num_populations)] + num_samples[pop_id] = 20 + coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps, + num_samples=num_samples, double_step_validation=False) + steps = steps * generation_time + ax.plot(steps, 1/(2*coal_rate), c="black", drawstyle = 'steps-pre') + + + for infile in infiles: + nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) + ax.step(nt['x'], nt['y'], c="red") + + ax.set_ylim([1e3,1e6]) + ax.set_xlabel('Years before present') + ax.set_ylabel('Effective population size') + h_string = "".join(nhaps) + ax.set_title(f"SMCSMC Estimated Ne ({h_string} samples)") + + f.savefig(outfile, bbox_inches='tight') -def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile, + +def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, smcsmc_infiles, outfile, model, n_samp, generation_time, species, pop_id = 0, steps=None): @@ -73,9 +106,14 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile, coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps, num_samples=num_samples, double_step_validation=False) steps = steps * generation_time + num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles]) + num_smcsmc = set([os.path.basename(infile).split(".")[0] for infile in smcsmc_infiles]) + num_msmc = sorted([int(x) for x in num_msmc]) - f, ax = plt.subplots(1,2+len(num_msmc),sharex=True,sharey=True,figsize=(14, 7)) + num_smcsmc = sorted([int(x) for x in num_msmc]) + + f, ax = plt.subplots(1,2+len(num_msmc) + len(num_smcsmc), sharex=True,sharey=True,figsize=(14, 7)) for infile in smcpp_infiles: nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8) @@ -86,6 +124,7 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile, line2, = ax[1].plot(nt['year'], nt['Ne_median'],alpha=0.8) ax[1].plot(steps, 1/(2*coal_rate), c="black") ax[1].set_title("stairwayplot") + for i,sample_size in enumerate(num_msmc): for infile in msmc_infiles: fn = os.path.basename(infile) @@ -99,6 +138,22 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile, for i in range(2+len(num_msmc)): ax[i].set(xscale="log", yscale="log") ax[i].set_xlabel("time (years ago)") + + for i,sample_size in enumerate(num_smcsmc): + for infile in smcsmc_infiles: + fn = os.path.basename(infile) + samp = fn.split(".")[0] + if(int(samp) == sample_size): + nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0) + line3, = ax[2+len(num_msmc) + i].plot(nt['x'], nt['y'],alpha=0.8) + ax[2+i].plot(steps, 1/(2*coal_rate), c="black") + ax[2+i].set_title(f"smcsmc, ({sample_size} samples)") + plt.suptitle(f"{species}, population id {pop_id}", fontsize = 16) + for i in range(2+len(num_msmc)): + ax[i].set(xscale="log", yscale="log") + ax[i].set_xlabel("time (years ago)") + + red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne') ax[0].legend(frameon=False, fontsize=10, handles=[red_patch]) ax[0].set_ylabel("population size") diff --git a/n_t/readme.md b/n_t/readme.md index 1faf990..065cf2b 100644 --- a/n_t/readme.md +++ b/n_t/readme.md @@ -18,15 +18,16 @@ moment. ## Workflow -The analysis includes three programs for predicting effective population +The analysis includes four programs for predicting effective population size through time(`n_t`): [msmc](https://github.com/stschiff/msmc/issues/23), -[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), and +[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), [smcsmc](https://github.com/luntergroup/smcsmc), and [smc++](https://github.com/popgenmethods/smcpp). -There are four target rules that can be executed with the given parameters: +There are five target rules that can be executed with the given parameters: `compound_msmc`, `compound_smcpp`, `compound_stairwayplot`, +`compound_smcsmc`, or you can run all three on the same simulated data with rule `all`. To run an analysis, create a directory (wherever you want) @@ -37,18 +38,23 @@ might look like this: ```json { - "seed" : 12345, - "population_id" : 0, - "num_sampled_genomes_per_replicate" : 20, - "num_sampled_genomes_msmc" : "2,8", - "num_msmc_iterations" : 20, - "replicates" : 10, - "species" : "homo_sapiens", - "model" : "GutenkunstThreePopOutOfAfrica", - "genetic_map" : "HapmapII_GRCh37", - "chrm_list" : "chr22,chrX", - "ld_thresh" : 0.1, + "seed" : 12345, + "population_id" : 0, + "num_sampled_genomes_per_replicate" : 20, + "num_sampled_genomes_msmc" : "2 8", + "num_sampled_genomes_smcsmc" : "4", + "num_smcsmc_particles": 10000, + "num_msmc_iterations" : 20, + "num_smcsmc_iterations": 15, + "replicates" : 1, + "species" : "homo_sapiens", + "model" : "GutenkunstThreePopOutOfAfrica", + "genetic_map" : "HapmapII_GRCh37", + "chrm_list" : "all", + "generation_time" : 30, + "output_dir": "output" } + ``` Once you have creates a directory which contains the config file