Skip to content

Commit

Permalink
Added smcsmc to the analysis pipeline.
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris1221 committed Jun 24, 2019
1 parent b61ce4c commit e9a11c5
Show file tree
Hide file tree
Showing 5 changed files with 174 additions and 20 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,10 @@ cat msmc_makefile_stdpopsim_patch > msmc/Makefile && cd msmc && make
cd ../../
```
`smcsmc` can be [installed manually](https://github.com/luntergroup/smcsmc) or through `conda` on linux.
```sh
conda install -c luntergroup smcsmc
```

Further instructions can currently be found in each task directory.
91 changes: 88 additions & 3 deletions n_t/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,14 @@ import smc
import msmc
import simulations
import plots
import smcsmc
import smcsmc.popsim


# ###############################################################################
# KNOBS -
# ###############################################################################


# A seed to replicate results
# TODO mutation rates

Expand All @@ -48,11 +49,26 @@ num_sampled_genomes_per_replicate = config["num_sampled_genomes_per_replicate"]
# Here is a list of sample sizes to run msmc on.
# Each element counts as its own analysis
# so there will be "replicates" runs for each size
#
# and a similar setup for SMCSMC
num_sampled_genomes_msmc = [int(x) for x in config["num_sampled_genomes_msmc"].split()]
num_sampled_genomes_smcsmc = [int(x) for x in config["num_sampled_genomes_smcsmc"].split()]

# The number of msmc Baum-Welch (EM) iterations to run,
# typically 20
#
# And again, similar for SMCSMC. Number of stochastic EM iterations.
# 15 is typical, but more is good too. Assess for convergence based on
# rainbow plots.
num_msmc_iterations = config["num_msmc_iterations"]
num_smcsmc_iterations = config["num_smcsmc_iterations"]

# Number of particles to use for SMCSMC
#
# A good starting point is 50k, and see if reducing
# significantly impacts the estimates that you are
# receiving.
num_smcsmc_particles = config["num_smcsmc_particles"]

# The number of replicates of each analysis you would like to run
replicates = config["replicates"]
Expand Down Expand Up @@ -92,6 +108,7 @@ mutation_rate = species.genome.mean_mutation_rate


seed_array = np.random.random_integers(1,2**31,replicates)
#seed_array=np.array([1675701734])
genetic_map_downloaded_flag= ".genetic_map_downloaded"
msmc_exec = "../extern/msmc/build/msmc"
stairwayplot_code = "stairwayplot/swarmops.jar"
Expand Down Expand Up @@ -273,6 +290,73 @@ rule compound_msmc:
run: plots.plot_compound_msmc(model, input, output[0])


# ###############################################################################
# SMCSMC
# ###############################################################################

# Convert a simulated tree sequence into the `.trees.seg` input consumed by
# SMCSMC. One job is created per (seed, sample size, chromosome) combination.
# NOTE(review): the converter is given the full list of configured SMCSMC
# sample sizes and is presumed to write the declared output itself -- confirm
# against the smcsmc.utils.ts_to_seg documentation.
rule ts_to_seg:
    input:
        rules.simulation.output
    output:
        output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg"
    run:
        tree_sequence_path = input[0]
        smcsmc.utils.ts_to_seg(tree_sequence_path, num_sampled_genomes_smcsmc)

# Run SMCSMC once per (seed, sample size) over all chromosomes of that
# replicate.
#
# The {seeds} and {samps} wildcards are escaped in the input pattern so that
# each job depends only on the .seg files of its own replicate and sample
# size; only the chromosome list is expanded. (Expanding over every seed and
# sample size here would make every job wait for, and re-run on changes to,
# the inputs of all other replicates.)
rule run_smcsmc:
    input:
        expand(output_dir + "/Intermediate/{{seeds}}/{{samps}}.{chrms}.trees.seg",
               chrms=chrm_list)
    output:
        output_dir + "/Intermediate/{seeds}/{samps}.run/result.out"
    run:
        # `input` already holds exactly this job's per-chromosome .seg files,
        # in chrm_list order; smcsmc takes them as one space-separated string.
        input_file_string = " ".join(str(seg_file) for seg_file in input)

        args = {
            'EM': str(num_smcsmc_iterations),
            'Np': str(num_smcsmc_particles),
            # Submission Parameters
            'chunks': '100',
            'c': '',
            'no_infer_recomb': '',
            # Other inference parameters
            'mu': str(species.genome.mean_mutation_rate),
            # NOTE(review): N0, rho and the P pattern are hard-coded here
            # rather than derived from the configured species/model -- confirm
            # they are appropriate before running on other species.
            'N0': '14312',
            'rho': '3e-9',
            'calibrate_lag': '1.0',
            'tmax': '3.5',
            'alpha': '0',
            'apf': '2',
            'P': '133 133016 31*1',
            'VB': '',
            'nsam': str(wildcards.samps),
            # This should be in the conda bin
            'smcsmcpath': os.path.expandvars('${CONDA_PREFIX}/bin/smcsmc')
        }
        # Output directory of this run; the declared output `result.out`
        # lives inside it.
        args['o'] = output_dir + f"/Intermediate/{wildcards.seeds}/{wildcards.samps}.run"
        args['segs'] = input_file_string

        smcsmc.run_smcsmc(args)

# Convert the raw SMCSMC result of one run into the CSV read by the plotting
# code, passing the configured generation time and number of EM iterations
# to the converter.
rule convert_smcsmc:
    input:
        rules.run_smcsmc.output
    output:
        output_dir + "/Results/{seeds}/{samps}.run/results.out.csv"
    run:
        raw_result, converted_csv = input[0], output[0]
        smcsmc.popsim.convert_smcsmc_output(
            raw_result, converted_csv, generation_time, num_smcsmc_iterations)


def ne_files_smcsmc(wildcards):
    """Return the converted SMCSMC result CSVs for every replicate and sample size.

    Used as an input function, so it receives the rule's ``wildcards``
    (unused here). The path pattern contains both ``{seeds}`` and ``{samps}``,
    so both must be supplied to ``expand``; leaving ``samps`` out makes
    ``expand`` fail because the wildcard has no values.
    """
    return expand(output_dir + "/Results/{seeds}/{samps}.run/results.out.csv",
                  seeds=seed_array, samps=num_sampled_genomes_smcsmc)

# Plot the SMCSMC Ne estimates of all replicates for one sample size, together
# with the coalescence-rate guide derived from the model.
#
# The generation time and population id come from the workflow configuration
# (as in the other plotting rules) instead of being hard-coded, and the sample
# size is passed as the plain wildcard string rather than wrapped in a set.
rule plot_by_sample:
    input:
        expand(output_dir + "/Results/{seeds}/{{samps}}.run/results.out.csv", seeds=seed_array)
    output:
        output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png"
    run:
        plots.plot_compound_smcsmc_with_guide(
            input, output[0], generation_time, population_id,
            nhaps=wildcards.samps, model=model)

# Target rule: requests one SMCSMC Ne plot per configured sample size.
rule compound_smcsmc:
    input:
        expand(
            output_dir + "/Results/smcsmc_estimated_Ne_{samps}.png",
            samps=num_sampled_genomes_smcsmc,
        )



# ###############################################################################
#
# ###############################################################################
Expand All @@ -283,10 +367,11 @@ rule all_plot:
f1 = ne_files,
f2 = ne_files_smcpp,
f3 = ne_files_msmc,
f4 = ne_files_smcsmc,
output:
output_dir + "/Results/all_estimated_Ne.pdf"
run:
plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, output[0],
run:
plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, input.f4, output[0],
model=model, n_samp=num_sampled_genomes_per_replicate,
generation_time=generation_time, species=config["species"],
pop_id=population_id)
Expand Down
59 changes: 57 additions & 2 deletions n_t/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
import matplotlib.patches as mpatches
from matplotlib import pyplot as plt
import stdpopsim
import numpy as np
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
Expand Down Expand Up @@ -59,8 +60,40 @@ def plot_compound_msmc(infiles, outfile):
ax.plot(nt['x'], nt['y'], c="red")
f.savefig(outfile, bbox_inches='tight')

def plot_compound_smcsmc_with_guide(infiles, outfile, generation_time, pop_id=0, nhaps=1, model=None, steps=None):
    """Plot SMCSMC Ne estimates, optionally over a model-derived guide line.

    Parameters
    ----------
    infiles : iterable of str
        CSV files to plot; columns 1 and 2 are read and must be named
        ``x`` and ``y``.
    outfile : str
        Path the figure is saved to.
    generation_time : number
        Multiplier applied to the guide's time steps before plotting.
    pop_id : int
        Index of the population the guide's coalescence rate is computed for.
    nhaps : int, str or iterable
        Sample-size label used in the title. An int or str is used as is;
        any other iterable has its elements concatenated.
    model : object, "ooa" or None
        Demographic model exposing ``asdict()``. The string ``"ooa"`` selects
        ``GutenkunstThreePopOutOfAfrica``. ``None`` skips the guide line.
    steps : array-like or None
        Time steps for the guide; derived from the model when ``None``.
    """
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(xscale="log", yscale="log")

    if model == "ooa":
        model = stdpopsim.homo_sapiens.GutenkunstThreePopOutOfAfrica()

    if model is not None:
        ddb = msprime.DemographyDebugger(**model.asdict())
        if steps is None:
            # Extend 10000 time units past the end of the second-to-last epoch.
            end_time = ddb.epochs[-2].end_time + 10000
            steps = np.exp(np.linspace(1, np.log(end_time), 31))
        num_samples = [0 for _ in range(ddb.num_populations)]
        num_samples[pop_id] = 20
        coal_rate, P = ddb.coalescence_rate_trajectory(
            steps=steps, num_samples=num_samples, double_step_validation=False)
        steps = steps * generation_time
        ax.plot(steps, 1 / (2 * coal_rate), c="black", drawstyle='steps-pre')

    for infile in infiles:
        nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
        ax.step(nt['x'], nt['y'], c="red")

    ax.set_ylim([1e3, 1e6])
    ax.set_xlabel('Years before present')
    ax.set_ylabel('Effective population size')

    # "".join() only accepts iterables of strings, so the int default (and any
    # int sample size) must be handled separately.
    if isinstance(nhaps, (str, int)):
        h_string = str(nhaps)
    else:
        h_string = "".join(str(h) for h in nhaps)
    ax.set_title(f"SMCSMC Estimated Ne ({h_string} samples)")

    f.savefig(outfile, bbox_inches='tight')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(f)

def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,

def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, smcsmc_infiles, outfile,
model, n_samp, generation_time, species,
pop_id = 0, steps=None):

Expand All @@ -73,9 +106,14 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
num_samples=num_samples, double_step_validation=False)
steps = steps * generation_time

num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles])
num_smcsmc = set([os.path.basename(infile).split(".")[0] for infile in smcsmc_infiles])

num_msmc = sorted([int(x) for x in num_msmc])
f, ax = plt.subplots(1,2+len(num_msmc),sharex=True,sharey=True,figsize=(14, 7))
num_smcsmc = sorted([int(x) for x in num_msmc])

f, ax = plt.subplots(1,2+len(num_msmc) + len(num_smcsmc), sharex=True,sharey=True,figsize=(14, 7))
for infile in smcpp_infiles:
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8)
Expand All @@ -86,6 +124,7 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
line2, = ax[1].plot(nt['year'], nt['Ne_median'],alpha=0.8)
ax[1].plot(steps, 1/(2*coal_rate), c="black")
ax[1].set_title("stairwayplot")

for i,sample_size in enumerate(num_msmc):
for infile in msmc_infiles:
fn = os.path.basename(infile)
Expand All @@ -99,6 +138,22 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
for i in range(2+len(num_msmc)):
ax[i].set(xscale="log", yscale="log")
ax[i].set_xlabel("time (years ago)")

for i,sample_size in enumerate(num_smcsmc):
for infile in smcsmc_infiles:
fn = os.path.basename(infile)
samp = fn.split(".")[0]
if(int(samp) == sample_size):
nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
line3, = ax[2+len(num_msmc) + i].plot(nt['x'], nt['y'],alpha=0.8)
ax[2+i].plot(steps, 1/(2*coal_rate), c="black")
ax[2+i].set_title(f"smcsmc, ({sample_size} samples)")
plt.suptitle(f"{species}, population id {pop_id}", fontsize = 16)
for i in range(2+len(num_msmc)):
ax[i].set(xscale="log", yscale="log")
ax[i].set_xlabel("time (years ago)")


red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne')
ax[0].legend(frameon=False, fontsize=10, handles=[red_patch])
ax[0].set_ylabel("population size")
Expand Down
34 changes: 20 additions & 14 deletions n_t/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@ moment.

## Workflow

The analysis includes three programs for predicting effective population
The analysis includes four programs for predicting effective population
size through time(`n_t`):
[msmc](https://github.com/stschiff/msmc/issues/23),
[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), and
[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), [smcsmc](https://github.com/luntergroup/smcsmc), and
[smc++](https://github.com/popgenmethods/smcpp).
There are four target rules that can be executed with the given parameters:
There are five target rules that can be executed with the given parameters:
`compound_msmc`,
`compound_smcpp`,
`compound_stairwayplot`,
`compound_smcsmc`,
or you can run all four on the same simulated data with rule `all`.

To run an analysis, create a directory (wherever you want)
Expand All @@ -37,18 +38,23 @@ might look like this:

```json
{
"seed" : 12345,
"population_id" : 0,
"num_sampled_genomes_per_replicate" : 20,
"num_sampled_genomes_msmc" : "2 8",
"num_msmc_iterations" : 20,
"replicates" : 10,
"species" : "homo_sapiens",
"model" : "GutenkunstThreePopOutOfAfrica",
"genetic_map" : "HapmapII_GRCh37",
"chrm_list" : "chr22,chrX",
"generation_time" : 25,
"seed" : 12345,
"population_id" : 0,
"num_sampled_genomes_per_replicate" : 20,
"num_sampled_genomes_msmc" : "2 8",
"num_sampled_genomes_smcsmc" : "4",
"num_smcsmc_particles": 10000,
"num_msmc_iterations" : 20,
"num_smcsmc_iterations": 15,
"replicates" : 1,
"species" : "homo_sapiens",
"model" : "GutenkunstThreePopOutOfAfrica",
"genetic_map" : "HapmapII_GRCh37",
"chrm_list" : "all",
"generation_time" : 30,
"output_dir": "output"
}

```

Once you have created a directory which contains the config file
Expand Down
4 changes: 3 additions & 1 deletion n_t/simulations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import msprime
import os
from stdpopsim import homo_sapiens


Expand All @@ -17,4 +18,5 @@ def simulate(out_path, species, model, genetic_map, seed, chrmStr,
mutation_rate=chrom.default_mutation_rate,
random_seed=seed,
**model.asdict())
ts.dump(out_path)
ts.dump(out_path)

0 comments on commit e9a11c5

Please sign in to comment.