From c0b35c06952ccd165b381008385dafda69dde422 Mon Sep 17 00:00:00 2001
From: Christopher Cole <ccole@well.ox.ac.uk>
Date: Thu, 18 Apr 2019 17:47:08 +0000
Subject: [PATCH] Added smcsmc to the analysis pipeline.

---
 README.md     |  6 ++++
 n_t/Snakefile | 89 +++++++++++++++++++++++++++++++++++++++++++++++++--
 n_t/plots.py  | 59 ++++++++++++++++++++++++++++++++--
 n_t/readme.md | 34 ++++++++++++--------
 4 files changed, 169 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index d8c9e7f..b05f4b7 100644
--- a/README.md
+++ b/README.md
@@ -35,4 +35,10 @@ cat msmc_makefile_stdpopsim_patch > msmc/Makefile && cd msmc && make
 cd ../../
 ```
 
+`smcsmc` can be [installed manually](https://github.com/luntergroup/smcsmc) or through `conda` on linux. 
+
+```sh
+conda install -c luntergroup smcsmc 
+```
+
 Further instructions can be currently found in each task directory
diff --git a/n_t/Snakefile b/n_t/Snakefile
index f06874d..b5dc9af 100644
--- a/n_t/Snakefile
+++ b/n_t/Snakefile
@@ -24,13 +24,14 @@ import smc
 import msmc
 import simulations
 import plots
+import smcsmc
+import smcsmc.popsim
 
 
 # ###############################################################################
 # KNOBS - 
 # ###############################################################################
 
-
 # A seed to replicate results
 # TODO mutation rates
 
@@ -49,11 +50,26 @@ num_sampled_genomes_per_replicate = config["num_sampled_genomes_per_replicate"]
 # Here is a list of sample sizes to run msmc on. 
 # Each element counts as its own analysis
 # so there will be "replicates" runs for each size
+#
+# and a similar setup for SMCSMC
 num_sampled_genomes_msmc = [int(x) for x in config["num_sampled_genomes_msmc"].split(",")]
+num_sampled_genomes_smcsmc = [int(x) for x in config["num_sampled_genomes_smcsmc"].split(",")]
 
 # The number of msmc Baumwelch(?) iterations to run,
 # typically 20
+#
+# And again, similar for SMCSMC. Number of stochastic EM iterations.
+# 15 is typical, but more is good too. Assess for convergence based on 
+# rainbow plots.
 num_msmc_iterations = config["num_msmc_iterations"]
+num_smcsmc_iterations = config["num_smcsmc_iterations"]
+
+# Number of particles to use for SMCSMC
+# 
+# A good starting point is 50k, and see if reducing 
+# significantly impacts the estimates that you are
+# recieveing. 
+num_smcsmc_particles = config["num_smcsmc_particles"]
 
 # The number of replicates of each analysis you would like to run
 replicates = config["replicates"]
@@ -297,6 +313,72 @@ rule compound_msmc:
     run: plots.plot_compound_msmc(model, input, output[0])
 
 
+# ###############################################################################
+#  SMCSMC
+# ###############################################################################
+
+rule ts_to_seg:
+	input: rules.simulation.output
+	output: output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg"
+	run: smcsmc.utils.ts_to_seg(input[0], num_sampled_genomes_smcsmc)
+
+rule run_smcsmc:
+	input:	
+		expand(output_dir + "/Intermediate/{seeds}/{samps}.{chrms}.trees.seg",
+        		chrms=chrm_list, seeds=seed_array, samps=num_sampled_genomes_smcsmc)
+	output: 
+		output_dir + "/Intermediate/{seeds}/{samps}.run/result.out"
+	run: 
+		inputs = expand(output_dir+"/Intermediate/{seeds}/{samps}.{chrms}.trees.seg", 
+			seeds=wildcards.seeds, samps=wildcards.samps, chrms=chrm_list)
+		
+		input_file_string = " ".join(inputs)
+		args = {	
+			'EM': str(num_smcsmc_iterations),
+			'Np': str(num_smcsmc_particles),	
+			# Submission Parameters
+			'chunks': '100',	
+			'no_infer_recomb': '',
+			# Other inference parameters
+			'mu': str(species.genome.mean_mutation_rate),
+			'N0': '14312',
+			'rho': '3e-9',
+			'calibrate_lag': '1.0',
+			'tmax':	'3.5',
+			'alpha': '0',
+			'apf': '2',
+			'P': '133 133016 31*1',
+			'VB': '',
+			'nsam':	str(wildcards.samps),		
+			# This should be in the conda bin
+			'smcsmcpath': os.path.expandvars('${CONDA_PREFIX}/bin/smcsmc')
+		}
+		args['o'] = output_dir + f"/Intermediate/{wildcards.seeds}/{wildcards.samps}.run"
+		args['segs'] = input_file_string
+		
+		smcsmc.run_smcsmc(args)	
+
+rule convert_smcsmc:
+    input: rules.run_smcsmc.output
+    output: output_dir + "/Results/{seeds}/{samps}.run/results.out.csv"
+    run: smcsmc.popsim.convert_smcsmc_output(input[0], output[0], generation_time, num_smcsmc_iterations)
+
+
+def ne_files_smcsmc(wildcards):
+    return expand(output_dir + "/Results/{seeds}/{samps}.run/results.out.csv",
+            seeds=seed_array, samps=num_sampled_genomes_smcsmc)
+
+rule plot_by_sample:
+    input: expand(output_dir + "/Results/{seeds}/{{samps}}.run/results.out.csv", seeds=seed_array)
+    output: output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png"
+    run: 
+    	plots.plot_compound_smcsmc_with_guide(input, output[0], 30, 1, nhaps ={wildcards.samps}, model = model)
+
+rule compound_smcsmc:
+    input: expand(output_dir+"/Results/smcsmc_estimated_Ne_{samps}.png", samps = num_sampled_genomes_smcsmc)
+
+
+
 # ###############################################################################
 # 
 # ###############################################################################
@@ -307,10 +389,11 @@ rule all_plot:
         f1 = ne_files,
         f2 = ne_files_smcpp,
         f3 = ne_files_msmc,
+	f4 = ne_files_smcsmc,
     output:
         output_dir + "/Results/all_estimated_Ne.pdf"
-    run:
-        plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, output[0],
+    run: 
+        plots.plot_all_ne_estimates(input.f1, input.f2, input.f3, input.f4, output[0],
                     model=model, n_samp=num_sampled_genomes_per_replicate,
                     generation_time=generation_time, species=config["species"],
                     pop_id=population_id)
diff --git a/n_t/plots.py b/n_t/plots.py
index b2a707f..2d1dc19 100644
--- a/n_t/plots.py
+++ b/n_t/plots.py
@@ -8,6 +8,7 @@
 import os
 import matplotlib.patches as mpatches
 from matplotlib import pyplot as plt
+import stdpopsim
 import numpy as np
 # Force matplotlib to not use any Xwindows backend.
 matplotlib.use('Agg')
@@ -59,8 +60,40 @@ def plot_compound_msmc(infiles, outfile):
         ax.plot(nt['x'], nt['y'], c="red")
     f.savefig(outfile, bbox_inches='tight')
 
+def plot_compound_smcsmc_with_guide(infiles, outfile, generation_time, pop_id = 0, nhaps = 1, model = None, steps = None):         
+    f, ax = plt.subplots(figsize=(7, 7))
+    ax.set(xscale="log", yscale="log")
+
+    if model == "ooa": 
+        model = getattr(stdpopsim.homo_sapiens,"GutenkunstThreePopOutOfAfrica")() 
+
+    if model is not None: 
+        ddb = msprime.DemographyDebugger(**model.asdict())
+        if steps is None:
+            end_time = ddb.epochs[-2].end_time + 10000
+            steps = np.exp(np.linspace(1,np.log(end_time),31))
+        num_samples = [0 for _ in range(ddb.num_populations)]
+        num_samples[pop_id] = 20
+        coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
+            num_samples=num_samples, double_step_validation=False)
+        steps = steps * generation_time
+        ax.plot(steps, 1/(2*coal_rate), c="black", drawstyle = 'steps-pre')
+
+
+    for infile in infiles:
+        nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
+        ax.step(nt['x'], nt['y'], c="red")
+
+    ax.set_ylim([1e3,1e6])
+    ax.set_xlabel('Years before present')
+    ax.set_ylabel('Effective population size')
+    h_string = "".join(nhaps)
+    ax.set_title(f"SMCSMC Estimated Ne ({h_string} samples)")
+
+    f.savefig(outfile, bbox_inches='tight')
 
-def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
+
+def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, smcsmc_infiles, outfile,
                             model, n_samp, generation_time, species,
                             pop_id = 0, steps=None):
 
@@ -73,9 +106,14 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
     coal_rate, P = ddb.coalescence_rate_trajectory(steps=steps,
         num_samples=num_samples, double_step_validation=False)
     steps = steps * generation_time
+
     num_msmc = set([os.path.basename(infile).split(".")[0] for infile in msmc_infiles])
+    num_smcsmc = set([os.path.basename(infile).split(".")[0] for infile in smcsmc_infiles])
+
     num_msmc = sorted([int(x) for x in num_msmc])
-    f, ax = plt.subplots(1,2+len(num_msmc),sharex=True,sharey=True,figsize=(14, 7))
+    num_smcsmc = sorted([int(x) for x in num_msmc])
+
+    f, ax = plt.subplots(1,2+len(num_msmc) + len(num_smcsmc), sharex=True,sharey=True,figsize=(14, 7))
     for infile in smcpp_infiles:
         nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
         line1, = ax[0].plot(nt['x'], nt['y'], alpha=0.8)
@@ -86,6 +124,7 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
         line2, = ax[1].plot(nt['year'], nt['Ne_median'],alpha=0.8)
     ax[1].plot(steps, 1/(2*coal_rate), c="black")
     ax[1].set_title("stairwayplot")
+
     for i,sample_size in enumerate(num_msmc):
         for infile in msmc_infiles:
             fn = os.path.basename(infile)
@@ -99,6 +138,22 @@ def plot_all_ne_estimates(sp_infiles, smcpp_infiles, msmc_infiles, outfile,
     for i in range(2+len(num_msmc)):
         ax[i].set(xscale="log", yscale="log")
         ax[i].set_xlabel("time (years ago)")
+
+    for i,sample_size in enumerate(num_smcsmc):
+        for infile in smcsmc_infiles:
+            fn = os.path.basename(infile)
+            samp = fn.split(".")[0]
+            if(int(samp) == sample_size):
+                nt = pandas.read_csv(infile, usecols=[1, 2], skiprows=0)
+                line3, = ax[2+len(num_msmc) + i].plot(nt['x'], nt['y'],alpha=0.8)
+        ax[2+i].plot(steps, 1/(2*coal_rate), c="black")
+        ax[2+i].set_title(f"smcsmc, ({sample_size} samples)")
+    plt.suptitle(f"{species}, population id {pop_id}", fontsize = 16)
+    for i in range(2+len(num_msmc)):
+        ax[i].set(xscale="log", yscale="log")
+        ax[i].set_xlabel("time (years ago)")
+
+
     red_patch = mpatches.Patch(color='black', label='Coalescence rate derived Ne')
     ax[0].legend(frameon=False, fontsize=10, handles=[red_patch])
     ax[0].set_ylabel("population size")
diff --git a/n_t/readme.md b/n_t/readme.md
index 1faf990..065cf2b 100644
--- a/n_t/readme.md
+++ b/n_t/readme.md
@@ -18,15 +18,16 @@ moment.
 
 ## Workflow
 
-The analysis includes three programs for predicting effective population 
+The analysis includes four programs for predicting effective population 
 size through time(`n_t`): 
 [msmc](https://github.com/stschiff/msmc/issues/23),
-[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), and
+[stairwayplot](https://sites.google.com/site/jpopgen/stairway-plot), [smcsmc](https://github.com/luntergroup/smcsmc), and
 [smc++](https://github.com/popgenmethods/smcpp).
-There are four target rules that can be executed with the given parameters: 
+There are five target rules that can be executed with the given parameters: 
 `compound_msmc`,
 `compound_smcpp`,
 `compound_stairwayplot`, 
+`compound_smcsmc`,
 or you can run all three on the same simulated data with rule `all`.
 
 To run an analysis, create a directory (wherever you want)
@@ -37,18 +38,23 @@ might look like this:
 
 ```json
 {
-    "seed" : 12345,
-    "population_id" : 0,
-    "num_sampled_genomes_per_replicate" : 20,  
-    "num_sampled_genomes_msmc" : "2,8",
-    "num_msmc_iterations" : 20,
-    "replicates" : 10,
-    "species" : "homo_sapiens",
-    "model" : "GutenkunstThreePopOutOfAfrica",
-    "genetic_map" : "HapmapII_GRCh37",
-    "chrm_list" : "chr22,chrX",
-    "ld_thresh" : 0.1,
+	"seed" : 12345,
+	"population_id" : 0,
+	"num_sampled_genomes_per_replicate" : 20,
+	"num_sampled_genomes_msmc" : "2 8",
+	"num_sampled_genomes_smcsmc" : "4",
+	"num_smcsmc_particles": 10000,
+	"num_msmc_iterations" : 20,
+	"num_smcsmc_iterations": 15,
+	"replicates" : 1,
+	"species" : "homo_sapiens",
+	"model" : "GutenkunstThreePopOutOfAfrica",
+	"genetic_map" : "HapmapII_GRCh37",
+	"chrm_list" : "all",
+	"generation_time" : 30,
+	"output_dir": "output"
 }
+
 ```
 
 Once you have creates a directory which contains the config file