First commit, uploading README and code files

johnomics · Dec 10, 2013 · ae8af88 · ae8af88
commit ae8af88
Show file tree

Hide file tree

Showing 4 changed files with 639 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,41 @@
+This repository contains code and data for the manuscript "Evaluating statistics for the identification of introgressed loci" by Simon H. Martin, John W. Davey and Chris D. Jiggins, available on bioRxiv.
+
+The results in the paper were generated as follows.
+
+Models are generated using the `shared_ancestry_simulator.R` script. A single combined model can be generated like this:
+
+```
+./shared_ancestry_simulator.R -w 10000 -T 60 -c Alternate_t123-0.4_t23-0.2.yml:0.1,Background_t123-0.6_t21-0.4.yml:0.9
+```
+
+This will generate 10000 windows, 10% of which will be generated using the model described in the file `Alternate_t123-0.4_t23-0.2.yml` and 90% of which will be generated using `Background_t123-0.6_t21-0.4.yml`, using 60 threads. See the model files folders for the YAML files generated for this paper.
+
+These YAML files were generated by the `run_model_combinations.pl` script, using the following commands for the 5kb, 10kb and 20kb simulations:
+
+```
+./run_model_combinations.pl -T 60 -n 10000 -w 5000 -p 1000000 -m Model_parameter_list.csv
+./run_model_combinations.pl -T 60 -n 5000 -w 10000 -p 1000000 -m Model_parameter_list.csv
+./run_model_combinations.pl -T 60 -n 2500 -w 20000 -p 1000000 -m Model_parameter_list.csv -s path/to/simulator
+```
+
+Option 'p' defines the effective population size and option 'm' specifies a file containing the list of models to simulate (available in the repository). Option 's' is the directory path to the simulator script, `shared_ancestry_simulator.R`. If 's' is not provided, `run_model_combinations.pl` assumes `shared_ancestry_simulator.R` is in the same folder as itself.
+
+Summary statistics for the models found in the `partition.summary` and `dxy.summary` files were generated as follows:
+
+```
+./generate_summary_statistics.R -m model_files_win10000_size5000 -T 60
+./generate_summary_statistics.R -m model_files_win5000_size10000 -T 60
+./generate_summary_statistics.R -m model_files_win2500_size20000 -T 60
+```
+
+Real data for the Heliconius genome, split into 5 kb windows, can be found in the file `Heliconius_genome_windows.csv`.
+
+The figures were generated using these summary statistics files, using the following scripts:
+
+```
+./Figure_2.R -i model_files_win10000_size5000/Alternate_t123-1.2_t23-0.2.Background_t123-2_t21-1.2.10000.csv
+./Figure_6_S5.R
+./Figure_7.R -i model_files_win10000_size5000.dxy.summary.tsv
+./Figure_S1_S2.R -i model_files_win10000_size5000.partition.summary.tsv
+./Figure_S3_S4.R
+```
diff --git a/generate_summary_statistics.R b/generate_summary_statistics.R
@@ -0,0 +1,147 @@
+#!/usr/bin/env Rscript
+
+# generate_summary_statistics.R
+# Generate dXY and partitioning statistics for models generated using run_model_combinations.pl
+
+# Written for "Evaluating statistics for the identification of introgressed loci"
+# by Simon H. Martin, John W. Davey and Chris D. Jiggins
+# John Davey:   [email protected]
+# Simon Martin: [email protected]
+# November-December 2013
+
+library(parallel)
+suppressMessages(library(reshape))
+library(plyr)
+library(optparse)
+library(stringr)
+library(tools)
+
+options<-list(make_option(c("-T","--threads"),default=1,help="Number of threads [default %default]",metavar="numthreads"),
+              make_option(c("-m","--modelfiles"),default="",type="character",help="Folder of CSV files with model output"),
+              make_option(c("-r","--realdata"),default="",type="character",help="CSV file containing windows from real genome"),
+              make_option(c("-l","--listofmodels"),default="",type="character",help="CSV file of model parameter combinations")
+         )
+
+opt<-parse_args(OptionParser(option_list=options))
+
+if (opt$listofmodels == "") stop("Please specify a list of model parameter combinations in CSV format with -l")
+if (opt$modelfiles == "" & opt$realdata == "") stop("Please specify a folder of model files with -m or a CSV file containing real data with -r")
+if (opt$modelfiles != "" & !file.exists(opt$modelfiles)) stop(paste("Directory", opt$modelfiles, "does not exist! Please specify a valid directory with -m"))
+if (opt$realdata != "" & !file.exists(opt$realdata)) stop(paste("Real data file", opt$realdata, "does not exist! Please specify a valid file with -r"))
+
+partition.x<-function(model,stat,threshold=0.1,win) {
+    outnum<-threshold*win
+    Partition<-sapply(rank(-model[,stat]), function(x) if (x<=outnum) "_Outlier" else "_Background")
+    Partition<-paste0(stat,Partition)
+    model<-cbind(model,Partition)
+    names(model)[names(model)=="Partition"]<-paste0(stat,"_Partition")
+    model
+}
+
+run.tests<-function(values,groups,test.name,test.func) {
+    # Assumes order of groups is Background, Outlier
+    formula<-values~groups
+    two.tailed<-test.func(formula,alternative="two.sided",conf.int=TRUE)
+    if (length(two.tailed$estimate)==1) mean.diff<-two.tailed$estimate else mean.diff<-two.tailed$estimate[1] - two.tailed$estimate[2]
+    out<-data.frame(
+        mean.diff=mean.diff,
+        p.two.tailed=two.tailed$p.value,
+        p.background.higher=test.func(formula,alternative="greater")$p.value,
+        p.background.lower=test.func(formula,alternative="less")$p.value
+    )
+    names(out)<-paste(test.name,names(out),sep=".")
+    out
+}
+
+summarise.tests<-function(model.sum,model.stats,model.partition,test.name,test.func) {
+    model.tests<-t(apply(model.stats,2,function(x) unlist(run.tests(x,model.partition,test.name,test.func))))
+    model.tests<-cbind(Variable=rownames(model.tests),model.tests)
+    merge(model.sum,model.tests,by="Variable",sort=FALSE)
+}
+
+get.model.summary<-function(models,column,pars=NULL) {
+    model.stats<-models[c("P1P2_dxy","P1P3_dxy","P2P3_dxy")]
+    model.partition<-models[,column]
+
+    model.means<-melt(aggregate(model.stats,list(model.partition),mean,na.rm=TRUE),id="Group.1")
+    model.sds<-melt(aggregate(model.stats,list(model.partition),sd,na.rm=TRUE),id="Group.1")
+    model.ses<-melt(aggregate(model.stats,list(model.partition), function(x) {x<-na.omit(x);sqrt(var(x)/length(x))}),id="Group.1")
+    model.sum<-cbind(pars,model.means,model.sds$value,model.ses$value)
+    names(model.sum)<-c(names(pars),"Model","Variable","Mean","SD","SE")
+
+    model.sum<-summarise.tests(model.sum,model.stats,model.partition,"t",t.test)
+    model.sum<-summarise.tests(model.sum,model.stats,model.partition,"w",wilcox.test)
+
+    model.sum
+}
+
+calc.mfD0<-function(df) {
+    apply(df,1,function(x) if ((is.na(x["D"])) || (as.numeric(x["D"])<0)) NA else as.numeric(x["mf"]))
+}
+
+
+summarise.model<-function(filename) {
+    pars<-data.frame(str_match(filename,"Alternate_t123-(.+)_t23-(.+)\\.Background_t123-(.+)_t21-(.+)\\.(.+)\\.csv"),stringsAsFactors=FALSE)
+    names(pars)<-c("File","Alternate_t123","Alternate_t23","Background_t123","Background_t21","Windows")
+    pars$Windows<-as.numeric(pars$Windows)
+
+    models<-read.csv(filename)
+
+    models$Model<-factor(str_match(models$Model,"(.+)_t123")[,2],levels=c("Background","Alternate"))
+    models$Model<-revalue(models$Model,c(Background="Real_Background",Alternate="Real_Outlier"))
+
+    models$mfD0<-calc.mfD0(models)
+
+    models<-partition.x(models,"D",0.1,pars$Windows)
+    models<-partition.x(models,"mfD0",0.1,pars$Windows)
+
+    model.summary<-get.model.summary(models,"Model",pars)
+    D.summary<-get.model.summary(models,"D_Partition",pars)
+    mfD0.summary<-get.model.summary(models,"mfD0_Partition",pars)
+
+    model.sum<-rbind(model.summary,D.summary,mfD0.summary)
+
+    is.D.positive<-models$D>=0
+    model.comp<-data.frame(pars,
+        Stat=c("D","mfD0"),
+        Stat.Mean=c(mean(models$D,na.rm=TRUE)*100,mean(models$mfD0,na.rm=TRUE)*100),
+        Stat.SD=c(sd(models$D,na.rm=TRUE)*100,sd(models$mfD0,na.rm=TRUE)*100),
+        Dpos=sum(is.D.positive,na.rm=TRUE),
+        DposAlternate=sum(is.D.positive & models$Model=="Real_Outlier",na.rm=TRUE),
+        DposOutlier=c(sum(is.D.positive & models$D_Partition=="D_Outlier", na.rm=TRUE),
+                      sum(is.D.positive & models$mfD0_Partition=="mfD0_Outlier", na.rm=TRUE)),
+        OutlierAlternate=c(sum(models$Model=="Real_Outlier" & models$D_Partition=="D_Outlier",na.rm=TRUE),
+                            sum(models$Model=="Real_Outlier" & models$mfD0_Partition=="mfD0_Outlier",na.rm=TRUE))
+    )
+    model.comp$OutlierAlternatePC<-model.comp$OutlierAlternate/model.comp$DposOutlier*100
+
+    list(model.sum,model.comp)
+}
+
+options(width=200)
+
+if (opt$modelfiles != "") {
+    csv.summary<-mclapply(dir(opt$modelfiles,"*.csv",full.names=TRUE),summarise.model,mc.cores=opt$threads)
+
+    models<-rbind.fill(lapply(csv.summary,function(x) x[[1]]))
+    model.comp<-rbind.fill(lapply(csv.summary,function(x) x[[2]]))
+
+    model.list<-read.csv(opt$listofmodels,stringsAsFactors=FALSE)
+
+    models<-merge(models,unique(model.list[,c("Background_t123","Background_t21","Alternate_t123","Alternate_t23","ModelType")]),by=c("Background_t123","Background_t21","Alternate_t123","Alternate_t23"))
+    write.table(models,file=paste0(opt$modelfiles,".dxy.summary.tsv"),sep="\t",row.names=FALSE)
+
+    model.comp<-merge(model.comp,unique(model.list[,c("Background_t123","Background_t21","Alternate_t123","Alternate_t23","ModelType")]),by=c("Background_t123","Background_t21","Alternate_t123","Alternate_t23"))
+    write.table(model.comp,file=paste0(opt$modelfiles,".partition.summary.tsv"),sep="\t",row.names=FALSE)
+}
+
+if (opt$realdata != "") {
+    real<-read.csv(opt$realdata)
+    real$mfD0<-calc.mfD0(real)
+    real<-partition.x(real,"D",0.1,nrow(real))
+    real<-partition.x(real,"mfD0",0.1,nrow(real))
+
+    pars<-data.frame(File=opt$realdata,Windows=nrow(real))
+    real.stats<-rbind(get.model.summary(real,"D_Partition",pars),get.model.summary(real,"mfD0_Partition",pars))
+    write.table(real.stats,file=paste0(file_path_sans_ext(basename(opt$realdata)),".dxy.summary.tsv"),sep="\t",row.names=FALSE)
+}
diff --git a/run_model_combinations.pl b/run_model_combinations.pl
@@ -0,0 +1,126 @@
+#!/usr/bin/env perl
+
+# run_model_combinations.pl
+# Run a set of combined models using shared_ancestry_simulator.R based on a CSV file of model parameter combinations
+
+# Written for "Evaluating statistics for the identification of introgressed loci"
+# by Simon H. Martin, John W. Davey and Chris D. Jiggins
+# John Davey:   [email protected]
+# Simon Martin: [email protected]
+# November-December 2013
+
+use strict;
+use warnings;
+use Carp;
+use English;
+use Data::Dumper;
+use Getopt::Long;
+
+
+my $numwindows = 1000;
+my $windowsize = 5000;
+my $modelfile = "";
+my $threads = 1;
+my $popsize = 1000000;
+my $simulator_path = "..";
+my $options_okay = GetOptions(
+	'numwindows=i' => \$numwindows,
+	'windowsize=i' => \$windowsize,
+	'modelfile=s' => \$modelfile,
+	'threads=i'   => \$threads,
+    'popsize=i'   => \$popsize,
+    'simulator_path=s' => \$simulator_path,
+);
+
+croak "Please supply a CSV file containing model parameters with -m" if $modelfile eq "";
+
+croak "Simulator path $simulator_path does not exist!\n" if !-d $simulator_path;
+
+my $model = <<EOM;
+seqgen:
+    m: HKY
+    l: $windowsize
+ms:
+    pops:
+        - {name: 1, seqs: 8}
+        - {name: 2, seqs: 8}
+        - {name: 3, seqs: 8}
+        - {name: 4, seqs: 8}
+    opts:
+        genperyear: 4
+        n0: $popsize
+        mu: 2.5e-9
+        split:
+            - {time: 3,   pops: [4,1]}
+EOM
+
+sub output_models {
+    my ($model, $modeltype, $params) = @_;
+    my $pops = $modeltype eq "Background" ? "2,1" : $modeltype eq "Alternate" ? "2,3" : "0,0";
+    my $printpops = $pops;
+    $printpops =~ s/,//g;
+    foreach my $t123 (sort {$a<=>$b} keys %{$params}) {
+        my $model_t123 = $model . "            - {time: $t123, pops: [3,1]}\n";
+        foreach my $txx (keys %{$params->{$t123}}) {
+            open my $model_file, ">", "$modeltype\_t123-$t123\_t$printpops-$txx.yml" or croak "Can't open model file: $OS_ERROR!\n";
+            my $model_t123_txx = $model_t123 . "            - {time: $txx, pops: [$pops]}\n";
+            print $model_file $model_t123_txx;
+            close $model_file;
+        }
+    }
+}
+
+open my $combined, "<", $modelfile or croak "Can't open model file: $OS_ERROR!\n";
+
+my $header = <$combined>;
+chomp $header;
+my @header = split /,/, $header;
+my %background;
+my %alternate;
+my @combinations;
+while (my $model = <$combined>) {
+    chomp $model;
+    my @f = split /,/, $model;
+    $background{$f[1]}{$f[2]}++;
+    $alternate{$f[3]}{$f[4]}++;
+    my %comb;
+	$comb{type}=$f[0];
+    map {
+        my ($type, $var) = split /_/, $header[$_];
+        $comb{pars}{$type}{$var} = $f[$_]
+    } 1..$#header;
+    push @combinations, \%comb;
+}
+
+close $combined;
+
+my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime();
+$mon++; # originally in 0..11
+$year+=1900;
+my $outdir = "model_files_win$numwindows\_size$windowsize\_ne$popsize\_$year-$mon-$mday";
+mkdir($outdir) if !-d $outdir;
+chdir($outdir);
+
+open my $log, ">", "$outdir.log" or croak "Can't open log file! $OS_ERROR\n";
+
+output_models($model, "Background", \%background);
+output_models($model, "Alternate", \%alternate);
+
+
+foreach my $combination (@combinations) {
+    my $command = "$simulator_path/shared_ancestry_simulator.R -w $numwindows -T $threads -c ";
+    foreach my $model_type (keys %{$combination->{pars}}) {
+        my $prop = $model_type eq "Background" ? 0.9 : 0.1;
+        $command .= $model_type . "_t123-";
+        $command .= $combination->{pars}{$model_type}{t123};
+        $command .= "_";
+        delete $combination->{pars}{$model_type}{t123};
+        my $var2 = (keys %{$combination->{pars}{$model_type}})[0];
+        $command .= "$var2-$combination->{pars}{$model_type}{$var2}.yml:$prop,";
+    }
+    chop $command;
+    print $log "$command\t" . localtime() . "\n";
+    my $simout = `$command`;
+    croak "Couldn't run command: $command\n(Is shared_ancestry_simulator.R in your path?)\n" if !defined $simout;
+    croak "Command failed: $command\n$simout\n" if ($simout ne "");
+}