Skip to content

Commit

Permalink
First commit, uploading README and code files
Browse files Browse the repository at this point in the history
  • Loading branch information
johnomics committed Dec 10, 2013
0 parents commit ae8af88
Show file tree
Hide file tree
Showing 4 changed files with 639 additions and 0 deletions.
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
This repository contains code and data for the manuscript "Evaluating statistics for the identification of introgressed loci" by Simon H. Martin, John W. Davey and Chris D. Jiggins, available on bioRxiv.

The results in the paper were generated as follows.

Models are generated using the `shared_ancestry_simulator.R` script. A single combined model can be generated like this:

```
./shared_ancestry_simulator.R -w 10000 -T 60 -c Alternate_t123-0.4_t23-0.2.yml:0.1,Background_t123-0.6_t21-0.4.yml:0.9
```

This will generate 10000 windows, 10% of which will be generated using the model described in the file `Alternate_t123-0.4_t23-0.2.yml` and 90% of which will be generated using `Background_t123-0.6_t21-0.4.yml`, using 60 threads. See the model files folders for the YAML files generated for this paper.

These YAML files were generated by the `run_model_combinations.pl` script, using the following commands for the 5kb, 10kb and 20kb simulations:

```
./run_model_combinations.pl -T 60 -n 10000 -w 5000 -p 1000000 -m Model_parameter_list.csv
./run_model_combinations.pl -T 60 -n 5000 -w 10000 -p 1000000 -m Model_parameter_list.csv
./run_model_combinations.pl -T 60 -n 2500 -w 20000 -p 1000000 -m Model_parameter_list.csv -s path/to/simulator
```

Option 'p' defines the effective population size and option 'm' specifies a file containing the list of models to simulate (available in the repository). Option 's' is the directory path to the simulator script, `shared_ancestry_simulator.R`. If 's' is not provided, `run_model_combinations.pl` assumes `shared_ancestry_simulator.R` is in the same folder as itself.

Summary statistics for the models found in the `partition.summary` and `dxy.summary` files were generated as follows:

```
./generate_summary_statistics.R -m model_files_win10000_size5000 -T 60
./generate_summary_statistics.R -m model_files_win5000_size10000 -T 60
./generate_summary_statistics.R -m model_files_win2500_size20000 -T 60
```

Real data for the Heliconius genome, split into 5 kb windows, can be found in the file `Heliconius_genome_windows.csv`.

The figures were generated using these summary statistics files, using the following scripts:

```
./Figure_2.R -i model_files_win10000_size5000/Alternate_t123-1.2_t23-0.2.Background_t123-2_t21-1.2.10000.csv
./Figure_6_S5.R
./Figure_7.R -i model_files_win10000_size5000.dxy.summary.tsv
./Figure_S1_S2.R -i model_files_win10000_size5000.partition.summary.tsv
./Figure_S3_S4.R
```
147 changes: 147 additions & 0 deletions generate_summary_statistics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#!/usr/bin/env Rscript

# generate_summary_statistics.R
# Generate dXY and partitioning statistics for models generated using run_model_combinations.pl

# Written for "Evaluating statistics for the identification of introgressed loci"
# by Simon H. Martin, John W. Davey and Chris D. Jiggins
# John Davey: [email protected]
# Simon Martin: [email protected]
# November-December 2013

library(parallel)
suppressMessages(library(reshape))
library(plyr)
library(optparse)
library(stringr)
library(tools)

options<-list(make_option(c("-T","--threads"),default=1,help="Number of threads [default %default]",metavar="numthreads"),
make_option(c("-m","--modelfiles"),default="",type="character",help="Folder of CSV files with model output"),
make_option(c("-r","--realdata"),default="",type="character",help="CSV file containing windows from real genome"),
make_option(c("-l","--listofmodels"),default="",type="character",help="CSV file of model parameter combinations")
)

opt<-parse_args(OptionParser(option_list=options))

if (opt$listofmodels == "") stop("Please specify a list of model parameter combinations in CSV format with -l")
if (opt$modelfiles == "" & opt$realdata == "") stop("Please specify a folder of model files with -m or a CSV file containing real data with -r")
if (opt$modelfiles != "" & !file.exists(opt$modelfiles)) stop(paste("Directory", opt$modelfiles, "does not exist! Please specify a valid directory with -m"))
if (opt$realdata != "" & !file.exists(opt$realdata)) stop(paste("Real data file", opt$realdata, "does not exist! Please specify a valid file with -r"))

partition.x<-function(model,stat,threshold=0.1,win) {
outnum<-threshold*win
Partition<-sapply(rank(-model[,stat]), function(x) if (x<=outnum) "_Outlier" else "_Background")
Partition<-paste0(stat,Partition)
model<-cbind(model,Partition)
names(model)[names(model)=="Partition"]<-paste0(stat,"_Partition")
model
}

run.tests<-function(values,groups,test.name,test.func) {
# Assumes order of groups is Background, Outlier
formula<-values~groups
two.tailed<-test.func(formula,alternative="two.sided",conf.int=TRUE)
if (length(two.tailed$estimate)==1) mean.diff<-two.tailed$estimate else mean.diff<-two.tailed$estimate[1] - two.tailed$estimate[2]
out<-data.frame(
mean.diff=mean.diff,
p.two.tailed=two.tailed$p.value,
p.background.higher=test.func(formula,alternative="greater")$p.value,
p.background.lower=test.func(formula,alternative="less")$p.value
)
names(out)<-paste(test.name,names(out),sep=".")
out
}

summarise.tests<-function(model.sum,model.stats,model.partition,test.name,test.func) {
model.tests<-t(apply(model.stats,2,function(x) unlist(run.tests(x,model.partition,test.name,test.func))))
model.tests<-cbind(Variable=rownames(model.tests),model.tests)
merge(model.sum,model.tests,by="Variable",sort=FALSE)
}

get.model.summary<-function(models,column,pars=NULL) {
model.stats<-models[c("P1P2_dxy","P1P3_dxy","P2P3_dxy")]
model.partition<-models[,column]

model.means<-melt(aggregate(model.stats,list(model.partition),mean,na.rm=TRUE),id="Group.1")
model.sds<-melt(aggregate(model.stats,list(model.partition),sd,na.rm=TRUE),id="Group.1")
model.ses<-melt(aggregate(model.stats,list(model.partition), function(x) {x<-na.omit(x);sqrt(var(x)/length(x))}),id="Group.1")
model.sum<-cbind(pars,model.means,model.sds$value,model.ses$value)
names(model.sum)<-c(names(pars),"Model","Variable","Mean","SD","SE")

model.sum<-summarise.tests(model.sum,model.stats,model.partition,"t",t.test)
model.sum<-summarise.tests(model.sum,model.stats,model.partition,"w",wilcox.test)

model.sum
}

calc.mfD0<-function(df) {
apply(df,1,function(x) if ((is.na(x["D"])) || (as.numeric(x["D"])<0)) NA else as.numeric(x["mf"]))
}


summarise.model<-function(filename) {
pars<-data.frame(str_match(filename,"Alternate_t123-(.+)_t23-(.+)\\.Background_t123-(.+)_t21-(.+)\\.(.+)\\.csv"),stringsAsFactors=FALSE)
names(pars)<-c("File","Alternate_t123","Alternate_t23","Background_t123","Background_t21","Windows")
pars$Windows<-as.numeric(pars$Windows)

models<-read.csv(filename)

models$Model<-factor(str_match(models$Model,"(.+)_t123")[,2],levels=c("Background","Alternate"))
models$Model<-revalue(models$Model,c(Background="Real_Background",Alternate="Real_Outlier"))

models$mfD0<-calc.mfD0(models)

models<-partition.x(models,"D",0.1,pars$Windows)
models<-partition.x(models,"mfD0",0.1,pars$Windows)

model.summary<-get.model.summary(models,"Model",pars)
D.summary<-get.model.summary(models,"D_Partition",pars)
mfD0.summary<-get.model.summary(models,"mfD0_Partition",pars)

model.sum<-rbind(model.summary,D.summary,mfD0.summary)

is.D.positive<-models$D>=0
model.comp<-data.frame(pars,
Stat=c("D","mfD0"),
Stat.Mean=c(mean(models$D,na.rm=TRUE)*100,mean(models$mfD0,na.rm=TRUE)*100),
Stat.SD=c(sd(models$D,na.rm=TRUE)*100,sd(models$mfD0,na.rm=TRUE)*100),
Dpos=sum(is.D.positive,na.rm=TRUE),
DposAlternate=sum(is.D.positive & models$Model=="Real_Outlier",na.rm=TRUE),
DposOutlier=c(sum(is.D.positive & models$D_Partition=="D_Outlier", na.rm=TRUE),
sum(is.D.positive & models$mfD0_Partition=="mfD0_Outlier", na.rm=TRUE)),
OutlierAlternate=c(sum(models$Model=="Real_Outlier" & models$D_Partition=="D_Outlier",na.rm=TRUE),
sum(models$Model=="Real_Outlier" & models$mfD0_Partition=="mfD0_Outlier",na.rm=TRUE))
)
model.comp$OutlierAlternatePC<-model.comp$OutlierAlternate/model.comp$DposOutlier*100

list(model.sum,model.comp)
}

options(width=200)

if (opt$modelfiles != "") {
csv.summary<-mclapply(dir(opt$modelfiles,"*.csv",full.names=TRUE),summarise.model,mc.cores=opt$threads)

models<-rbind.fill(lapply(csv.summary,function(x) x[[1]]))
model.comp<-rbind.fill(lapply(csv.summary,function(x) x[[2]]))

model.list<-read.csv(opt$listofmodels,stringsAsFactors=FALSE)

models<-merge(models,unique(model.list[,c("Background_t123","Background_t21","Alternate_t123","Alternate_t23","ModelType")]),by=c("Background_t123","Background_t21","Alternate_t123","Alternate_t23"))
write.table(models,file=paste0(opt$modelfiles,".dxy.summary.tsv"),sep="\t",row.names=FALSE)

model.comp<-merge(model.comp,unique(model.list[,c("Background_t123","Background_t21","Alternate_t123","Alternate_t23","ModelType")]),by=c("Background_t123","Background_t21","Alternate_t123","Alternate_t23"))
write.table(model.comp,file=paste0(opt$modelfiles,".partition.summary.tsv"),sep="\t",row.names=FALSE)
}

if (opt$realdata != "") {
real<-read.csv(opt$realdata)
real$mfD0<-calc.mfD0(real)
real<-partition.x(real,"D",0.1,nrow(real))
real<-partition.x(real,"mfD0",0.1,nrow(real))

pars<-data.frame(File=opt$realdata,Windows=nrow(real))
real.stats<-rbind(get.model.summary(real,"D_Partition",pars),get.model.summary(real,"mfD0_Partition",pars))
write.table(real.stats,file=paste0(file_path_sans_ext(basename(opt$realdata)),".dxy.summary.tsv"),sep="\t",row.names=FALSE)
}
126 changes: 126 additions & 0 deletions run_model_combinations.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env perl

# run_model_combinations.pl
# Run a set of combined models using shared_ancestry_simulator.R based on a CSV file of model parameter combinations

# Written for "Evaluating statistics for the identification of introgressed loci"
# by Simon H. Martin, John W. Davey and Chris D. Jiggins
# John Davey: [email protected]
# Simon Martin: [email protected]
# November-December 2013

use strict;
use warnings;
use Carp;
use English;
use Data::Dumper;
use Getopt::Long;


my $numwindows = 1000;
my $windowsize = 5000;
my $modelfile = "";
my $threads = 1;
my $popsize = 1000000;
my $simulator_path = "..";
my $options_okay = GetOptions(
'numwindows=i' => \$numwindows,
'windowsize=i' => \$windowsize,
'modelfile=s' => \$modelfile,
'threads=i' => \$threads,
'popsize=i' => \$popsize,
'simulator_path=s' => \$simulator_path,
);

croak "Please supply a CSV file containing model parameters with -m" if $modelfile eq "";

croak "Simulator path $simulator_path does not exist!\n" if !-d $simulator_path;

my $model = <<EOM;
seqgen:
m: HKY
l: $windowsize
ms:
pops:
- {name: 1, seqs: 8}
- {name: 2, seqs: 8}
- {name: 3, seqs: 8}
- {name: 4, seqs: 8}
opts:
genperyear: 4
n0: $popsize
mu: 2.5e-9
split:
- {time: 3, pops: [4,1]}
EOM

sub output_models {
my ($model, $modeltype, $params) = @_;
my $pops = $modeltype eq "Background" ? "2,1" : $modeltype eq "Alternate" ? "2,3" : "0,0";
my $printpops = $pops;
$printpops =~ s/,//g;
foreach my $t123 (sort {$a<=>$b} keys %{$params}) {
my $model_t123 = $model . " - {time: $t123, pops: [3,1]}\n";
foreach my $txx (keys %{$params->{$t123}}) {
open my $model_file, ">", "$modeltype\_t123-$t123\_t$printpops-$txx.yml" or croak "Can't open model file: $OS_ERROR!\n";
my $model_t123_txx = $model_t123 . " - {time: $txx, pops: [$pops]}\n";
print $model_file $model_t123_txx;
close $model_file;
}
}
}

open my $combined, "<", $modelfile or croak "Can't open model file: $OS_ERROR!\n";

my $header = <$combined>;
chomp $header;
my @header = split /,/, $header;
my %background;
my %alternate;
my @combinations;
while (my $model = <$combined>) {
chomp $model;
my @f = split /,/, $model;
$background{$f[1]}{$f[2]}++;
$alternate{$f[3]}{$f[4]}++;
my %comb;
$comb{type}=$f[0];
map {
my ($type, $var) = split /_/, $header[$_];
$comb{pars}{$type}{$var} = $f[$_]
} 1..$#header;
push @combinations, \%comb;
}

close $combined;

my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime();
$mon++; # originally in 0..11
$year+=1900;
my $outdir = "model_files_win$numwindows\_size$windowsize\_ne$popsize\_$year-$mon-$mday";
mkdir($outdir) if !-d $outdir;
chdir($outdir);

open my $log, ">", "$outdir.log" or croak "Can't open log file! $OS_ERROR\n";

output_models($model, "Background", \%background);
output_models($model, "Alternate", \%alternate);


foreach my $combination (@combinations) {
my $command = "$simulator_path/shared_ancestry_simulator.R -w $numwindows -T $threads -c ";
foreach my $model_type (keys %{$combination->{pars}}) {
my $prop = $model_type eq "Background" ? 0.9 : 0.1;
$command .= $model_type . "_t123-";
$command .= $combination->{pars}{$model_type}{t123};
$command .= "_";
delete $combination->{pars}{$model_type}{t123};
my $var2 = (keys %{$combination->{pars}{$model_type}})[0];
$command .= "$var2-$combination->{pars}{$model_type}{$var2}.yml:$prop,";
}
chop $command;
print $log "$command\t" . localtime() . "\n";
my $simout = `$command`;
croak "Couldn't run command: $command\n(Is shared_ancestry_simulator.R in your path?)\n" if !defined $simout;
croak "Command failed: $command\n$simout\n" if ($simout ne "");
}
Loading

0 comments on commit ae8af88

Please sign in to comment.