-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First commit, uploading README and code files
- Loading branch information
0 parents
commit ae8af88
Showing
4 changed files
with
639 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
This repository contains code and data for the manuscript "Evaluating statistics for the identification of introgressed loci" by Simon H. Martin, John W. Davey and Chris D. Jiggins, available on bioRxiv. | ||
|
||
The results in the paper were generated as follows. | ||
|
||
Models are generated using the `shared_ancestry_simulator.R` script. A single combined model can be generated like this: | ||
|
||
``` | ||
./shared_ancestry_simulator.R -w 10000 -T 60 -c Alternate_t123-0.4_t23-0.2.yml:0.1,Background_t123-0.6_t21-0.4.yml:0.9 | ||
``` | ||
|
||
This will generate 10000 windows, 10% of which will be generated using the model described in the file `Alternate_t123-0.4_t23-0.2.yml` and 90% of which will be generated using `Background_t123-0.6_t21-0.4.yml`, using 60 threads. See the model files folders for the YAML files generated for this paper. | ||
|
||
These YAML files were generated by the `run_model_combinations.pl` script, using the following commands for the 5kb, 10kb and 20kb simulations: | ||
|
||
``` | ||
./run_model_combinations.pl -T 60 -n 10000 -w 5000 -p 1000000 -m Model_parameter_list.csv | ||
./run_model_combinations.pl -T 60 -n 5000 -w 10000 -p 1000000 -m Model_parameter_list.csv | ||
./run_model_combinations.pl -T 60 -n 2500 -w 20000 -p 1000000 -m Model_parameter_list.csv -s path/to/simulator | ||
``` | ||
|
||
Option 'p' defines the effective population size and option 'm' specifies a file containing the list of models to simulate (available in the repository). Option 's' is the directory path to the simulator script, `shared_ancestry_simulator.R`. If 's' is not provided, `run_model_combinations.pl` assumes `shared_ancestry_simulator.R` is in the same folder as itself. | ||
|
||
Summary statistics for the models found in the `partition.summary` and `dxy.summary` files were generated as follows: | ||
|
||
``` | ||
./generate_summary_statistics.R -m model_files_win10000_size5000 -T 60 | ||
./generate_summary_statistics.R -m model_files_win5000_size10000 -T 60 | ||
./generate_summary_statistics.R -m model_files_win2500_size20000 -T 60 | ||
``` | ||
|
||
Real data for the Heliconius genome, split into 5 kb windows, can be found in the file `Heliconius_genome_windows.csv`. | ||
|
||
The figures were generated using these summary statistics files, using the following scripts: | ||
|
||
``` | ||
./Figure_2.R -i model_files_win10000_size5000/Alternate_t123-1.2_t23-0.2.Background_t123-2_t21-1.2.10000.csv | ||
./Figure_6_S5.R | ||
./Figure_7.R -i model_files_win10000_size5000.dxy.summary.tsv | ||
./Figure_S1_S2.R -i model_files_win10000_size5000.partition.summary.tsv | ||
./Figure_S3_S4.R | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# generate_summary_statistics.R | ||
# Generate dXY and partitioning statistics for models generated using run_model_combinations.pl | ||
|
||
# Written for "Evaluating statistics for the identification of introgressed loci" | ||
# by Simon H. Martin, John W. Davey and Chris D. Jiggins | ||
# John Davey: [email protected] | ||
# Simon Martin: [email protected] | ||
# November-December 2013 | ||
|
||
library(parallel) | ||
suppressMessages(library(reshape)) | ||
library(plyr) | ||
library(optparse) | ||
library(stringr) | ||
library(tools) | ||
|
||
options<-list(make_option(c("-T","--threads"),default=1,help="Number of threads [default %default]",metavar="numthreads"), | ||
make_option(c("-m","--modelfiles"),default="",type="character",help="Folder of CSV files with model output"), | ||
make_option(c("-r","--realdata"),default="",type="character",help="CSV file containing windows from real genome"), | ||
make_option(c("-l","--listofmodels"),default="",type="character",help="CSV file of model parameter combinations") | ||
) | ||
|
||
opt<-parse_args(OptionParser(option_list=options)) | ||
|
||
if (opt$listofmodels == "") stop("Please specify a list of model parameter combinations in CSV format with -l") | ||
if (opt$modelfiles == "" & opt$realdata == "") stop("Please specify a folder of model files with -m or a CSV file containing real data with -r") | ||
if (opt$modelfiles != "" & !file.exists(opt$modelfiles)) stop(paste("Directory", opt$modelfiles, "does not exist! Please specify a valid directory with -m")) | ||
if (opt$realdata != "" & !file.exists(opt$realdata)) stop(paste("Real data file", opt$realdata, "does not exist! Please specify a valid file with -r")) | ||
|
||
partition.x<-function(model,stat,threshold=0.1,win) { | ||
outnum<-threshold*win | ||
Partition<-sapply(rank(-model[,stat]), function(x) if (x<=outnum) "_Outlier" else "_Background") | ||
Partition<-paste0(stat,Partition) | ||
model<-cbind(model,Partition) | ||
names(model)[names(model)=="Partition"]<-paste0(stat,"_Partition") | ||
model | ||
} | ||
|
||
run.tests<-function(values,groups,test.name,test.func) { | ||
# Assumes order of groups is Background, Outlier | ||
formula<-values~groups | ||
two.tailed<-test.func(formula,alternative="two.sided",conf.int=TRUE) | ||
if (length(two.tailed$estimate)==1) mean.diff<-two.tailed$estimate else mean.diff<-two.tailed$estimate[1] - two.tailed$estimate[2] | ||
out<-data.frame( | ||
mean.diff=mean.diff, | ||
p.two.tailed=two.tailed$p.value, | ||
p.background.higher=test.func(formula,alternative="greater")$p.value, | ||
p.background.lower=test.func(formula,alternative="less")$p.value | ||
) | ||
names(out)<-paste(test.name,names(out),sep=".") | ||
out | ||
} | ||
|
||
summarise.tests<-function(model.sum,model.stats,model.partition,test.name,test.func) { | ||
model.tests<-t(apply(model.stats,2,function(x) unlist(run.tests(x,model.partition,test.name,test.func)))) | ||
model.tests<-cbind(Variable=rownames(model.tests),model.tests) | ||
merge(model.sum,model.tests,by="Variable",sort=FALSE) | ||
} | ||
|
||
get.model.summary<-function(models,column,pars=NULL) { | ||
model.stats<-models[c("P1P2_dxy","P1P3_dxy","P2P3_dxy")] | ||
model.partition<-models[,column] | ||
|
||
model.means<-melt(aggregate(model.stats,list(model.partition),mean,na.rm=TRUE),id="Group.1") | ||
model.sds<-melt(aggregate(model.stats,list(model.partition),sd,na.rm=TRUE),id="Group.1") | ||
model.ses<-melt(aggregate(model.stats,list(model.partition), function(x) {x<-na.omit(x);sqrt(var(x)/length(x))}),id="Group.1") | ||
model.sum<-cbind(pars,model.means,model.sds$value,model.ses$value) | ||
names(model.sum)<-c(names(pars),"Model","Variable","Mean","SD","SE") | ||
|
||
model.sum<-summarise.tests(model.sum,model.stats,model.partition,"t",t.test) | ||
model.sum<-summarise.tests(model.sum,model.stats,model.partition,"w",wilcox.test) | ||
|
||
model.sum | ||
} | ||
|
||
calc.mfD0<-function(df) { | ||
apply(df,1,function(x) if ((is.na(x["D"])) || (as.numeric(x["D"])<0)) NA else as.numeric(x["mf"])) | ||
} | ||
|
||
|
||
summarise.model<-function(filename) { | ||
pars<-data.frame(str_match(filename,"Alternate_t123-(.+)_t23-(.+)\\.Background_t123-(.+)_t21-(.+)\\.(.+)\\.csv"),stringsAsFactors=FALSE) | ||
names(pars)<-c("File","Alternate_t123","Alternate_t23","Background_t123","Background_t21","Windows") | ||
pars$Windows<-as.numeric(pars$Windows) | ||
|
||
models<-read.csv(filename) | ||
|
||
models$Model<-factor(str_match(models$Model,"(.+)_t123")[,2],levels=c("Background","Alternate")) | ||
models$Model<-revalue(models$Model,c(Background="Real_Background",Alternate="Real_Outlier")) | ||
|
||
models$mfD0<-calc.mfD0(models) | ||
|
||
models<-partition.x(models,"D",0.1,pars$Windows) | ||
models<-partition.x(models,"mfD0",0.1,pars$Windows) | ||
|
||
model.summary<-get.model.summary(models,"Model",pars) | ||
D.summary<-get.model.summary(models,"D_Partition",pars) | ||
mfD0.summary<-get.model.summary(models,"mfD0_Partition",pars) | ||
|
||
model.sum<-rbind(model.summary,D.summary,mfD0.summary) | ||
|
||
is.D.positive<-models$D>=0 | ||
model.comp<-data.frame(pars, | ||
Stat=c("D","mfD0"), | ||
Stat.Mean=c(mean(models$D,na.rm=TRUE)*100,mean(models$mfD0,na.rm=TRUE)*100), | ||
Stat.SD=c(sd(models$D,na.rm=TRUE)*100,sd(models$mfD0,na.rm=TRUE)*100), | ||
Dpos=sum(is.D.positive,na.rm=TRUE), | ||
DposAlternate=sum(is.D.positive & models$Model=="Real_Outlier",na.rm=TRUE), | ||
DposOutlier=c(sum(is.D.positive & models$D_Partition=="D_Outlier", na.rm=TRUE), | ||
sum(is.D.positive & models$mfD0_Partition=="mfD0_Outlier", na.rm=TRUE)), | ||
OutlierAlternate=c(sum(models$Model=="Real_Outlier" & models$D_Partition=="D_Outlier",na.rm=TRUE), | ||
sum(models$Model=="Real_Outlier" & models$mfD0_Partition=="mfD0_Outlier",na.rm=TRUE)) | ||
) | ||
model.comp$OutlierAlternatePC<-model.comp$OutlierAlternate/model.comp$DposOutlier*100 | ||
|
||
list(model.sum,model.comp) | ||
} | ||
|
||
options(width=200) | ||
|
||
if (opt$modelfiles != "") { | ||
csv.summary<-mclapply(dir(opt$modelfiles,"*.csv",full.names=TRUE),summarise.model,mc.cores=opt$threads) | ||
|
||
models<-rbind.fill(lapply(csv.summary,function(x) x[[1]])) | ||
model.comp<-rbind.fill(lapply(csv.summary,function(x) x[[2]])) | ||
|
||
model.list<-read.csv(opt$listofmodels,stringsAsFactors=FALSE) | ||
|
||
models<-merge(models,unique(model.list[,c("Background_t123","Background_t21","Alternate_t123","Alternate_t23","ModelType")]),by=c("Background_t123","Background_t21","Alternate_t123","Alternate_t23")) | ||
write.table(models,file=paste0(opt$modelfiles,".dxy.summary.tsv"),sep="\t",row.names=FALSE) | ||
|
||
model.comp<-merge(model.comp,unique(model.list[,c("Background_t123","Background_t21","Alternate_t123","Alternate_t23","ModelType")]),by=c("Background_t123","Background_t21","Alternate_t123","Alternate_t23")) | ||
write.table(model.comp,file=paste0(opt$modelfiles,".partition.summary.tsv"),sep="\t",row.names=FALSE) | ||
} | ||
|
||
if (opt$realdata != "") { | ||
real<-read.csv(opt$realdata) | ||
real$mfD0<-calc.mfD0(real) | ||
real<-partition.x(real,"D",0.1,nrow(real)) | ||
real<-partition.x(real,"mfD0",0.1,nrow(real)) | ||
|
||
pars<-data.frame(File=opt$realdata,Windows=nrow(real)) | ||
real.stats<-rbind(get.model.summary(real,"D_Partition",pars),get.model.summary(real,"mfD0_Partition",pars)) | ||
write.table(real.stats,file=paste0(file_path_sans_ext(basename(opt$realdata)),".dxy.summary.tsv"),sep="\t",row.names=FALSE) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#!/usr/bin/env perl | ||
|
||
# run_model_combinations.pl | ||
# Run a set of combined models using shared_ancestry_simulator.R based on a CSV file of model parameter combinations | ||
|
||
# Written for "Evaluating statistics for the identification of introgressed loci" | ||
# by Simon H. Martin, John W. Davey and Chris D. Jiggins | ||
# John Davey: [email protected] | ||
# Simon Martin: [email protected] | ||
# November-December 2013 | ||
|
||
use strict; | ||
use warnings; | ||
use Carp; | ||
use English; | ||
use Data::Dumper; | ||
use Getopt::Long; | ||
|
||
|
||
my $numwindows = 1000; | ||
my $windowsize = 5000; | ||
my $modelfile = ""; | ||
my $threads = 1; | ||
my $popsize = 1000000; | ||
my $simulator_path = ".."; | ||
my $options_okay = GetOptions( | ||
'numwindows=i' => \$numwindows, | ||
'windowsize=i' => \$windowsize, | ||
'modelfile=s' => \$modelfile, | ||
'threads=i' => \$threads, | ||
'popsize=i' => \$popsize, | ||
'simulator_path=s' => \$simulator_path, | ||
); | ||
|
||
croak "Please supply a CSV file containing model parameters with -m" if $modelfile eq ""; | ||
|
||
croak "Simulator path $simulator_path does not exist!\n" if !-d $simulator_path; | ||
|
||
my $model = <<EOM; | ||
seqgen: | ||
m: HKY | ||
l: $windowsize | ||
ms: | ||
pops: | ||
- {name: 1, seqs: 8} | ||
- {name: 2, seqs: 8} | ||
- {name: 3, seqs: 8} | ||
- {name: 4, seqs: 8} | ||
opts: | ||
genperyear: 4 | ||
n0: $popsize | ||
mu: 2.5e-9 | ||
split: | ||
- {time: 3, pops: [4,1]} | ||
EOM | ||
|
||
sub output_models { | ||
my ($model, $modeltype, $params) = @_; | ||
my $pops = $modeltype eq "Background" ? "2,1" : $modeltype eq "Alternate" ? "2,3" : "0,0"; | ||
my $printpops = $pops; | ||
$printpops =~ s/,//g; | ||
foreach my $t123 (sort {$a<=>$b} keys %{$params}) { | ||
my $model_t123 = $model . " - {time: $t123, pops: [3,1]}\n"; | ||
foreach my $txx (keys %{$params->{$t123}}) { | ||
open my $model_file, ">", "$modeltype\_t123-$t123\_t$printpops-$txx.yml" or croak "Can't open model file: $OS_ERROR!\n"; | ||
my $model_t123_txx = $model_t123 . " - {time: $txx, pops: [$pops]}\n"; | ||
print $model_file $model_t123_txx; | ||
close $model_file; | ||
} | ||
} | ||
} | ||
|
||
open my $combined, "<", $modelfile or croak "Can't open model file: $OS_ERROR!\n"; | ||
|
||
my $header = <$combined>; | ||
chomp $header; | ||
my @header = split /,/, $header; | ||
my %background; | ||
my %alternate; | ||
my @combinations; | ||
while (my $model = <$combined>) { | ||
chomp $model; | ||
my @f = split /,/, $model; | ||
$background{$f[1]}{$f[2]}++; | ||
$alternate{$f[3]}{$f[4]}++; | ||
my %comb; | ||
$comb{type}=$f[0]; | ||
map { | ||
my ($type, $var) = split /_/, $header[$_]; | ||
$comb{pars}{$type}{$var} = $f[$_] | ||
} 1..$#header; | ||
push @combinations, \%comb; | ||
} | ||
|
||
close $combined; | ||
|
||
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(); | ||
$mon++; # originally in 0..11 | ||
$year+=1900; | ||
my $outdir = "model_files_win$numwindows\_size$windowsize\_ne$popsize\_$year-$mon-$mday"; | ||
mkdir($outdir) if !-d $outdir; | ||
chdir($outdir); | ||
|
||
open my $log, ">", "$outdir.log" or croak "Can't open log file! $OS_ERROR\n"; | ||
|
||
output_models($model, "Background", \%background); | ||
output_models($model, "Alternate", \%alternate); | ||
|
||
|
||
foreach my $combination (@combinations) { | ||
my $command = "$simulator_path/shared_ancestry_simulator.R -w $numwindows -T $threads -c "; | ||
foreach my $model_type (keys %{$combination->{pars}}) { | ||
my $prop = $model_type eq "Background" ? 0.9 : 0.1; | ||
$command .= $model_type . "_t123-"; | ||
$command .= $combination->{pars}{$model_type}{t123}; | ||
$command .= "_"; | ||
delete $combination->{pars}{$model_type}{t123}; | ||
my $var2 = (keys %{$combination->{pars}{$model_type}})[0]; | ||
$command .= "$var2-$combination->{pars}{$model_type}{$var2}.yml:$prop,"; | ||
} | ||
chop $command; | ||
print $log "$command\t" . localtime() . "\n"; | ||
my $simout = `$command`; | ||
croak "Couldn't run command: $command\n(Is shared_ancestry_simulator.R in your path?)\n" if !defined $simout; | ||
croak "Command failed: $command\n$simout\n" if ($simout ne ""); | ||
} |
Oops, something went wrong.