# This pipeline simulates the Out-of-Africa model and finds AIMs (ancestry-informative markers) using a naive Bayes classifier
masterdir = "./"
source(paste(masterdir, "code/tools.r", sep=""))
require("parallel")
setcores(cores = 3)
seed = 11 # in order to have reproducible results
testindsperdeme = 200
demes=3
informativenessBound = 0.9 # fraction of SNPs discarded in step2; only the most informative (1 - informativenessBound) share is kept
stepsize = 10000 # number of SNPs per segment when cutting the VCF files (step3)
numAIM = 15 # number of AIMs
method = "naiveBayes"
error.type="logloss"
diploid = TRUE
set.seed(seed)
size = rep(testindsperdeme, demes) # size of the test set in all demes
num_ch = 1 # number of simulated chromosomes to analyse (the publication used 20, see below)
simulate = FALSE # Simulate datasets
prepare = TRUE # Make Training and Test sets
step1 = TRUE # Get frequencies for all SNPs in all populations for the training set
step2 = TRUE # Filter most informative SNPs and write a new vcf File
step3 = TRUE # Cut files in smaller pieces for better handling
step4 = TRUE # Get nss for all chromosomes
step5 = TRUE # Store training data in RDS files
step6 = TRUE # Read data in chunks and find possible AIMs
step7 = TRUE # Create a vcf.gz file for all SNPs from step6; this is called AIMs.vcf.gz
step8 = TRUE # Create classification and posterior probabilities
if(simulate){
print("simulating SNP data for out of africa model -- depending on your system this may take a while (several hours or days)...")
globalcommand = "python3 code/simulate4biogeo.py ooa"
commands = c()
allseeds = seed
# # To simulate all seeds used for the publication at once, uncomment:
# allseeds = c(11,12,13,14,15,16,17,18,19,20)
# # To simulate all 20 chromosomes as used for the publication, uncomment:
# num_ch = 20
for(itseed in allseeds){
commands = c(commands,paste(globalcommand, itseed, num_ch, sep= " "))
}
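# With the defaults above (seed = 11, num_ch = 1) this builds a single command:
# "python3 code/simulate4biogeo.py ooa 11 1"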
exec <- function(commandline) {
system(commandline)
}
mclapply(commands, exec)
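# Note: when mc.cores is not given, mclapply() defaults to getOption("mc.cores", 2L);
# we assume setcores(cores = 3) above (defined in code/tools.r) sets that option.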
for(itseed in allseeds){
command = paste("mv ooa_chromosome_*_seed_",itseed,".vcf.gz data/sim/ooa/tmp/.",sep="")
system(command)
}
}
# Make Training and Test sets
filenameInds = "ooa_SampleListWithLocations.txt"
filenameIndsTraining = "training_ooa_SampleListWithLocations.txt"
filenameIndsTest = "test_ooa_SampleListWithLocations.txt"
filenameData = NULL
for(i in 1:num_ch) {
filenameData = c(filenameData, paste("data/sim/ooa/tmp/ooa_chromosome_", i, "_seed_", seed, sep=""))
}
if(prepare) {
a = read.vcfR(paste("data/sim/ooa/tmp/ooa_chromosome_1_seed_", seed, ".vcf.gz", sep=""), nrows=1)
inds = colnames(a@gt)[-1]
inds = cbind(inds, rep(0, 2400), c(rep("AFR", 800), rep("EUR", 800), rep("ASI", 800))) # 2400 simulated individuals: 800 per superpopulation
write.table(inds, filenameInds, col.names= FALSE, row.names = FALSE)
getTestandtrainingset(filenameInds, filenameIndsTraining, filenameIndsTest, size)
}
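# The sample sheet written above has one row per individual: sample ID, a
# placeholder 0, and the superpopulation label (AFR, EUR or ASI).
# getTestandtrainingset() -- assumed to be defined in the sourced code/tools.r --
# then splits it into a training sheet and a test sheet of 200 individuals per deme.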
# load frequencies in all pops into R
if(step1) { # Get frequencies in all superpopulations
for(i in num_ch:1) {
getfreqs.from.vcf(filenameData[i], filenameIndsTraining, snplist=FALSE)
}
# mclapply(filenameData,FUN = function(x){getfreqs.from.vcf(x,filenameIndsTraining,snplist = FALSE)}) # this does not work yet
}
# get positions of most informative SNPs and create new vcf.gz files
if(step2) {
# source("../biogeoanc-prediction/util.r")
for(i in num_ch:1) {
load(paste(filenameData[i], "_freqs", sep="")) # restores freqs and, presumably, the chr and pos vectors used below
info = informativeness.global(freqs)
nss = ncol(freqs)
informativenessQuantile = sort(info, decreasing = TRUE)[floor((1 - informativenessBound) * nss)]
chr.informative = chr[info>informativenessQuantile]
pos.informative = pos[info>informativenessQuantile]
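# Worked example (hypothetical numbers): with informativenessBound = 0.9 and
# nss = 50000 SNPs, the cutoff is the 5000th-largest informativeness value, so
# roughly the top 10% of SNPs pass the info > informativenessQuantile filter.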
options(scipen=999) # This prevents 48000000 from being written as 4.8e+07, which could not be read back in
write.table(cbind(chr.informative, pos.informative), file = paste("tmp/chr", i, "_informativePositions", sep=""), quote=FALSE, col.names=FALSE, row.names=FALSE)
options(scipen=0)
call = paste("vcftools --gzvcf ",filenameData[i],".vcf.gz --chr ", i-1, " --positions tmp/chr", i, "_informativePositions --recode-INFO-all --recode --out tmp/chr", i, "_seed_", seed, "_informative", sep="") # chromosome IDs in the simulated VCFs are 0-based, hence i-1
cat("Getting most informative positions on chromosome ", i, "\n")
system(call)
cat("Zipping file for chromosome ", i, "\n")
call=paste("gzip -f tmp/chr", i, "_seed_", seed, "_informative.recode.vcf", sep="")
system(call)
}
}
# Cut files in smaller pieces and get rid of unnamed SNPs
# This results in tmp/chr1_segment0.recode.vcf.gz etc...
if(step3) {
filenames = NULL
for(i in 1:num_ch) {
filenames = c(filenames, paste("tmp/chr", i, "_seed_", seed, "_informative.recode.vcf.gz", sep=""))
}
cut.VCF(filenames, stepsize, removeUnnamed = TRUE)
}
# get file nss, which stores the number of informative segregating sites for all chromosomes
if(step4) {
nss = NULL
for(i in 1:num_ch) {
nss = c(nss, getNss(paste("tmp/chr", i, "_seed_", seed, "_informative.recode.vcf.gz", sep="")))
}
write.table(nss, file = paste("nss_seed_", seed, sep=""), quote=FALSE, col.names=FALSE, row.names=FALSE)
}
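# The file nss_seed_<seed> now contains one number per chromosome (its count of
# informative segregating sites), which is read back in steps 5 and 6.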
# Store data for the training set in RDS files
if(step5) {
a = read.csv(filenameIndsTraining, sep="", header=FALSE)
training.inds = a[,c(1,3)]
nss = read.table(paste("nss_seed_", seed, sep=""))[,1]
filenames = NULL
for(i in 1:num_ch) {
for(j in 0:((nss[i]-1)/stepsize)) {
filenames = c(filenames, paste("tmp/chr", i, "_segment", j, ".recode.vcf.gz", sep=""))
}
}
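# Worked example (hypothetical numbers): with nss[i] = 25000 and stepsize = 10000,
# j runs over 0:((25000-1)/10000), i.e. 0:2.4999, which R's `:` truncates to 0 1 2,
# so chromosome i contributes segments 0, 1 and 2. Step6 builds the same list.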
for(i in 1:length(filenames)) {
dat = getData(filenames[i], inds = training.inds, diploid = TRUE)
saveRDS(dat$sample, file = paste(filenames[i], ".rds", sep=""), version = 2)
}
}
# Read the training data chunkwise from the RDS files and find candidate AIMs; the row order matches filenameIndsTraining
if(step6) {
a = read.csv(filenameIndsTraining, sep="", header=FALSE)
training.inds = a[,c(1,3)]
nss = read.table(paste("nss_seed_", seed, sep=""))[,1]
filenames = NULL
for(i in 1:num_ch) {
for(j in 0:((nss[i]-1)/stepsize)) {
filenames = c(filenames, paste("tmp/chr", i, "_segment", j, ".recode.vcf.gz.rds", sep=""))
}
}
sample.AIMs = NULL
sample.AIMs = find.AIMs.fromRDS(filenames, sample.AIMs, training.inds, numAIM, method, error.type, diploid)
}
# Create a vcf.gz file for all AIMs found in step6; the result is AIMs.vcf.gz
if(step7) {
filenameDataVCFGZ = filenameData
for(i in 1:num_ch) filenameDataVCFGZ[i] = paste(filenameData[i], ".vcf.gz", sep="")
extract.and.merge(filenameDataVCFGZ, "AIMs", outfile="AIMs.vcf")
system("gzip -f AIMs.vcf")
}
# compute posterior probabilities with final AIMset and classify samples with naive Bayes classifier
if(step8) {
a = read.csv(filenameInds, sep=" ", header=FALSE)
inds = a[,c(1,2,3)]
samLoc = a[,3]
a = getData("AIMs.vcf.gz", skip=0, windowSize=-1, inds = inds, snps = TRUE, diploid = TRUE)
sample = a$sample
freqs = getfreqs(sample, samLoc, diploid = TRUE)
samplesInDeme = table(samLoc)
predictions = prediction.naiveBayes(sample, freqs, samplesInDeme, diploid = TRUE)
rownames(predictions) = inds[,1]
colnames(predictions) = names(table(samLoc))
classifications = rownames(freqs)[apply(predictions, 1, which.max)] # maximum a posteriori deme per sample
names(classifications) = inds[,1]
write.csv(predictions, file = "predictions.csv")
write.table(classifications, file = "classifications.tab", col.names = FALSE)
cat("pipeline has finished. AIM set, predictions, and classifications have been generated.\n")
}
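# --- Illustration only (not part of the pipeline) ---------------------------
# A minimal sketch of what the diploid naive Bayes posterior in step8 looks
# like, assuming genotypes coded 0/1/2 (alternative-allele counts),
# Hardy-Weinberg proportions within each deme, and uniform priors. The pipeline
# itself uses prediction.naiveBayes() from code/tools.r; this sketch only makes
# the model explicit and may differ from that implementation.
naiveBayes.posterior.sketch = function(genotypes, freqs) {
# genotypes: integer vector of length nSNP (one sample)
# freqs: nDeme x nSNP matrix of alternative-allele frequencies, rows = demes
eps = 1e-6 # keep fixed SNPs (frequency 0 or 1) from producing log(0)
p = pmin(pmax(freqs, eps), 1 - eps)
# per-deme log-likelihood: each genotype is Binomial(2, allele frequency)
loglik = apply(p, 1, function(f) sum(dbinom(genotypes, size = 2, prob = f, log = TRUE)))
post = exp(loglik - max(loglik)) # normalise in log space for numerical stability
post / sum(post)
}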