# pipeline_example.r
masterdir = "./"
source(paste(masterdir, "code/tools.r", sep="")) # pipeline functions used below
require("parallel")
require("vcfR") # for read.vcfR() in the prepare0 block (may already be attached by tools.r)
setcores(cores = 3) # setcores() comes from tools.r
seed = 11 # in order to have reproducible results
testindsperdeme = 20 # test individuals per deme
demes = 3
informativenessBound = 0.9 # keep roughly the top 10% most informative SNPs (see step2)
stepsize = 10000 # SNPs per file chunk (see step3)
numAIM = 10 # number of AIMs to select
method = "naiveBayes"
error.type = "logloss"
diploid = TRUE
set.seed(seed)
size = rep(testindsperdeme, demes) # number of test individuals in each deme
prepare0 = TRUE # Split individuals into training and test sets
step1 = TRUE # Get frequencies of all SNPs in all populations for the training set
step2 = TRUE # Filter the most informative SNPs and write a new vcf file
step3 = TRUE # Cut files into smaller pieces for better handling
step4 = TRUE # Get the number of segregating sites (nss) for all chromosomes
step5 = TRUE # Store training data in RDS files
step6 = TRUE # Read data in chunks and find possible AIMs
step7 = TRUE # Create a vcf.gz file for all SNPs from step6; this is called AIMs.vcf.gz
step8 = TRUE # Compute posterior probabilities and classify all samples
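# The step flags above make partial re-runs possible: e.g. to redo only the
# final classification with a previously computed AIM set, set everything
# except step8 to FALSE.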
# Split individuals into training and test sets
filenameInds = "ooa_SampleListWithLocations.txt"
filenameIndsTraining = "training_ooa_SampleListWithLocations.txt"
filenameIndsTest = "test_ooa_SampleListWithLocations.txt"
filenameData = "data/sim/ooa/ooa_chromosome_1_example"
if(prepare0) {
a = read.vcfR(paste(filenameData, ".vcf.gz", sep=""), nrows=1) # one row suffices to get the sample names
inds = colnames(a@gt)[-1] # the first column of @gt is FORMAT, the rest are samples
# 240 simulated individuals: 80 per superpopulation
inds = cbind(inds, rep(0, 240), c(rep("AFR", 80), rep("EUR", 80), rep("ASI", 80)))
write.table(inds, filenameInds, col.names = FALSE, row.names = FALSE)
getTestandtrainingset(filenameInds, filenameIndsTraining, filenameIndsTest, size)
}
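# After prepare0, each row of ooa_SampleListWithLocations.txt has the form
# "<sampleID>" "0" "<superpopulation>", e.g. (the sample IDs depend on the
# simulation output):
# "tsk_0" "0" "AFR"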
# Compute allele frequencies in all superpopulations for the training set
if(step1) {
getfreqs.from.vcf(filenameData, filenameIndsTraining, snplist=FALSE)
}
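# getfreqs.from.vcf() writes its result to the file <filenameData>_freqs, which
# step2 loads. Judging from its use there (ncol(freqs); rownames(freqs) in
# step8), freqs is a matrix of allele frequencies with one row per
# superpopulation and one column per SNP.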
# get positions of the most informative SNPs and create a new vcf.gz file
if(step2) {
load(paste(filenameData, "_freqs", sep="")) # restores freqs, chr and pos from step1
info = informativeness.global(freqs)
nss = ncol(freqs)
# informativeness of the (1-informativenessBound)*nss-th best SNP; only SNPs
# above this cutoff are kept
informativenessQuantile = sort(info, decreasing = TRUE)[ceiling((1-informativenessBound) * nss)]
chr.informative = chr[info > informativenessQuantile]
pos.informative = pos[info > informativenessQuantile]
options(scipen=999) # prevents 48000000 from being written as 4.8e+07, which cannot be read back
write.table(cbind(chr.informative, pos.informative), file = "tmp/chr1_informativePositions", quote=FALSE, col.names=FALSE, row.names=FALSE)
options(scipen=0)
call = paste("vcftools --gzvcf ", filenameData, ".vcf.gz --positions tmp/chr1_informativePositions --recode-INFO-all --recode --out tmp/chr1_informative", sep="")
cat("Getting most informative positions on chromosome ", 1, "\n")
system(call)
cat("Zipping file for chromosome ", 1, "\n")
system("gzip -f tmp/chr1_informative.recode.vcf")
}
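# Worked example of the cutoff: with informativenessBound = 0.9 and, say,
# nss = 50000 SNPs (illustrative number), the threshold is the informativeness
# of the 0.1 * 50000 = 5000-th best SNP, so roughly the top 10% of SNPs
# survive the filter.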
# Cut files into smaller pieces and get rid of unnamed SNPs
# This results in tmp/chr1_segment0.recode.vcf.gz etc.
if(step3) {
filenames = "tmp/chr1_informative.recode.vcf.gz"
cut.VCF(filenames, stepsize, removeUnnamed = TRUE)
}
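# cut.VCF() (tools.r) splits the file into chunks of stepsize (= 10000) SNPs
# each (cf. the chunk loops in steps 5 and 6), so later steps never need to
# hold a whole chromosome in memory at once.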
# Write nss.tab, which stores the number of informative segregating sites per chromosome
if(step4) {
nss = getNss("tmp/chr1_informative.recode.vcf.gz")
write.table(nss, file = "nss.tab", quote=FALSE, col.names=FALSE, row.names=FALSE)
}
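# nss.tab gets one line per chromosome; this example uses only chromosome 1,
# so the file contains a single number.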
# Store data for the training set in RDS files
if(step5) {
a = read.csv(filenameIndsTraining, sep="", header=FALSE)
training.inds = a[,c(1,3)] # sample IDs and superpopulation labels
nss = read.table("nss.tab")[,1]
filenames = NULL
for(j in 0:((nss-1)/stepsize)) { # one segment file per stepsize SNPs
filenames = c(filenames, paste("tmp/chr1_segment", j, ".recode.vcf.gz", sep=""))
}
for(i in 1:length(filenames)) {
dat = getData(filenames[i], inds = training.inds, diploid = TRUE)
saveRDS(dat$sample, file = paste(filenames[i], ".rds", sep=""), version = 2)
}
}
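# Optional sanity check on one chunk (not run automatically; the file name is
# one of those produced above):
if(FALSE) {
dat0 = readRDS("tmp/chr1_segment0.recode.vcf.gz.rds")
dim(dat0) # expected: one row per training individual, one column per SNP in the chunk
}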
# Read the training data in chunks and find possible AIMs; the order of rows
# is the same as in filenameIndsTraining
if(step6) {
a = read.csv(filenameIndsTraining, sep="", header=FALSE)
training.inds = a[,c(1,3)]
nss = read.table("nss.tab")[,1]
filenames = NULL
for(j in 0:((nss-1)/stepsize)) {
filenames = c(filenames, paste("tmp/chr1_segment", j, ".recode.vcf.gz.rds", sep=""))
}
sampleAIMs = NULL # start with no previously chosen AIMs
sample.AIMs = find.AIMs.fromRDS(filenames, sampleAIMs, training.inds, numAIM, method, error.type, diploid)
}
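# find.AIMs.fromRDS() comes from tools.r. Judging from its arguments and from
# step7, it works through the RDS chunks, selects numAIM markers under the
# chosen method and error.type, and appears to record their positions in the
# file "AIMs" that extract.and.merge() reads in step7.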
# Create a vcf.gz file with all SNPs chosen in step6; this is called AIMs.vcf.gz
if(step7) {
filenameDataVCFGZ = paste(filenameData, ".vcf.gz", sep="")
extract.and.merge(filenameDataVCFGZ, "AIMs", outfile="AIMs.vcf")
system("gzip -f AIMs.vcf")
}
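# Quick check that the expected number of markers made it into the file (not
# run automatically; needs zcat and grep on the PATH):
if(FALSE) {
system("zcat AIMs.vcf.gz | grep -vc '^#'") # counts non-header lines; should print numAIM
}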
# Compute posterior probabilities with the final AIM set and classify all
# samples with a naive Bayes classifier
if(step8) {
a = read.csv(filenameInds, sep=" ", header=FALSE)
inds = a[,c(1,2,3)]
samLoc = a[,3] # superpopulation label of each sample
a = getData("AIMs.vcf.gz", skip=0, windowSize=-1, inds = inds, snps = TRUE, diploid = TRUE)
sample = a$sample
freqs = getfreqs(sample, samLoc, diploid = TRUE)
samplesInDeme = table(samLoc)
predictions = prediction.naiveBayes(sample, freqs, samplesInDeme, diploid = TRUE)
rownames(predictions) = inds[,1]
colnames(predictions) = names(table(samLoc))
classifications = rownames(freqs)[apply(predictions, 1, which.max)] # most probable deme per sample
names(classifications) = inds[,1]
write.csv(predictions, file = "predictions.csv")
write.table(classifications, file = "classifications.tab", col.names = FALSE)
cat("Pipeline finished. AIM set, predictions, and classifications have been generated.\n")
}
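# Illustrative follow-up (not run automatically): accuracy on the held-out
# test individuals, assuming filenameIndsTest has the same three-column format
# as filenameInds.
if(FALSE) {
test = read.csv(filenameIndsTest, sep="", header=FALSE)
mean(classifications[as.character(test[,1])] == test[,3])
}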