-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsintaxOrigBootstrapv2WR.R
113 lines (75 loc) · 2.68 KB
/
sintaxOrigBootstrapv2WR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
load('rdpDataframe.RData')
args = (commandArgs(TRUE))
if(length(args)==0){
print("No arguments supplied.")
##supply default values
start = 1
end = 20000
k = 8
s = 32
}else{
for(i in 1:length(args)){
eval(parse(text=args[[i]]))
}
}
# loadfilename<-paste0(c(k,"mersPredictionsOrig.RData"), collapse = "")
load('origPredictions.RData')
rank <- rdp$genus
names(rank) <- rank
sequences <- rdp$sequences
uniqueRank <- unique(rank)
names(uniqueRank) <- uniqueRank
mers <- lapply(sequences,
function (x) {
substring(x,
1:(nchar(x) - k + 1L),
k:nchar(x))
})
names(mers) <- rank
# -------------------------- THE ALGORTHM -----------------------------------
# Singleton sequences must be present in our query database but
# not in our
query_ranks <- rank
query_seqs <- mers
# Removing the singleton sequences from our reference database.
refernece_db_ranks <- rank
refernece_db_seqs <- mers
bs_confidence_vector <- vector(mode = 'integer', length=length(mers))
names(bs_confidence_vector) <- rdp$genus
for(i in start:end)
{
testSeq <- mers[[i]]
testRank <- rank[[i]]
predictedRankFromPredictions <- predictions[i]
training_db_rank <- rank[[-i]]
training_db_seqs <- mers[[-i]]
confidenceVector <- vector(mode = 'integer', length = length(uniqueRank))
names(confidenceVector) <- uniqueRank
samp_matrix_w <- matrix(sample(length(testSeq),3200,replace = TRUE),nrow=100,ncol=32)
cat('testRank ', testRank, '\n')
for(j in 1:100) {
cat('bootstrapNo : ', j, '\n')
testSeq <- unlist(testSeq)
sampleKmerIndices <- samp_matrix_w[j,]#sample(length(testSeq), s, replace = FALSE)
bootstrappedKmers <- testSeq[sampleKmerIndices]
overlapVector <- sapply(training_db_seqs, k = bootstrappedKmers, FUN = function(X,k) {
t1 = unlist(X)
t2 = unlist(k)
length(intersect(t1, t2))
})
predicted <- training_db_rank[sample(which(overlapVector == max(overlapVector)))[1]]
confidenceVector[predicted] <- confidenceVector[predicted] + 1
cat('Predicted In bootstrap : ', predicted,'\n')
}
bs_confidence_vector[i] <- confidenceVector[predictedRankFromPredictions]
cat('Query Seq : ', i,'\n')
cat('Final Prediction : ', testRank, '\n')
}
# Now we can use the existing bootstrap to begin
# our predictions using overlapping k-mers
# Now instead of using only part of the sequence,
# we will use full length sequences and find out
# the correct genus using the annotation.
savelink <- paste(c('origSintaxBootstrap_',k,'mers_s',s,'_',end,'.RData'), collapse = "")
save(bs_confidence_vector, file = savelink)
# ----------------------------------------------------------------------------------------