-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsintaxTfidfBootstrapv3.2WR.R
229 lines (164 loc) · 6.86 KB
/
sintaxTfidfBootstrapv3.2WR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# tfidf contains the tf and idf functions, whose values will be returned
# into the current namespace.
# About the functions :
# ----- tf(query, dataset, mode)
# This returns the term frequency of the kmers in the query seqeunce as
# a vector, where the names of the vector is the kmers and the values
# of the vector is the term frequency.
# >>>> Possible modes : rawsmooth, lognormal, doublenormK (default k = 0.5)
# ---- idf(query, dataset, mode)
# This returns the inverse document frequency as a vector that contains,
# the idf values for every query in our sequence against the dataset.
# The vector names are the names of the kmers and their respective
# idf values.
# >>>> Possible modes : log, logsmooth, logmax, probfreq
# Refer to tfidf wikipedia page for more information on the tfidf.
# source('tfidf.R')
args = (commandArgs(TRUE))
if(length(args)==0){
print("No arguments supplied.")
##supply default values
start = 1
end = 13212
k = 8
s = 32
}else{
for(i in 1:length(args)){
eval(parse(text=args[[i]]))
}
}
loadfilename <- paste(c('tfidf',k,'mers.RData'),collapse = "")
load(loadfilename)
load('rdpDataframe.RData')
rank <- rdp$genus
names(rank) <- rank
sequences <- rdp$sequences
uniqueRank <- unique(rank)
names(uniqueRank) <- uniqueRank
mers <- lapply(sequences,
function (x) {
substring(x,
1:(nchar(x) - k + 1L),
k:nchar(x))
})
names(mers) <- rank
# MODIFICATION TO EXISTING SINTAX ALGORITHM
# -------------------------- THE ALGORTHM -----------------------------------
# Singleton sequences must be present in our query database but
# not in our
query_ranks <- rank
query_seqs <- mers
# Removing the singleton sequences from our reference database.
refernece_db_ranks <- rank
refernece_db_seqs <- mers
bs_confidence_vector <- vector(mode = 'integer', length=length(mers))
names(bs_confidence_vector) <- rdp$genus
tfidfVals <- eval(parse(text = paste(c('tfidf',k,'mers'), collapse = '')))
predictionVector <- vector(mode = 'character', length = length(rank))
for(i in start:end)
{
tfidfSeq <- tfidfVals[[i]]
testSeq <- mers[[i]]
testRank <- rank[[i]]
# predictedRankFromPredictions <- predicted_RDP_sintax[i]
training_db_rank <- rank[-i]
training_db_seqs <- mers[-i]
confidenceVector <- vector(mode = 'integer', length = length(uniqueRank))
names(confidenceVector) <- uniqueRank
sequence_df <- data.frame(matrix(NA, nrow = 100, ncol = 5))
samp_matrix_w <- matrix(sample(length(testSeq),3200,replace = TRUE),nrow=100,ncol=32)
cat('testRank ', testRank, '\n')
for(j in 1:100) {
cat('bootstrapNo : ', j, '\n')
testSeq <- unlist(testSeq)
sampleKmerIndices <- samp_matrix_w[j,]#sample(length(testSeq), s, replace = FALSE)
bootstrappedKmers <- testSeq[sampleKmerIndices]
# The following is our overlap vector, which we can use to find the
overlapVector <- sapply(training_db_seqs, k = bootstrappedKmers, FUN = function(X,k) {
t1 = unlist(X)
names(t1) <- t1
t2 = unlist(k)
names(t2) <- t2
# So instead of just getting the overlaps as 1s and 0s
# you would have to find the overlap in the tfidfseq
# we could subtract the scorees of those kmers that didnt match herer
# so if we calculate the tfidf scores of those that matched, we simply
# sum up for those values, but we can also subtract the tfidf values
# of those kmers that didnt match.
# furthermore, if we have a geoup that is big, then its influence has to be smaller
# than that of a group that is smaller but we dont want it to affect so much
# that it completely throws off the confidence.
# so for example, if we have a really large group, then its influence on the confidence
# would be much smaller because we would be multiplying the final confidence by
# 1-(breadth of group)/13212
# So a smaller group would leave the confidence at near one but a bigger group could skew it
# lower.
common <- intersect(t1, t2)
uncommon <- t2[-which(t2 %in% common)]
#cat('number of common kmers ', length(common),'\n')
#cat('number of uncommon kmers ', length(uncommon),'\n')
s1 <- sum(tfidfSeq[common])
s2 <- sum(tfidfSeq[uncommon])
#cat('s1 = ', s1, '| s2 = ',s2,'\n')
score <- s1 - s2
#cat('score = ', score,'\n')
return(score)
})
# This will return the overlap vector with the hi*di product
# the next step is to divide them into the
maxPos <- which(overlapVector == max(overlapVector))
if(length(maxPos) > 1) {
maxPos <- sample(maxPos)[1]
} else {
maxPos <- maxPos[1]
}
predicted <- training_db_rank[maxPos]
hi <- max(overlapVector)
# confidenceVector[predicted] <- confidenceVector[predicted] + 1
cat('Predicted In bootstrap : ', predicted,'\n')
cat('Score Value : ', hi,'\n')
# Now that we have the common kmers, we can use the same kmers to get the di from the tfidfSeq
# we can use the common kmers to get the values that we need and store it in a dataframe
di <- sum(tfidfSeq[bootstrappedKmers])
sequence_df[j,1] <- predicted
sequence_df[j,2] <- hi
sequence_df[j,3] <- di
}
# Once we have the values for all the bootstraps. we need to calculate te di/davg fore every bootstrap
# we do this by doing the following
davg <- sum(sequence_df[,3])/100
sequence_df[,4] <- sequence_df[,3]/davg
# In this version, our confidence is determined by the fact that we tie the
# goodness of our bootstrap with how good our prediction was in that bootstrap
sequence_df[,5] <- sequence_df[,4] * (sequence_df[,2]/sequence_df[,3])
# we run into an issue where our actual rank may not be present in one of the predicted ranks,
# if thats the case, then we need to control for it by assigning the value of tha bootstrap for that
# to 0.
uniquePredictions <- unique(sequence_df[,1])
cvec <- sapply(uniquePredictions, function(x) {
sum(sequence_df[which(sequence_df[,1] == x),5])
})
names(cvec) <- uniquePredictions
maxPos2 <- which(cvec == max(cvec))
if(length(maxPos2) > 1) {
maxPos2 <- sample(maxPos2)[1]
}else{
maxPos2 <- maxPos2[1]
}
confidence <- cvec[maxPos2]
prediction <- uniquePredictions[maxPos2]
bs_confidence_vector[i] <- confidence
predictionVector[i] <- prediction
cat('Query Seq : ', i,'\n')
cat('Final Prediction : ', testRank, '\n')
}
# Now we can use the existing bootstrap to begin
# our predictions using overlapping k-mers
# Now instead of using only part of the sequence,
# we will use full length sequences and find out
# the correct genus using the annotation.
savelink <- paste(c('confidence_',end,'_v3.2.RData'), collapse = "")
savelink2 <- paste(c('predictions_',end,'_v3.2.RData'), collapse = "")
save(bs_confidence_vector, file = savelink)
save(predictionVector, file = savelink2)
# ----------------------------------------------------------------------------------------