-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlotov1Predictor.R
101 lines (75 loc) · 2.71 KB
/
lotov1Predictor.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# In version 1 of loto, we have a balanced training set where
# in our training set. for every prediction, we have to leave out
# the entire taxon. In order to do this, we find out the genus of
# our sequence and check the training set for the indices with that
# genus remove that genus from the entire thing.
args = (commandArgs(TRUE))
if(length(args)==0){
print("No arguments supplied.")
##supply default values
start = 1
end = 13212
k = 8
s = 32
}else{
for(i in 1:length(args)){
eval(parse(text=args[[i]]))
}
}
load('rdpDataframe.RData')
loadfilename <- paste(c('tfidf',k,'mers.RData'),collapse = "")
load(loadfilename)
rank <- rdp$genus
names(rank) <- rank
sequences <- rdp$sequences
uniqueRank <- unique(rank)
names(uniqueRank) <- uniqueRank
mers <- lapply(sequences,
function (x) {
substring(x,
1:(nchar(x) - k + 1L),
k:nchar(x))
})
names(mers) <- rank
# start and end will be split by the number of unique sequences/
# We now proceed to get the unique seqeunces. So in order to do this, we
# get the indices of every unique sequence and take any one of them into our
# testing sequence (here we take the first one for convenience).
testingSeqsIndices <- lapply(uniqueGenera, function(x) {
which(x == rdp$genus)[1]
})
testingSeqsIndices <- unlist(testingSeqsIndices)
testingSeqs <- mers[testingSeqsIndices]
# Now that we have our testing sequences, we have to set our training sequences
# to be the set of seqeuences without the genus of our testing seqeunce.
tfidfVals <- eval(parse(text = paste(c('tfidf',k,'mers'), collapse = '')))
predictionVector <- vector(mode = 'character', length = length(testingSeqs))
for(i in start:end) {
tfidfSeq <- tfidfVals[[testingSeqsIndices[i]]]
testSeq <- testingSeqs[i]
# Now that we have the test genus, we to edit our training sequences such that
# we remove those sequences that dont have the same sequences as the genus
testGenus <- uniqueRank[i]
testGenusIndices <- which(rank %in% testGenus)
training_db_seqs <- mers[-testGenusIndices]
training_db_rank <- rank[-testGenusIndices]
overlapVector <- sapply(training_db_seqs, k = testSeq, FUN = function(X,k) {
t1 = unlist(k)
t2 = unlist(X)
# length(intersect(t1,t2))
sum(tfidfSeq[intersect(t1, t2)])
})
maxPos <- which(overlapVector == max(overlapVector))
if(length(maxPos) > 1) {
maxPos <- sample(maxPos)[1]
} else {
maxPos <- maxPos[1]
}
predicted <- training_db_rank[maxPos]
predictionVector[i] <- predicted
cat('Number : ', i ,'\n')
cat('Test Rank ', testGenus,'\n')
cat('Predicted ', predicted, '\n')
}
savelink <- paste(c('LOTOv1Prediction_',k,'mers_',end,'_v3.RData'), collapse = "")
save(predictionVector, file = savelink)