-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkTSP.R
372 lines (309 loc) · 12.8 KB
/
kTSP.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
library(switchBox)
############################################
#
# TUTORIAL
#
############################################
### LOADING DATA
### Load gene expression data for the test set
data(trainingData)
### Show the class of the ``matTesting'' object
class(matTraining)
### Show the dimentions of the ``matTesting'' matrix
dim(matTraining)
### Show the first 10 sample names of the ``matTest'' matrix
head(colnames(matTraining), n=10)
### ENTRAINEMENT
### Train a classifier using default filtering function based on the Wilcoxon test
classifier <- SWAP.KTSP.Train(matTraining, trainingGroup)
#scores = SWAP.Calculate.BasicTSPScores(testingGroup, matTesting)
classifier
### Apply the classifier to the TRAINING set using default decision rule
trainingPrediction <- SWAP.KTSP.Classify(matTraining, classifier)
### Resubstitution performance in the TRAINING set
### Define a "positive" test result if needed
table(trainingPrediction, trainingGroup)
table(trainingPrediction, trainingGroup)
############################################
#
# NOS DONNEES
#
############################################
### PREPARE DATA
setwd("C:/Users/lmgue/OneDrive/M2/PROJET/")
#setwd("/home/lmgueguen/Documents/M2/PROJET/")
df = read.csv("projet_data/minusRef_fc2.csv", header = TRUE, dec = ",")
dim(df)
df = unique.data.frame(df)
dim(df)
df = df[,2:58] #removes first column
colnames(df)[1] <- "Genes"
unknown = which(df['Genes'] == '?') #removes genes with "?" as names
df = df[-unknown,]
df$Genes <- make.names(df$Genes, unique = TRUE)
rownames(df) <- df$Genes
df <- subset(df, select = -c(Genes))
col_names = colnames(df)
col_names = gsub('^SLS.*', 'autre', col_names) #uses regex to change lines starting with SLS (followed by any character any number of times), by irritant
col_names = gsub('^[^[:lower:]].*' ,'allergique' , col_names) #changes lines starting NOT by a lower character and followed by any any number of times by allergique
colnames(df) <- col_names
colnames(df)
dim(df)
### MATRICE
# prepare train and test matrix, 80% and 20% of the data set (respectively 46 and 11 samples)
trainmat = as.matrix(df[,1:49])
rownames(trainmat) <- rownames(df)
testmat = as.matrix(df[,50:56])
rownames(testmat) <- rownames(df)
### ENTRAINEMENT
classifier <- SWAP.KTSP.Train(trainmat, as.factor(colnames(df)[1:49]), krange = c(7))
classifier
# $name
# [1] "7TSPs"
#
# $TSPs
# [,1] [,2]
# [1,] "CLDN23" "GBP1"
# [2,] "PLA2G4F" "PARP9.2"
# [3,] "SCCPDH" "PLSCR1.1"
# [4,] "GAL3ST4.1" "CDKN2D"
# [5,] "LY6K" "NMI"
# [6,] "ANKK1" "DTX3L.1"
# [7,] "GSTA3" "ERAP2.1"
#
# $score
# [1] 1.0000076 1.0000058 1.0000050 1.0000048 1.0000044 1.0000035 0.9743641
#
# $labels
# [1] "allergique" "non-allergique"
prediction1 = SWAP.KTSP.Classify(trainmat, classifier)
reality1 <- as.factor(colnames(df)[1:49])
table(prediction1, reality1)
prediction2 = SWAP.KTSP.Classify(testmat, classifier)
reality2 <- as.factor(colnames(df)[50:56])
table(prediction2, reality2)
# reality2
# prediction2 allergique non-allergique
# allergique 6 0
# non-allergique 0 1
############################################
#
# CROSS-VALIDATION
#
############################################
### KFOLD STRATIFIE
#la premiere fonction separe le jeu de donnees en dataframes par categories, en calculant la proportion du jeu de donnees initiale pour chaque categorie
sep_by_label = function(df) {
#
#' df : dataframe. Genes as rows and individuals as columns.
#' This function separates the dataframe into a list of dataframes, each list corresponding to a class
#
if (is.data.frame(df) == FALSE) {
stop("ERROR: df must be a dataframe.")
}
list_prop <- list() #list of the proportion of each class
class_labels <- unique(colnames(df))
for (i in 1:(length(class_labels))) {
prop_class <- length(which(colnames(df) == class_labels[i])) / dim(df)[2]
list_prop <- append(list_prop, prop_class)
}
S3_object <- list(class_labels, list_prop) #initialise S3 object with the proportions for each class
for (i in 1:length(class_labels)) {
S3_object <- append(S3_object, list(subset(df, select = which(colnames(df) == class_labels[i])) )) #adds a separated dataframe for each class
}
names(S3_object) <- as.list(c("labels", "proportion", class_labels)) #renames the S3 object attributes
for (i in S3_object$labels) {
colnames(S3_object[[i]]) <- rep(i, dim(S3_object[[i]])[2])
}
return(S3_object) #returns S3 class object with proportions for each class, dataframes for the data from each class, in the same order
}
#WORKS GREAT HELL TEAH BABY
rename_ <- function(x, i) {
#
#' x : dataframe
#' i : a class label
#' This function renames the classes correctly after some manipulation leads to the addition of unwanted characters to column's names
#
col_names <- colnames(x)
col_names <- gsub(paste('^.*', i[1], '.*$', sep = ""), i[1], col_names)
col_names <- gsub(paste('^.*', i[2], '.*$', sep = ""), i[2], col_names)
colnames(x) <- col_names
return(x)
}
strat_kfold = function(df, k) {
#
#' df : a dataframe
#' k : numnber of subsets wanted
#' This function uses the data separated by class to yield a list of dataframes, each being a sub-dataset with the original proportions of each class.
#
if (is.data.frame(df) == FALSE) {
stop("ERROR: df must be a dataframe.")
}
data_by_label <- sep_by_label(df) #splits dataframe by class in sub dataframe as S3 class object attributes
sets_index = list() #list of indexes for the k subsets for kfold cv
sets_index <- lapply(data_by_label[3:length(data_by_label)], function(x) split(sample(c(1:dim(x)[2]), dim(x)[2]), sort(c(1:dim(x)[2])%%k +1))) #list, for each class there are k lists containing the indexes
names(sets_index) <- as.list(data_by_label$labels)
data_by_fold <- lapply(data_by_label$labels, function(x) lapply(sets_index[[x]], function(y) data_by_label[[x]][,y])) #here the indexes are used to make a list of lists : level one is the class, level 2 (sublevel) is the fold
names(data_by_fold) <- as.list(data_by_label$labels)
folds <- list() #initialises the final object
folds_names <- list()
for (i in c(1:k)) {
folds <- append(folds, list(as.data.frame(lapply(data_by_label$labels, function(x) data_by_fold[[x]][i]) )) ) #adds class1 and class2 datasets together to for k sub datasets with equal proportions of each class
folds <- lapply(folds, function(x) rename_(x, data_by_label$labels))
folds_names <- append(folds_names, paste("fold", i, sep = ""))
}
names(folds) <- folds_names
return(folds)
}
classify <- function(df_by_fold, data_by_label, fold, k = 3, N) {
#
#' df_by_fold : list of dataframes. Each dataframe is a fold for k-fold cross-validation
#' data_by_label : list of labels, proportions of each label, and dataframes for each class
#' fold : element of the list df_by_fold. Stratified subset of data to use as validation data for the cross-validation round.
#' k : number of folds for cross-validation
#' N : number of TSPs to retain.
#' This function uses the fold as validation data and the other subsets as training data. Returns a confusion matrix, the metrics and the k TSPs from the classifier.
#
require(switchBox)
validation_data <- fold #validation set
validation_matrix <- as.matrix(validation_data)
reality <- as.factor(colnames(validation_data)) #truth about validation set labels
train_data <- df_by_fold
train_data$i <- NULL #eliminates the validation dataset from the list of all folds
train_data <- do.call("cbind", train_data) #binds together the training folds
train_data <- rename_(train_data, data_by_label$labels)
train_matrix <- as.matrix(train_data)
classifier <- SWAP.KTSP.Train(train_matrix, as.factor(colnames(train_data)), krange = N) #trains classifier
prediction <- SWAP.KTSP.Classify(validation_matrix, classifier) #uses classifier to ... classify the validation set
result_table <- table(prediction, reality)
print(result_table)
error_metrics(result_table)
print(classifier$TSPs)
}
cross_validation <- function(df, k = 3, N = 3) {
#
#' df : dataframe
#' k : number of folds for cross-validation
#' N : number of TSPs to retain
#' This function applies cross-validation using k-TSP to the data. It returns the confusion matrix, metrics and N TSPs for each cross-validation round.
#
data_by_label <- sep_by_label(df)
data_by_fold <- strat_kfold(df, k)
lapply(data_by_fold, function(x) classify(data_by_fold, data_by_label, x, k, N))
}
error_metrics = function(table) {
#
#' table : confusion matrix (actually a table)
#' This function calculates precision, accuracy, and recall scores of a classification with known labels/classes.
#
TP <- table[1,1]
TN <- table[2,2]
FP <- table[1,2]
FN <- table[2,1]
precision <- (TP) / (TP + FP)
accuracy_model <- (TP + TN) / (TP + TN + FP + FN)
recall <- (TP) / (TP + FN)
print( paste("Precision value of the model: ", round(precision, 3)))
print( paste("Accuracy of the model: ", round(accuracy_model, 3)))
print( paste("Recall of the model: ", round(recall, 3)))
}
data_by_label = sep_by_label(df) #Marche pas avec axis = 1
data_by_fold <- strat_kfold(df, k = 3)
cross_validation(df, k = 4, N = 5)
############################################
#
# CLASSIFIER
#
############################################
compare_rules <- function(individual, confidence = FALSE) {
#
#' individual : columns name of a dataframe that we want to assign to a class
#' confidence : boolean. Shows confidence in class attribution
#' This function uses kTSP rules from the stratified cross-validation to assign individuals to a class of eczema.
#
autre = 0
allergique = 0
if (df["GBP1", individual] < df["LY6K", individual]) {
autre = autre + 1
} else {
allergique = allergique + 1
}
if (df["PLSCR1.1", individual] < df["SCCPDH", individual]) {
autre = autre + 1
} else {
allergique = allergique + 1
}
if (df["CDKN2D", individual] < df["PLA2G4F", individual]) {
autre = autre + 1
} else {
allergique = allergique + 1
}
if (df["DTX3L.1", individual] < df["ANKK1", individual]) {
autre = autre + 1
} else {
allergique = allergique + 1
}
if (df["ERAP2.1", individual] < df["GSTA3", individual]) {
autre = autre + 1
} else {
allergique = allergique + 1
}
if (autre > allergique) {
if (confidence) {
print(autre/5)
return("autre")
} else {
return("autre")
}
} else {
if (confidence) {
print(allergique/5)
return("allergique")
} else {
return("allergique")
}
}
}
eczema_classifier <- function(df, confidence = FALSE) {
#
#' df : dataframe. Genes must be in rows and individuals in columns
#' confidence : boolean. Shows confidence in class attribution
#' This function is the actual classifier to distinguish allergic eczema from other types. Returns a list with classes for each individual.
#
df <- df[ which(rownames(df) %in% c("LY6K", "GBP1", "SCCPDH", "PLSCR1.1", "PLA2G4F", "CDKN2D", "ANKK1", "DTX3L.1", "GSTA3", "ERAP2.1")) , ]
classes <- lapply(colnames(df), function(x) compare_rules(x, confidence))
return(classes)
}
############################################
#
# TEST SUR DONNEES MICROARRAY
#
############################################
test = eczema_classifier(df)
test = eczema_classifier(df, confidence = TRUE) #permet de visualiser la confiance en l'attribution de la classe. Compris dans [0;1]
#test == colnames(df) #permet de comparer les prédictions avec les classes déjà attribuées du jeu de données
sum(test == colnames(df)) #la valeur TRUE vaut 1, et FALSE vaut 0, donc faire la sum d'un array de TRUE et FALSE permet de connaître le nombre de TRUE
sum(test == colnames(df))/dim(df)[2]#par extension, on peut transformer cette somme en % pour connaître la justesse d'autant de prédictions que l'on veut (sans regarder "à la main", ce qui est horrible sur bcp de données)
############################################
#
# CROSS VALIDATION qRT-PCR
#
############################################
#traitement des donnees qrt-pcr
df1 = read.csv("projet_data/jeu de donn?es classif apprentissage.csv", dec = ",") #chargement
df1 <- df1[ , ! names(df1) %in% c("patients")] #suppression colonne index patients
col_names <- as.list(df1["Diagnosis"])$Diagnosis #sauvegarde des classes
df1 <- df1[ , ! names(df1) %in% c("Diagnosis")] ##suppression de la colonne diagnostic
row_names <- colnames(df1) #rotation de la dataframe
df1 <- t(df1)
df1 <- as.data.frame(df1)
df1 <- lapply(df1, function(x) as.numeric(x))
df1 <- as.data.frame(df1)
colnames(df1) <- col_names
rownames(df1) <- row_names
#la dataframe est prête
# test des foncitons sur les donnees traitees
data_by_label = sep_by_label(df1) #
data_by_fold <- strat_kfold(df1, k = 3)
cross_validation(df1, k = 4, N = 5)
#les résultatas sont moins bons, donc on garde ceux des micro-array.