utils.py
"""utils file"""
from typing import List, Tuple
import numpy as np
import pandas as pd
import math
def create_dataset(data_path: str) -> Tuple[List[str], List[int]]:
dataset = pd.read_csv(data_path)
dataset = dataset.sample(frac=1).reset_index(drop=True) # shuffle the dataset
return list(dataset["sequence"]), list(dataset["label"])


def word_embedding(
    sequence: str,
    max_seq_length: int = 200,
    CONSIDERED_AA: str = "ACDEFGHIKLMNPQRSTVWYX",
):
    """Encode an amino-acid sequence as an integer vector of shape (1, max_seq_length)."""
    # amino-acid encoding: A -> 1, C -> 2, ...; the unknown/padding symbol "X" maps to 0
    aa_mapping = {aa: i + 1 for i, aa in enumerate(CONSIDERED_AA)}
    aa_mapping["X"] = 0

    # adapt sequence size
    if len(sequence) < max_seq_length:
        # left-pad short sequences; the pad character "0" is not in the mapping,
        # so padded positions stay encoded as 0
        sequence = sequence.zfill(max_seq_length)
    else:
        # truncate long sequences so indexing below stays within bounds
        sequence = sequence[:max_seq_length]

    # encode sequence
    encoded_sequence = np.zeros((max_seq_length,))  # (200,)
    for i, amino_acid in enumerate(sequence):
        if amino_acid in CONSIDERED_AA:
            encoded_sequence[i] = aa_mapping[amino_acid]
    model_input = np.expand_dims(encoded_sequence, 0)  # add batch dimension
    return model_input  # (1, 200)


def categorical_probas_to_classes(p):
    """Convert per-class probabilities of shape (n_samples, n_classes) to class indices."""
    return np.argmax(p, axis=1)


def calculate_performace(test_num, pred_y, labels):
    """Compute accuracy, sensitivity, specificity and MCC for binary predictions."""
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for index in range(test_num):
        if labels[index] == 1:
            if labels[index] == pred_y[index]:
                tp += 1
            else:
                fn += 1
        else:
            if labels[index] == pred_y[index]:
                tn += 1
            else:
                fp += 1
    acc = float(tp + tn) / test_num
    # small epsilon terms guard against division by zero
    sensitivity = float(tp) / (tp + fn + 1e-06)
    specificity = float(tn) / (tn + fp + 1e-06)
    mcc = float(tp * tn - fp * fn) / (
        math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) + 1e-06
    )
    return acc, sensitivity, specificity, mcc
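

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): shows how the helpers
    # above fit together. The probability array, labels, and amino-acid sequence
    # below are hypothetical placeholders, not data shipped with this repo.
    example_probas = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
    preds = categorical_probas_to_classes(example_probas)  # -> array([0, 1, 1])
    true_labels = [0, 1, 0]
    acc, sens, spec, mcc = calculate_performace(len(true_labels), preds, true_labels)
    print(f"acc={acc:.3f} sens={sens:.3f} spec={spec:.3f} mcc={mcc:.3f}")

    # Encode a single (hypothetical) sequence into the fixed-length model input.
    encoded = word_embedding("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ")
    print(encoded.shape)  # (1, 200)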