-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
Copy pathprocessor.py
69 lines (59 loc) · 1.92 KB
/
processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import io
import numpy as np
def load_vocab(file_path):
    """
    Build a token -> index mapping from a vocabulary file.

    The file is read as UTF-8 text, one entry per line. Trailing whitespace
    (including the newline) is stripped from every line, and the entry is
    mapped to its zero-based line number. When the same entry occurs on
    several lines, the index of the last occurrence is kept.

    Args:
        file_path: path of the vocabulary file to read.

    Returns:
        dict mapping each stripped line to its line index.
    """
    with io.open(file_path, 'r', encoding='utf8') as vocab_file:
        # enumerate() already yields ints, so the line number is used as-is.
        return {
            entry.rstrip(): line_no
            for line_no, entry in enumerate(vocab_file)
        }
def get_predict_label(pos_prob, threshold=0.5):
    """
    Convert a prediction probability to a (label, key) pair.

    Args:
        pos_prob: score compared against ``threshold`` (by its name, the
            probability of the positive "porn" class).
        threshold: cut-off at or above which the positive label is
            returned. Defaults to 0.5, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        ``(1, "porn")`` when ``pos_prob >= threshold``,
        otherwise ``(0, "not_porn")``.
    """
    # The comparison is inclusive: a score exactly on the threshold is
    # classified as positive.
    if pos_prob >= threshold:
        return 1, "porn"
    return 0, "not_porn"
def preprocess(predicted_data, tokenizer, vocab, sequence_max_len=256):
    """
    Convert each text to a fixed-length list of word ids.

    For every text, all whitespace is removed, the result is passed to
    ``tokenizer.tokenize``, the tokens are truncated to
    ``sequence_max_len`` and looked up in ``vocab`` (unknown tokens map to
    the ``'<UNK>'`` id), and the id list is right-padded with the
    ``'<PAD>'`` id up to ``sequence_max_len``.

    Args:
        predicted_data: iterable of text strings. Any iterable works,
            including generators; it is traversed once.
        tokenizer: object exposing ``tokenize(str)`` that returns a
            sliceable sequence of tokens.
        vocab: mapping from token to id; must contain ``'<PAD>'`` and
            ``'<UNK>'``.
        sequence_max_len: length every id list is truncated/padded to.

    Returns:
        list of dicts, one per input text, with keys ``'processed'``
        (the id list) and ``'origin'`` (the untouched input text).

    Raises:
        KeyError: if ``vocab`` is a dict lacking ``'<PAD>'`` or ``'<UNK>'``.
    """
    padding = vocab['<PAD>']
    unknown = vocab['<UNK>']

    result = []
    for text in predicted_data:
        # Whitespace is dropped before tokenizing, so the tokenizer sees
        # one contiguous string.
        tokens = tokenizer.tokenize(''.join(text.split()))
        wids = [vocab.get(token, unknown) for token in tokens[:sequence_max_len]]
        # A non-positive repeat count yields an empty list, so sequences
        # that are already long enough are left unchanged.
        wids += [padding] * (sequence_max_len - len(wids))
        result.append({'processed': wids, 'origin': text})
    return result
def postprocess(predict_out, texts):
    """
    Turn the model output into one result record per input text.

    ``predict_out.as_ndarray()`` is called to obtain the score array; row
    ``i`` is read for ``texts[i]``. The label is the argmax of the row
    (0 -> 'not_porn', anything else -> 'porn'), and columns 0 and 1 are
    reported as the not-porn / porn scores rounded to four decimals.

    Args:
        predict_out: object exposing ``as_ndarray()``.
            NOTE(review): presumably a (len(texts), 2) array of class
            probabilities -- confirm against the caller.
        texts: sequence of dicts carrying the original text under 'origin'.

    Returns:
        list of dicts with keys 'text', 'porn_detection_label',
        'porn_detection_key', 'porn_probs' and 'not_porn_probs'.
    """
    scores = predict_out.as_ndarray()
    records = []
    for row_idx, item in enumerate(texts):
        origin = item['origin']
        row = scores[row_idx]
        label = int(np.argmax(row))
        records.append({
            'text': origin,
            'porn_detection_label': label,
            'porn_detection_key': 'not_porn' if label == 0 else 'porn',
            # Formatting through '%.4f' rounds the score to four decimals.
            'porn_probs': float('%.4f' % row[1]),
            'not_porn_probs': float('%.4f' % (row[0])),
        })
    return records