# preprocessing.py
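"""Preprocessing utilities for the KPRIS dataset.

Covers loading the CSV, stemming the abstracts, tokenizing them into integer
sequences, and building (context, target, document) training arrays.
"""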
import os

import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn import preprocessing


def load_dataset(args):
    """Load the KPRIS dataset and return abstracts with integer-encoded labels."""
    df = pd.read_csv(os.path.join(args.dataset_path, "kpris_data.csv"))
    # label encoder for the target categories
    le = preprocessing.LabelEncoder()
    # select only the requested categories
    if args.dataset != "5_categories":
        categories = args.dataset.split("_")
        target_index = np.where(np.isin(df.target.values, categories))[0]
        x_data = df.abstract.values[target_index]
        y_data = le.fit_transform(df.target.values[target_index])
    # otherwise use all categories of the KPRIS dataset
    else:
        x_data = df.abstract.values
        y_data = le.fit_transform(df.target.values)
    assert len(x_data) == len(y_data)
    print("Number of abstracts: {}, targets: {}".format(len(x_data), len(y_data)))
    return x_data, y_data


def stemming(sentences):
    """Apply Snowball stemming to every word of every sentence."""
    stemmer = SnowballStemmer("english")
    stemming_sentences = []
    for sent in sentences:
        stem_sent = " ".join([stemmer.stem(word) for word in sent.split()])
        stemming_sentences.append(stem_sent)
    print("Stemming process done")
    return stemming_sentences


def get_sequences(sentences, args):
    """Tokenize sentences into integer sequences padded with window-size zeros."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    # word -> index mapping built by the tokenizer
    word_index = tokenizer.word_index
    print("Found {} unique tokens.".format(len(word_index)))
    # vocabulary size; index zero is reserved for padding
    vocab_size = len(word_index) + 1
    print("Vocab size is {}".format(vocab_size))
    # pad each sequence with window_size zeros on both sides
    sequences = [[0] * args.window_size + sequence + [0] * args.window_size for sequence in sequences]
    # count how many training samples the padded sequences yield
    instances = np.sum([len(sequence) - 2 * args.window_size for sequence in sequences])
    print("Training samples: {}".format(instances))
    return sequences, word_index, vocab_size, instances


def get_trainable_data(sequences, instances, args):
    """Build (context, target, document) arrays from the padded sequences."""
    # context: the window of word indices surrounding each target word
    context = np.zeros(shape=(instances, args.window_size * 2 + 1), dtype=np.int32)
    # target: the center word to predict
    target = np.zeros(shape=(instances, 1), dtype=np.int32)
    # document: the id of the abstract each sample comes from
    document = np.zeros(shape=(instances, 1), dtype=np.int32)
    k = 0
    for doc_id, sequence in enumerate(sequences):
        for i in range(args.window_size, len(sequence) - args.window_size):
            context[k] = sequence[i - args.window_size:i + args.window_size + 1]
            target[k] = sequence[i]
            document[k] = doc_id
            k += 1
    # drop the target word from its own context window
    context = np.delete(context, args.window_size, axis=1)
    print("Trainable data setup finished")
    return context, target, document
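

# Minimal usage sketch (an assumption, not part of the original script): the
# functions above only read dataset_path, dataset and window_size off an
# ``args`` object, so a simple argparse namespace with hypothetical defaults
# is enough to drive the pipeline end to end.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default=".")
    parser.add_argument("--dataset", type=str, default="5_categories")
    parser.add_argument("--window_size", type=int, default=2)
    args = parser.parse_args()

    x_data, y_data = load_dataset(args)
    stemmed = stemming(x_data)
    sequences, word_index, vocab_size, instances = get_sequences(stemmed, args)
    context, target, document = get_trainable_data(sequences, instances, args)
    print(context.shape, target.shape, document.shape)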