generated from aaivu/aaivu-project-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFeatureTransformer.py
58 lines (49 loc) · 2.01 KB
/
FeatureTransformer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
import numpy as np
from MemoryTagger import MemoryTagger
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Build one numeric feature vector per token from a token table.

    The input ``X`` is indexed by column name and each column exposes
    ``.values.tolist()`` (presumably a pandas DataFrame -- confirm against
    the caller).  ``fit`` reads the ``"word"``, ``"POS"`` and ``"tag"``
    columns; ``transform`` reads only ``"word"`` and ``"POS"``.

    Each output vector holds, in this order:

    ``istitle, islower, isupper, len, isdigit, isalpha`` of the word, the
    encoded memory-tagger tag of the word, the encoded POS of the word, the
    encoded tag of the next word, the encoded tag of the previous word, the
    encoded POS of the next word, the encoded POS of the previous word.
    """

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()

    def fit(self, X, y=None):
        """Fit the memory tagger and both label encoders on ``X``.

        Parameters
        ----------
        X : table with ``"word"``, ``"POS"`` and ``"tag"`` columns.
        y : ignored.  It defaults to ``None`` so that ``fit_transform(X)``,
            which calls ``fit(X)`` when no target is given, works.

        Returns
        -------
        self
        """
        words = X["word"].values.tolist()
        # Kept as an attribute for compatibility with code (or pickles)
        # that still read it; transform() no longer scans this list.
        self.pos = X["POS"].values.tolist()
        tags = X["tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        self.pos_encoder.fit(self.pos)
        return self

    def transform(self, X, y=None):
        """Return a list with one 1-D feature array per row of ``X``.

        POS values not seen during ``fit`` are encoded as ``-1``.  Where a
        token has no next word, no previous word, or its previous word is
        ``"."``, the neighbour's tag is the encoded ``'EXTRA'`` label and the
        neighbour's POS is the encoding of ``"."``.

        Raises
        ------
        ValueError
            From ``LabelEncoder.transform`` when ``'EXTRA'`` or a tag
            returned by the memory tagger is not among the fitted tags.
        """
        words = X["word"].values.tolist()
        pos = X["POS"].values.tolist()
        if not words:
            # Nothing to encode; also avoids encoding 'EXTRA' needlessly.
            return []

        # O(1) POS lookup built from the fitted encoder.  The classes are
        # exactly the distinct training POS values, so membership matches a
        # scan of the training list, and the codes come from the encoder
        # itself so their numeric type is unchanged.
        pos_classes = self.pos_encoder.classes_
        pos_codes = dict(
            zip(pos_classes.tolist(), self.pos_encoder.transform(pos_classes))
        )

        def pos_default(p):
            return pos_codes.get(p, -1)

        # A word can be looked at as the previous, current and next token;
        # predict and encode each distinct word only once.
        # NOTE(review): assumes MemoryTagger.predict returns the same tag
        # for the same word on every call -- confirm.
        tag_code_cache = {}

        def memory_tag_code(word):
            code = tag_code_cache.get(word)
            if code is None:
                code = self.tag_encoder.transform(
                    self.memory_tagger.predict([word])
                )[0]
                tag_code_cache[word] = code
            return code

        # Loop-invariant boundary context, computed once.
        boundary_tag = self.tag_encoder.transform(['EXTRA'])[0]
        boundary_pos = pos_default(".")

        last = len(words) - 1
        out = []
        for i, (w, p) in enumerate(zip(words, pos)):
            if i < last:
                wp = memory_tag_code(words[i + 1])
                posp = pos_default(pos[i + 1])
            else:
                wp = boundary_tag
                posp = boundary_pos

            # A preceding "." is treated like having no previous word.
            if i > 0 and words[i - 1] != ".":
                wm = memory_tag_code(words[i - 1])
                posm = pos_default(pos[i - 1])
            else:
                wm = boundary_tag
                posm = boundary_pos

            out.append(np.array([
                w.istitle(), w.islower(), w.isupper(), len(w),
                w.isdigit(), w.isalpha(),
                memory_tag_code(w),
                pos_default(p), wp, wm, posp, posm,
            ]))
        return out