
Commit

update rnn
Xiao9905 committed Sep 1, 2019
1 parent ecf0905 commit 351da18
Showing 5 changed files with 22 additions and 139 deletions.
47 changes: 18 additions & 29 deletions core/rnn/data_loader.py
100644 → 100755
@@ -6,10 +6,14 @@
import json
import codecs
from sklearn.model_selection import train_test_split
import _pickle

from core.utils import data_utils
from core.utils import settings

from keras.preprocessing.text import text
from keras.preprocessing.sequence import pad_sequences

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') # include timestamp

@@ -24,7 +28,7 @@ def __init__(self, file_dir, seed, shuffle, max_sequence_length, max_key_sequenc
self.msl = max_sequence_length
self.mksl = max_key_sequence_length
self.train_data = json.load(codecs.open(join(settings.VENUE_DATA_DIR, 'train.txt'), 'r', 'utf-8'))
self.vocab = {}
self.tokenizer = _pickle.load(open(join(settings.DATA_DIR, 'venues', "tokenizer"), "rb"))
self.vocab_size = self.split_and_tokenize()
self.stop_list = []
self.batch_size = batch_size
@@ -40,53 +44,38 @@ def __init__(self, file_dir, seed, shuffle, max_sequence_length, max_key_sequenc
self.length_aminer = []
self.jaccard = []
self.inverse_pairs = []
for pair in self.train_data:
len_mag, len_aminer, keyword_mag, keyword_aminer, jaccard, inverse_pairs = self.preprocess(pair[1], pair[2])
pair[1] = pair[1] + [0] * (self.msl - len(pair[1])) if len(pair[1]) <= self.msl else pair[1][:self.msl]
pair[2] = pair[2] + [0] * (self.msl - len(pair[2])) if len(pair[2]) <= self.msl else pair[2][:self.msl]
self.mag = self.tokenizer.texts_to_sequences([p[1] for p in self.train_data])
self.aminer = self.tokenizer.texts_to_sequences([p[2] for p in self.train_data])
for i, pair in enumerate(self.train_data):
len_mag, len_aminer, keyword_mag, keyword_aminer, jaccard, inverse_pairs = self.preprocess(self.mag[i], self.aminer[i])
self.labels.append(pair[0])
self.mag.append(pair[1])
self.aminer.append(pair[2])
self.length_mag.append(len_mag)
self.length_aminer.append(len_aminer)
self.keyword_mag.append(keyword_mag)
self.keyword_aminer.append(keyword_aminer)
self.jaccard.append([np.float32(jaccard)] * (multiple * 2))
self.inverse_pairs.append([np.float32(inverse_pairs)] * multiple)
self.mag = pad_sequences(self.mag, maxlen=self.msl)
self.aminer = pad_sequences(self.aminer, maxlen=self.msl)
self.labels, self.mag, self.aminer, self.length_mag, self.length_aminer, self.keyword_mag, self.keyword_aminer, self.jaccard, self.inverse_pairs = np.array(
self.labels), np.array(self.mag), np.array(self.aminer), np.array(self.length_mag), np.array(
self.length_aminer), np.array(self.keyword_mag), np.array(self.keyword_aminer), np.array(
self.jaccard), np.array(self.inverse_pairs)
logger.info('training pairs loaded')

if shuffle:
self.mag, self.aminer, self.labels = sklearn.utils.shuffle(self.mag, self.aminer, self.labels,
random_state=seed)
self.n_pairs = len(self.labels)
logger.info('all pairs count %d', self.n_pairs)

def split_and_tokenize(self):
for i, pair in enumerate(self.train_data.copy()):
seq1 = pair[1].split(' ')
seq2 = pair[2].split(' ')
pass
for j, w in enumerate(seq1.copy()):
if w not in self.vocab:
self.vocab[w] = len(self.vocab) + 1
seq1[j] = self.vocab[w]
for j, w in enumerate(seq2.copy()):
if w not in self.vocab:
self.vocab[w] = len(self.vocab) + 1
seq2[j] = self.vocab[w]
seq1 = text.text_to_word_sequence(pair[1])
seq2 = text.text_to_word_sequence(pair[2])
self.train_data[i] = [pair[0], seq1, seq2]
return len(self.vocab)
return len(self.tokenizer.word_index)

def preprocess(self, seq1, seq2, use_stop_word=False):
overlap = set(seq1).intersection(seq2)
jaccard = len(overlap) / (len(seq1) + len(seq2) - len(overlap))
# stop_word = set(seq1).union(seq2).difference(overlap) if not use_stop_word else None
# keyword_seq1 = self.remove_stop_word(seq1, stop_word)
# keyword_seq2 = self.remove_stop_word(seq2, stop_word)
inverse_pairs, keyword_seq1, keyword_seq2 = self.compute_inverse_pairs(seq1, seq2, overlap)
return len(seq1), len(seq2), keyword_seq1, keyword_seq2, jaccard, inverse_pairs

@@ -96,7 +85,7 @@ def remove_stop_word(self, seq, stop_word=None):
for word in seq:
if word not in stop_list:
s.append(word)
return s + [0] * (self.mksl - len(s)) if len(s) <= self.mksl else s[:self.mksl]
return [0] * (self.mksl - len(s)) + s if len(s) <= self.mksl else s[:self.mksl]

def compute_inverse_pairs(self, seq1, seq2, overlap):
look_up = {}
@@ -115,8 +104,8 @@ def compute_inverse_pairs(self, seq1, seq2, overlap):
if new_seq2[j] < i + 1:
result -= 1
return result, \
new_seq1 + [0] * (self.mksl - len(new_seq1)) if len(new_seq1) <= self.mksl else new_seq1[:self.mksl], \
new_seq2 + [0] * (self.mksl - len(new_seq2)) if len(new_seq2) <= self.mksl else new_seq2[:self.mksl]
[0] * (self.mksl - len(new_seq1)) + new_seq1 if len(new_seq1) <= self.mksl else new_seq1[:self.mksl], \
[0] * (self.mksl - len(new_seq2)) + new_seq2 if len(new_seq2) <= self.mksl else new_seq2[:self.mksl]

def split_dataset(self, test_size):
train = {'mag': None, 'aminer': None, 'keyword_mag': None, 'keyword_aminer': None, 'jaccard': None,
@@ -127,7 +116,7 @@ def split_dataset(self, test_size):
'inverse'], train['labels'], test['labels'] = train_test_split(self.mag, self.aminer, self.keyword_mag,
self.keyword_aminer, self.jaccard,
self.inverse_pairs, self.labels,
test_size=test_size)
test_size=test_size, random_state=37)
return DataLoader(self.batch_size, train), DataLoader(self.batch_size, test)

def __len__(self):
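For context, the new data loader drops the hand-rolled vocabulary and instead unpickles a pre-fitted Keras Tokenizer from data/venues/tokenizer, then converts venue strings with texts_to_sequences and pads them with pad_sequences. Below is a minimal, self-contained sketch of that flow; it is not part of the commit, and the corpus strings and output path are made-up placeholders.

import _pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

corpus = [
    "international conference on machine learning",
    "proceedings of the international conference on data mining",
]

# Fit a tokenizer on the raw venue strings and persist it, so that
# training and inference share the same word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
with open("tokenizer", "wb") as f:
    _pickle.dump(tokenizer, f)

# Later (as in DataLoader.__init__): reload the tokenizer, map strings to
# integer id sequences, and left-pad/truncate them to a fixed length.
with open("tokenizer", "rb") as f:
    tokenizer = _pickle.load(f)
sequences = tokenizer.texts_to_sequences(corpus)
padded = pad_sequences(sequences, maxlen=10)  # shape (2, 10), zero-padded on the left
print(padded)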
65 changes: 0 additions & 65 deletions core/rnn/models.py
100644 → 100755
@@ -51,68 +51,3 @@ def BiLSTM(vocab_size, max_sequence_length, max_key_sequence_length, embedding_s
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=["accuracy"])
model.summary()
return model


"""
class BiLSTM(nn.Module):
def __init__(self, vocab_size, max_sequence_length, batch_size=32, embedding_size=128, hidden_size=32, dropout=0.2,
multiple=16):
super(BiLSTM, self).__init__()
self.vocab_size = vocab_size
self.msl = max_sequence_length
self.multiple = multiple
# embedding layer
self.embed_seq = nn.Embedding(self.vocab_size + 1, embedding_size)
self.embed_keyword_seq = nn.Embedding(self.vocab_size + 1, embedding_size)
print(type(self.embed_seq.weight))
# LSTM layer
self.lstm_seq1 = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, dropout=dropout)
self.lstm_seq2 = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, dropout=dropout)
self.lstm_key_seq1 = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, dropout=dropout)
self.lstm_key_seq2 = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, dropout=dropout)
# dense layer & normalization
# self.normalization = nn.BatchNorm1d()
self.output = nn.Sequential(
nn.Linear(6 * hidden_size + 3 * multiple, 64), #
nn.Linear(64, 16),
nn.BatchNorm1d(16),
nn.Linear(16, 2),
nn.Softmax()
)
def forward(self, mag, aminer, jaccard, keyword_mag, keyword_aminer, inverse):
mag = self.embed_seq(mag)
aminer = self.embed_seq(aminer)
keyword_mag = self.embed_keyword_seq(keyword_mag)
keyword_aminer = self.embed_keyword_seq(keyword_aminer)
jaccard = jaccard.repeat(1, 2 * self.multiple)
inverse = inverse.repeat(1, self.multiple)
mag, _ = self.lstm_seq1(mag)
mag, _ = self.lstm_seq2(mag)
aminer, _ = self.lstm_seq1(aminer)
aminer, _ = self.lstm_seq2(aminer)
keyword_mag, _ = self.lstm_key_seq1(keyword_mag)
keyword_mag, _ = self.lstm_key_seq2(keyword_mag)
keyword_aminer, _ = self.lstm_key_seq1(keyword_aminer)
keyword_aminer, _ = self.lstm_key_seq2(keyword_aminer)
minus = keyword_mag[:, -1, :] - keyword_aminer[:, -1, :]
minus_key = mag[:, -1, :] - aminer[:, -1, :]
concat_input = torch.cat(
(minus,
minus_key,
jaccard,
inverse,
mag[:, -1, :],
aminer[:, -1, :],
keyword_mag[:, -1, :],
keyword_aminer[:, -1, :],
), dim=1)
output = self.output(concat_input)
# output = self.sigmoid(output)
return output
"""
46 changes: 1 addition & 45 deletions core/rnn/train.py
100644 → 100755
@@ -22,7 +22,7 @@
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='rnn', help="models used")
parser.add_argument('--no-cuda', action='store_true', default=False, help='Disables CUDA training.')
parser.add_argument('--seed', type=int, default=42, help='Random seed.')
parser.add_argument('--seed', type=int, default=37, help='Random seed.')
parser.add_argument('--epochs', type=int, default=30, help='Number of epochs to train.')
parser.add_argument('--lr', type=float, default=5e-2, help='Initial learning rate.')
parser.add_argument('--weight-decay', type=float, default=1e-3,
@@ -91,50 +91,6 @@ def on_epoch_end(self, epoch, logs={}):
return


"""
def evaluate(loader, model, thr=None, return_best_thr=False, args=args):
model.eval()
total = 0.
loss = 0.
y_true, y_pred, y_score = [], [], []
for ibatch, batch in enumerate(loader):
labels = batch[-1]
if args.cuda:
batch = [data.cuda() for data in batch]
output = model(batch[0], batch[1], batch[2], batch[3], batch[4], batch[5])
y_true += labels.data.tolist()
y_pred += output.max(1)[1].data.tolist()
y_score += output[:, 1].data.tolist()
total += len(labels)
model.train()
if thr is not None:
logger.info("using threshold %.4f", thr)
y_score = np.array(y_score)
y_pred = np.zeros_like(y_score)
y_pred[y_score > thr] = 1
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
auc = roc_auc_score(y_true, y_score)
logger.info("loss: %.4f AUC: %.4f Prec: %.4f Rec: %.4f F1: %.4f",
loss / total, auc, prec, rec, f1)
if return_best_thr: # valid
precs, recs, thrs = precision_recall_curve(y_true, y_score)
f1s = 2 * precs * recs / (precs + recs)
f1s = f1s[:-1]
thrs = thrs[~np.isnan(f1s)]
f1s = f1s[~np.isnan(f1s)]
best_thr = thrs[np.argmax(f1s)]
logger.info("best threshold=%4f, f1=%.4f", best_thr, np.max(f1s))
return best_thr
else:
return None
"""


def train(train_loader, test_loader, model, args=args):
model_chechpoint = ModelCheckpoint(join(join(settings.OUT_DIR, 'rnn-model'), 'model.h5'), save_best_only=True,
save_weights_only=False)
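The train() function above saves the best model through a Keras ModelCheckpoint callback. A hedged sketch of how such a callback is typically passed to model.fit() follows; the toy model, random data, and the 'model.h5' path are placeholders, not the project's actual configuration.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

x = np.random.rand(128, 10).astype("float32")
y = np.random.randint(0, 2, size=(128,))

model = Sequential([Dense(16, activation="relu", input_shape=(10,)),
                    Dense(1, activation="sigmoid")])
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

# save_best_only keeps only the epoch with the best monitored metric
# (val_loss by default); save_weights_only=False stores the full model.
checkpoint = ModelCheckpoint("model.h5", save_best_only=True, save_weights_only=False)
model.fit(x, y, validation_split=0.2, epochs=5, batch_size=32, callbacks=[checkpoint])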
3 changes: 3 additions & 0 deletions data/README.md
@@ -0,0 +1,3 @@
### Dataset

The dataset can be downloaded from [OneDrive](https://mailstsinghuaeducn-my.sharepoint.com/:u:/g/personal/zfj17_mails_tsinghua_edu_cn/ES2s-PhyDeREs1zk0qdnA08BhzBZRSzrzKCqGAjEvdGBVQ?e=6U3bOd), [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/1141adb4aac240d7a49d/?dl=1), or [BaiduPan](https://pan.baidu.com/s/1ZkIs89yy9TrDMssZ3ceeVw) (password: gzpp). Unzip the file and put the _data_ directory into the project directory.
Binary file added data/venues/tokenizer

