diff --git a/core/rnn/data_loader.py b/core/rnn/data_loader.py
old mode 100644
new mode 100755
index 652a1d2..7341d38
--- a/core/rnn/data_loader.py
+++ b/core/rnn/data_loader.py
@@ -6,10 +6,14 @@
 import json
 import codecs
 from sklearn.model_selection import train_test_split
+import _pickle
 
 from core.utils import data_utils
 from core.utils import settings
+from keras.preprocessing import text
+from keras.preprocessing.sequence import pad_sequences
+
 
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')  # include timestamp
@@ -24,7 +28,7 @@ def __init__(self, file_dir, seed, shuffle, max_sequence_length, max_key_sequenc
         self.msl = max_sequence_length
         self.mksl = max_key_sequence_length
         self.train_data = json.load(codecs.open(join(settings.VENUE_DATA_DIR, 'train.txt'), 'r', 'utf-8'))
-        self.vocab = {}
+        self.tokenizer = _pickle.load(open(join(settings.DATA_DIR, 'venues', "tokenizer"), "rb"))
         self.vocab_size = self.split_and_tokenize()
         self.stop_list = []
         self.batch_size = batch_size
@@ -40,53 +44,38 @@ def __init__(self, file_dir, seed, shuffle, max_sequence_length, max_key_sequenc
         self.length_aminer = []
         self.jaccard = []
         self.inverse_pairs = []
-        for pair in self.train_data:
-            len_mag, len_aminer, keyword_mag, keyword_aminer, jaccard, inverse_pairs = self.preprocess(pair[1], pair[2])
-            pair[1] = pair[1] + [0] * (self.msl - len(pair[1])) if len(pair[1]) <= self.msl else pair[1][:self.msl]
-            pair[2] = pair[2] + [0] * (self.msl - len(pair[2])) if len(pair[2]) <= self.msl else pair[2][:self.msl]
+        self.mag = self.tokenizer.texts_to_sequences([p[1] for p in self.train_data])
+        self.aminer = self.tokenizer.texts_to_sequences([p[2] for p in self.train_data])
+        for i, pair in enumerate(self.train_data):
+            len_mag, len_aminer, keyword_mag, keyword_aminer, jaccard, inverse_pairs = self.preprocess(self.mag[i], self.aminer[i])
             self.labels.append(pair[0])
-            self.mag.append(pair[1])
-            self.aminer.append(pair[2])
             self.length_mag.append(len_mag)
             self.length_aminer.append(len_aminer)
             self.keyword_mag.append(keyword_mag)
             self.keyword_aminer.append(keyword_aminer)
             self.jaccard.append([np.float32(jaccard)] * (multiple * 2))
             self.inverse_pairs.append([np.float32(inverse_pairs)] * multiple)
+        self.mag = pad_sequences(self.mag, maxlen=self.msl)
+        self.aminer = pad_sequences(self.aminer, maxlen=self.msl)
         self.labels, self.mag, self.aminer, self.length_mag, self.length_aminer, self.keyword_mag, self.keyword_aminer, self.jaccard, self.inverse_pairs = np.array(
             self.labels), np.array(self.mag), np.array(self.aminer), np.array(self.length_mag), np.array(
             self.length_aminer), np.array(self.keyword_mag), np.array(self.keyword_aminer), np.array(
             self.jaccard), np.array(self.inverse_pairs)
         logger.info('training pairs loaded')
-        if shuffle:
-            self.mag, self.aminer, self.labels = sklearn.utils.shuffle(self.mag, self.aminer, self.labels,
-                                                                       random_state=seed)
         self.n_pairs = len(self.labels)
         logger.info('all pairs count %d', self.n_pairs)
 
     def split_and_tokenize(self):
         for i, pair in enumerate(self.train_data.copy()):
-            seq1 = pair[1].split(' ')
-            seq2 = pair[2].split(' ')
-            pass
-            for j, w in enumerate(seq1.copy()):
-                if w not in self.vocab:
-                    self.vocab[w] = len(self.vocab) + 1
-                seq1[j] = self.vocab[w]
-            for j, w in enumerate(seq2.copy()):
-                if w not in self.vocab:
-                    self.vocab[w] = len(self.vocab) + 1
-                seq2[j] = self.vocab[w]
+            seq1 = text.text_to_word_sequence(pair[1])
+            seq2 = text.text_to_word_sequence(pair[2])
             self.train_data[i] = [pair[0], seq1, seq2]
-        return len(self.vocab)
+        return len(self.tokenizer.word_index)
 
     def preprocess(self, seq1, seq2, use_stop_word=False):
         overlap = set(seq1).intersection(seq2)
         jaccard = len(overlap) / (len(seq1) + len(seq2) - len(overlap))
-        # stop_word = set(seq1).union(seq2).difference(overlap) if not use_stop_word else None
-        # keyword_seq1 = self.remove_stop_word(seq1, stop_word)
-        # keyword_seq2 = self.remove_stop_word(seq2, stop_word)
         inverse_pairs, keyword_seq1, keyword_seq2 = self.compute_inverse_pairs(seq1, seq2, overlap)
         return len(seq1), len(seq2), keyword_seq1, keyword_seq2, jaccard, inverse_pairs
 
@@ -96,7 +85,7 @@ def remove_stop_word(self, seq, stop_word=None):
         for word in seq:
             if word not in stop_list:
                 s.append(word)
-        return s + [0] * (self.mksl - len(s)) if len(s) <= self.mksl else s[:self.mksl]
+        return [0] * (self.mksl - len(s)) + s if len(s) <= self.mksl else s[:self.mksl]
 
     def compute_inverse_pairs(self, seq1, seq2, overlap):
         look_up = {}
@@ -115,8 +104,8 @@ def compute_inverse_pairs(self, seq1, seq2, overlap):
             if new_seq2[j] < i + 1:
                 result -= 1
         return result, \
-            new_seq1 + [0] * (self.mksl - len(new_seq1)) if len(new_seq1) <= self.mksl else new_seq1[:self.mksl], \
-            new_seq2 + [0] * (self.mksl - len(new_seq2)) if len(new_seq2) <= self.mksl else new_seq2[:self.mksl]
+            [0] * (self.mksl - len(new_seq1)) + new_seq1 if len(new_seq1) <= self.mksl else new_seq1[:self.mksl], \
+            [0] * (self.mksl - len(new_seq2)) + new_seq2 if len(new_seq2) <= self.mksl else new_seq2[:self.mksl]
 
     def split_dataset(self, test_size):
         train = {'mag': None, 'aminer': None, 'keyword_mag': None, 'keyword_aminer': None, 'jaccard': None,
@@ -127,7 +116,7 @@ def split_dataset(self, test_size):
             'inverse'], train['labels'], test['labels'] = train_test_split(self.mag, self.aminer, self.keyword_mag,
                                                                            self.keyword_aminer, self.jaccard,
                                                                            self.inverse_pairs, self.labels,
-                                                                           test_size=test_size)
+                                                                           test_size=test_size, random_state=37)
         return DataLoader(self.batch_size, train), DataLoader(self.batch_size, test)
 
     def __len__(self):
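Note on the padding changes in this file: `pad_sequences` pads on the left by default, and the hand-rolled keyword padding in `remove_stop_word` and `compute_inverse_pairs` is flipped from post- to pre-padding to match. A standalone sketch of the difference (illustrative, not part of the patch):

```python
# Standalone sketch: pad_sequences defaults to padding='pre', so the real
# tokens end up at the tail of each row, which is exactly the set of
# timesteps an LSTM's final hidden state sees. The patch flips the manual
# keyword padding ([0] * k + s instead of s + [0] * k) for the same reason.
from keras.preprocessing.sequence import pad_sequences

seqs = [[5, 8, 2]]
print(pad_sequences(seqs, maxlen=5))                  # [[0 0 5 8 2]]
print(pad_sequences(seqs, maxlen=5, padding='post'))  # [[5 8 2 0 0]]
```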
diff --git a/core/rnn/models.py b/core/rnn/models.py
old mode 100644
new mode 100755
index b9c102d..b7c175a
--- a/core/rnn/models.py
+++ b/core/rnn/models.py
@@ -51,68 +51,3 @@ def BiLSTM(vocab_size, max_sequence_length, max_key_sequence_length, embedding_s
     model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=["accuracy"])
     model.summary()
     return model
-
-
-"""
-class BiLSTM(nn.Module):
-    def __init__(self, vocab_size, max_sequence_length, batch_size=32, embedding_size=128, hidden_size=32, dropout=0.2,
-                 multiple=16):
-        super(BiLSTM, self).__init__()
-        self.vocab_size = vocab_size
-        self.msl = max_sequence_length
-        self.multiple = multiple
-
-        # embedding layer
-        self.embed_seq = nn.Embedding(self.vocab_size + 1, embedding_size)
-        self.embed_keyword_seq = nn.Embedding(self.vocab_size + 1, embedding_size)
-        print(type(self.embed_seq.weight))
-
-        # LSTM layer
-        self.lstm_seq1 = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, dropout=dropout)
-        self.lstm_seq2 = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, dropout=dropout)
-
-        self.lstm_key_seq1 = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, dropout=dropout)
-        self.lstm_key_seq2 = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, dropout=dropout)
-
-        # dense layer & normalization
-        # self.normalization = nn.BatchNorm1d()
-        self.output = nn.Sequential(
-            nn.Linear(6 * hidden_size + 3 * multiple, 64),  #
-            nn.Linear(64, 16),
-            nn.BatchNorm1d(16),
-            nn.Linear(16, 2),
-            nn.Softmax()
-        )
-
-    def forward(self, mag, aminer, jaccard, keyword_mag, keyword_aminer, inverse):
-        mag = self.embed_seq(mag)
-        aminer = self.embed_seq(aminer)
-        keyword_mag = self.embed_keyword_seq(keyword_mag)
-        keyword_aminer = self.embed_keyword_seq(keyword_aminer)
-        jaccard = jaccard.repeat(1, 2 * self.multiple)
-        inverse = inverse.repeat(1, self.multiple)
-        mag, _ = self.lstm_seq1(mag)
-        mag, _ = self.lstm_seq2(mag)
-        aminer, _ = self.lstm_seq1(aminer)
-        aminer, _ = self.lstm_seq2(aminer)
-        keyword_mag, _ = self.lstm_key_seq1(keyword_mag)
-        keyword_mag, _ = self.lstm_key_seq2(keyword_mag)
-        keyword_aminer, _ = self.lstm_key_seq1(keyword_aminer)
-        keyword_aminer, _ = self.lstm_key_seq2(keyword_aminer)
-        minus = keyword_mag[:, -1, :] - keyword_aminer[:, -1, :]
-        minus_key = mag[:, -1, :] - aminer[:, -1, :]
-        concat_input = torch.cat(
-            (minus,
-             minus_key,
-             jaccard,
-             inverse,
-             mag[:, -1, :],
-             aminer[:, -1, :],
-             keyword_mag[:, -1, :],
-             keyword_aminer[:, -1, :],
-             ), dim=1)
-
-        output = self.output(concat_input)
-        # output = self.sigmoid(output)
-        return output
-"""
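For reference, the comment block deleted above documented the intended architecture: a shared embedding and shared stacked LSTMs over both venue-name sequences (plus keyword branches), concatenated with the jaccard and inverse-pair side features into a small dense head. Below is a hedged Keras outline of that shape, without the keyword branches and with illustrative layer sizes; it is a sketch of the idea, not the model `models.py` actually builds:

```python
# Sketch under stated assumptions: a minimal two-input matcher mirroring the
# deleted torch class. The sizes (128, 32, 64) and the Bidirectional wrapper
# are illustrative choices, not values taken from models.py.
from keras.layers import LSTM, Bidirectional, Concatenate, Dense, Embedding, Input
from keras.models import Model


def sketch_matcher(vocab_size, msl, multiple=16):
    mag = Input(shape=(msl,), dtype='int32')
    aminer = Input(shape=(msl,), dtype='int32')
    jaccard = Input(shape=(2 * multiple,))
    inverse = Input(shape=(multiple,))
    embed = Embedding(vocab_size + 1, 128)  # shared by both inputs, as in the torch version
    encode = Bidirectional(LSTM(32))        # shared encoder, final state only
    merged = Concatenate()([encode(embed(mag)), encode(embed(aminer)), jaccard, inverse])
    out = Dense(1, activation='sigmoid')(Dense(64, activation='relu')(merged))
    model = Model([mag, aminer, jaccard, inverse], out)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return model
```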
diff --git a/core/rnn/train.py b/core/rnn/train.py
old mode 100644
new mode 100755
index c96cf26..9cbe167
--- a/core/rnn/train.py
+++ b/core/rnn/train.py
@@ -22,7 +22,7 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('--model', type=str, default='rnn', help="models used")
 parser.add_argument('--no-cuda', action='store_true', default=False, help='Disables CUDA training.')
-parser.add_argument('--seed', type=int, default=42, help='Random seed.')
+parser.add_argument('--seed', type=int, default=37, help='Random seed.')
 parser.add_argument('--epochs', type=int, default=30, help='Number of epochs to train.')
 parser.add_argument('--lr', type=float, default=5e-2, help='Initial learning rate.')
 parser.add_argument('--weight-decay', type=float, default=1e-3,
@@ -91,50 +91,6 @@ def on_epoch_end(self, epoch, logs={}):
         return
 
 
-"""
-def evaluate(loader, model, thr=None, return_best_thr=False, args=args):
-    model.eval()
-    total = 0.
-    loss = 0.
-    y_true, y_pred, y_score = [], [], []
-
-    for ibatch, batch in enumerate(loader):
-        labels = batch[-1]
-        if args.cuda:
-            batch = [data.cuda() for data in batch]
-        output = model(batch[0], batch[1], batch[2], batch[3], batch[4], batch[5])
-        y_true += labels.data.tolist()
-        y_pred += output.max(1)[1].data.tolist()
-        y_score += output[:, 1].data.tolist()
-        total += len(labels)
-
-    model.train()
-
-    if thr is not None:
-        logger.info("using threshold %.4f", thr)
-        y_score = np.array(y_score)
-        y_pred = np.zeros_like(y_score)
-        y_pred[y_score > thr] = 1
-
-    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
-    auc = roc_auc_score(y_true, y_score)
-    logger.info("loss: %.4f AUC: %.4f Prec: %.4f Rec: %.4f F1: %.4f",
-                loss / total, auc, prec, rec, f1)
-
-    if return_best_thr:  # valid
-        precs, recs, thrs = precision_recall_curve(y_true, y_score)
-        f1s = 2 * precs * recs / (precs + recs)
-        f1s = f1s[:-1]
-        thrs = thrs[~np.isnan(f1s)]
-        f1s = f1s[~np.isnan(f1s)]
-        best_thr = thrs[np.argmax(f1s)]
-        logger.info("best threshold=%4f, f1=%.4f", best_thr, np.max(f1s))
-        return best_thr
-    else:
-        return None
-"""
-
-
 def train(train_loader, test_loader, model, args=args):
     model_chechpoint = ModelCheckpoint(join(join(settings.OUT_DIR, 'rnn-model'), 'model.h5'), save_best_only=True,
                                        save_weights_only=False)
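One piece of the deleted `evaluate` helper is worth keeping on record: choosing the decision threshold that maximizes F1 on a validation set. The same logic, extracted as a standalone function:

```python
# Threshold selection from the removed torch-era evaluate(), preserved as a
# reference. Given validation labels and scores, it returns the threshold
# with the best F1 (the last precision/recall point has no threshold, hence
# the [:-1] slice).
import numpy as np
from sklearn.metrics import precision_recall_curve


def best_f1_threshold(y_true, y_score):
    precs, recs, thrs = precision_recall_curve(y_true, y_score)
    f1s = (2 * precs * recs / (precs + recs))[:-1]
    thrs = thrs[~np.isnan(f1s)]
    f1s = f1s[~np.isnan(f1s)]
    return thrs[np.argmax(f1s)], np.max(f1s)
```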
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..9cc8c24
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,3 @@
+### Dataset
+
+The dataset can be downloaded from [OneDrive](https://mailstsinghuaeducn-my.sharepoint.com/:u:/g/personal/zfj17_mails_tsinghua_edu_cn/ES2s-PhyDeREs1zk0qdnA08BhzBZRSzrzKCqGAjEvdGBVQ?e=6U3bOd), [Tsinghua Cloud](https://cloud.tsinghua.edu.cn/f/1141adb4aac240d7a49d/?dl=1), or [BaiduPan](https://pan.baidu.com/s/1ZkIs89yy9TrDMssZ3ceeVw) (password: gzpp). Unzip the file and put the _data_ directory into the project directory.
\ No newline at end of file
diff --git a/data/venues/tokenizer b/data/venues/tokenizer
new file mode 100644
index 0000000..7806a36
Binary files /dev/null and b/data/venues/tokenizer differ
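The `data/venues/tokenizer` blob added above is an opaque pickle. A sketch of how a compatible file could be regenerated; the fitting corpus (both name columns of train.txt) is an assumption inferred from how data_loader.py calls `texts_to_sequences`:

```python
# Assumption, not code from the repo: rebuild the pickled Keras Tokenizer
# that data_loader.py loads from data/venues/tokenizer, fitting it on both
# venue-name columns of train.txt.
import _pickle
import codecs
import json
from os.path import join

from keras.preprocessing.text import Tokenizer

from core.utils import settings

train_data = json.load(codecs.open(join(settings.VENUE_DATA_DIR, 'train.txt'), 'r', 'utf-8'))
tokenizer = Tokenizer()
tokenizer.fit_on_texts([p[1] for p in train_data] + [p[2] for p in train_data])
with open(join(settings.DATA_DIR, 'venues', 'tokenizer'), 'wb') as f:
    _pickle.dump(tokenizer, f)
```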