From af949de23024c5eca4e73beceb355a5cd26cd0c2 Mon Sep 17 00:00:00 2001
From: Yoon Kim
Date: Mon, 8 Apr 2019 20:55:11 -0400
Subject: [PATCH] Update README (paper link, timing, results); simplify TreeLSTM; clean up comments

---
 README.md     | 18 +++++++++---------
 data.py       |  2 ++
 eval_ppl.py   |  3 ++-
 models.py     | 21 ++++-----------------
 pred-test.txt |  0
 train.py      |  3 ++-
 train_lm.py   |  2 +-
 7 files changed, 20 insertions(+), 29 deletions(-)
 create mode 100644 pred-test.txt

diff --git a/README.md b/README.md
index 2dfb005..e03f29e 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Unsupervised Recurrent Neural Network Grammars
 
 This is an implementation of the paper:
-[Unsupervised Recurrent Neural Network Grammars](https://arxiv.org/pdf/1804.0000.pdf)
+[Unsupervised Recurrent Neural Network Grammars](https://arxiv.org/abs/1904.03746)
 Yoon Kim, Alexander Rush, Lei Yu, Adhiguna Kuncoro, Chris Dyer, Gabor Melis
 NAACL 2019
 
@@ -29,7 +29,7 @@ python train.py --train_file data/ptb-train.pkl --val_file data/ptb-val.pkl --sa
 ```
 where `save_path` is where you want to save the model, and `gpu 0` is for using the first GPU in the cluster (the mapping from PyTorch GPU index to your cluster's GPU index may vary).
 
-Training should take 2 to 4 days depending on your setup.
+Training should take 2 to 3 days depending on your setup.
 
 To train the RNNG:
 ```
@@ -41,7 +41,7 @@ For fine-tuning:
 ```
 python train.py --train_from rnng.pt --train_file data/ptb-train.pkl --val_file data/ptb-val.pkl --save_path rnng-urnng.pt --mode unsupervised --lr 0.1 --train_q_epochs 10 --epochs 10
---gpu 0 --kl_warmup 0
+--min_epochs 6 --gpu 0 --kl_warmup 0
 ```
 
 To train the LM:
 ```
@@ -58,7 +58,7 @@ python eval_ppl.py --model_file urnng.pt --test_file data/ptb-test.pkl --samples
 ```
 The argument `samples` is for the number of importance weighted samples, and `is_temp` is for flattening the inference network's distribution (footnote 14 in the paper).
 
-The same evalulation code will work for RNNG.
+The same evaluation code will work for RNNG.
 
 For LM evaluation:
 ```
@@ -85,13 +85,13 @@ punctuation and evaluate on unlabeled F1.
 Note that some of the details regarding the preprocessing are slightly different
 from the original paper. In particular, in this implementation we replace singleton
 words with a single `<unk>` token instead of using the Berkeley parser's mapping
 rules. This results in slightly lower perplexity
-for all models, since the vocabulary size is smaller. Here are the results I get
+for all models, since the vocabulary size is smaller. Here are the perplexity numbers I get
 in this setting:
-- RNNLM: 89.2
-- RNNG: 83.7
-- URNNG: 85.1 (F1: 38.4)
-
+- RNNLM: 89.2
+- RNNG: 83.7
+- URNNG: 85.1 (F1: 38.4)
+- RNNG --> URNNG: 82.5
 
 ## Acknowledgements
 Some of our preprocessing and evaluation code is based on the following repositories:
diff --git a/data.py b/data.py
index 4199fde..9ea154f 100755
--- a/data.py
+++ b/data.py
@@ -35,6 +35,8 @@ def __getitem__(self, idx):
     binary_tree = [d[3] for d in other_data]
     spans = [d[5] for d in other_data]
     batch_size = self.batch_size[idx].item()
+    # by default, we return sents with <s> and </s> tokens
+    # hence we subtract 2 from length, as these are (by default) not counted for evaluation
     data_batch = [sents[:, :length], length-2, batch_size, actions, spans, binary_tree, other_data]
     return data_batch
 
diff --git a/eval_ppl.py b/eval_ppl.py
index 66ffe39..b09af24 100755
--- a/eval_ppl.py
+++ b/eval_ppl.py
@@ -28,7 +28,7 @@
 parser.add_argument('--test_file', default='data/ptb-test.pkl')
 parser.add_argument('--model_file', default='')
 parser.add_argument('--is_temp', default=2., type=float, help='divide scores by is_temp before CRF')
-parser.add_argument('--samples', default=1000, type=int, help='samples for IWAE calculation')
+parser.add_argument('--samples', default=1000, type=int, help='samples for IS calculation')
 parser.add_argument('--count_eos_ppl', default=0, type=int, help='whether to count eos in val PPL')
 parser.add_argument('--gpu', default=2, type=int, help='which gpu to use')
 parser.add_argument('--seed', default=3435, type=int)
@@ -57,6 +57,7 @@ def main(args):
   for i in list(reversed(range(len(data)))):
     sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = data[i]
     if length == 1:
+      # length-1 sents are ignored since our generative model requires sents of length >= 2
      continue
     if args.count_eos_ppl == 1:
      length += 1
diff --git a/models.py b/models.py
index 30de169..207fc4a 100644
--- a/models.py
+++ b/models.py
@@ -50,7 +50,7 @@ def __init__(self, i_dim = 200,
                h_dim = 0,
                num_layers = 1,
                dropout = 0):
-    super(SeqLSTM, self).__init__()    
+    super(SeqLSTM, self).__init__()
     self.i_dim = i_dim
     self.h_dim = h_dim
     self.num_layers = num_layers
@@ -60,8 +60,6 @@ def __init__(self, i_dim = 200,
     self.dropout_layer = nn.Dropout(dropout)
 
   def forward(self, x, prev_h = None):
-    #x = b x i_dim
-    #prev_h = [(h_l, c_l) for l layers]
     if prev_h is None:
       prev_h = [(x.new(x.size(0), self.h_dim).fill_(0),
                  x.new(x.size(0), self.h_dim).fill_(0)) for _ in range(self.num_layers)]
@@ -79,17 +77,12 @@
     return curr_h
 
 class TreeLSTM(nn.Module):
-  def __init__(self, dim = 200,
-               e_dim = 0,
-               dropout = 0):
+  def __init__(self, dim = 200):
     super(TreeLSTM, self).__init__()
     self.dim = dim
-    self.e_dim = e_dim
-    self.linear = nn.Linear(dim*2 + e_dim, dim*5)
+    self.linear = nn.Linear(dim*2, dim*5)
 
   def forward(self, x1, x2, e=None):
-    #x = (h, c). h, c = b x dim. hidden/cell states of children
-    #e = b x e_dim. external information vector
     if not isinstance(x1, tuple):
       x1 = (x1, None)
     h1, c1 = x1
@@ -102,13 +95,7 @@ def forward(self, x1, x2, e=None):
     if c1 is None:
       c1 = torch.zeros_like(h1)
     if c2 is None:
       c2 = torch.zeros_like(h2)
-    if self.e_dim == 0:
-      concat = torch.cat([h1, h2], 1)
-    else:
-      if e is None:
-        concat = torch.cat([h1, h2, torch.zeros_like(h1)], 1)
-      else:
-        concat = torch.cat([h1, h2, e], 1)
+    concat = torch.cat([h1, h2], 1)
     all_sum = self.linear(concat)
     i, f1, f2, o, g = all_sum.split(self.dim, 1)
diff --git a/pred-test.txt b/pred-test.txt
new file mode 100644
index 0000000..e69de29
diff --git a/train.py b/train.py
index bc55e40..914e006 100755
--- a/train.py
+++ b/train.py
@@ -138,6 +138,7 @@ def main(args):
       kl_pen = min(1., kl_pen + kl_warmup_batch)
       sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = train_data[i]
       if length == 1:
+        # we ignore length-1 sents during training/eval since we work with binary trees only
        continue
       sents = sents.cuda()
       b += 1
@@ -250,7 +251,7 @@ def eval(data, model, samples = 0, count_eos_ppl = 0):
   with torch.no_grad():
     for i in list(reversed(range(len(data)))):
       sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = data[i]
-      if length == 1:
+      if length == 1: # length-1 sents are ignored since URNNG needs sents of length >= 2
        continue
       if args.count_eos_ppl == 1:
        tree_length = length
diff --git a/train_lm.py b/train_lm.py
index b65a49e..7c6dc60 100755
--- a/train_lm.py
+++ b/train_lm.py
@@ -147,7 +147,7 @@ def eval(data, model, count_eos_ppl = 0):
   with torch.no_grad():
     for i in list(reversed(range(len(data)))):
       sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = data[i]
-      if length == 1:
+      if length == 1: # we ignore length-1 sents in URNNG eval, so do the same for the LM
        continue
       if args.count_eos_ppl == 1:
        length += 1
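
A note on the models.py hunks above: after this patch, `TreeLSTM` is a plain binary composition with no external input vector. The sketch below shows what the patched class amounts to. The `__init__` and the concat/split mirror the hunks; the gate update after the split is not visible in the patch, so the standard binary tree-LSTM equations are assumed here, and the tuple/`None` handling in `forward` is elided.

```python
import torch
import torch.nn as nn

class TreeLSTM(nn.Module):
    """Binary TreeLSTM composition (sketch of the post-patch class)."""
    def __init__(self, dim=200):
        super(TreeLSTM, self).__init__()
        self.dim = dim
        # one projection of both children's hidden states to the 5 gates
        self.linear = nn.Linear(dim * 2, dim * 5)

    def forward(self, x1, x2):
        # x1, x2: (h, c) pairs for the left/right children, each of shape (b, dim)
        h1, c1 = x1
        h2, c2 = x2
        concat = torch.cat([h1, h2], 1)  # (b, 2*dim), as in the patched forward
        i, f1, f2, o, g = self.linear(concat).split(self.dim, 1)
        # standard binary tree-LSTM cell update (assumed; not shown in the hunks)
        c = torch.sigmoid(f1) * c1 + torch.sigmoid(f2) * c2 \
            + torch.sigmoid(i) * torch.tanh(g)
        h = torch.sigmoid(o) * torch.tanh(c)
        return (h, c)

# quick shape check: compose two child states into a parent state
tree = TreeLSTM(dim=200)
left = (torch.randn(4, 200), torch.randn(4, 200))
right = (torch.randn(4, 200), torch.randn(4, 200))
h, c = tree(left, right)  # each (4, 200)
```

Dropping `e_dim` means the linear layer always sees exactly `2*dim` inputs, which is why the whole `e is None` branching could be deleted in the second models.py hunk.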
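On the eval_ppl.py change: per the README, `--samples` sets the number K of importance-weighted samples used to estimate each sentence's log marginal likelihood, with trees sampled from the inference network (optionally flattened by `--is_temp`). Below is a minimal sketch of that estimator; `log_marginal_is` and its argument names are illustrative only, not the repo's API.

```python
import math
import torch

def log_marginal_is(log_joint, log_q):
    # log_joint: (K,) log p(x, z_k) under the generative model
    # log_q:     (K,) log q(z_k | x) under the inference network,
    #            for K trees z_k sampled from q
    # returns a stable estimate of log (1/K) * sum_k p(x, z_k) / q(z_k | x)
    log_w = log_joint - log_q  # log importance weights
    return torch.logsumexp(log_w, dim=0) - math.log(log_w.size(0))
```

Perplexity then comes from exponentiating the negative of these per-sentence estimates summed over the corpus and divided by the token count, which is why data.py subtracts the two `<s>`/`</s>` tokens from `length` unless `--count_eos_ppl` is set.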