From af949de23024c5eca4e73beceb355a5cd26cd0c2 Mon Sep 17 00:00:00 2001
From: Yoon Kim
Date: Mon, 8 Apr 2019 20:55:11 -0400
Subject: [PATCH] Update README (paper link, timing, results); simplify TreeLSTM; clean up comments

---
 README.md     | 18 +++++++++---------
 data.py       |  2 ++
 eval_ppl.py   |  3 ++-
 models.py     | 21 ++++-----------------
 pred-test.txt |  0
 train.py      |  3 ++-
 train_lm.py   |  2 +-
 7 files changed, 20 insertions(+), 29 deletions(-)
 create mode 100644 pred-test.txt

diff --git a/README.md b/README.md
index 2dfb005..e03f29e 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Unsupervised Recurrent Neural Network Grammars
 
 This is an implementation of the paper:
-[Unsupervised Recurrent Neural Network Grammars](https://arxiv.org/pdf/1804.0000.pdf)
+[Unsupervised Recurrent Neural Network Grammars](https://arxiv.org/abs/1904.03746)
 Yoon Kim, Alexander Rush, Lei Yu, Adhiguna Kuncoro, Chris Dyer, Gabor Melis
 NAACL 2019
 
@@ -29,7 +29,7 @@ python train.py --train_file data/ptb-train.pkl --val_file data/ptb-val.pkl --sa
 ```
 where `save_path` is where you want to save the model, and `gpu 0` is for using the first GPU in the cluster (the mapping from PyTorch GPU index to your cluster's GPU index may vary).
 
-Training should take 2 to 4 days depending on your setup.
+Training should take 2 to 3 days depending on your setup.
 
 To train the RNNG:
 ```
@@ -41,7 +41,7 @@ For fine-tuning:
 ```
 python train.py --train_from rnng.pt --train_file data/ptb-train.pkl --val_file data/ptb-val.pkl --save_path rnng-urnng.pt --mode unsupervised --lr 0.1 --train_q_epochs 10 --epochs 10
---gpu 0 --kl_warmup 0
+--min_epochs 6 --gpu 0 --kl_warmup 0
 ```
 
 To train the LM:
 ```
@@ -58,7 +58,7 @@ python eval_ppl.py --model_file urnng.pt --test_file data/ptb-test.pkl --samples
 ```
 The argument `samples` is for the number of importance weighted samples, and `is_temp` is for flattening the inference network's distribution (footnote 14 in the paper).
 
-The same evalulation code will work for RNNG.
+The same evaluation code will work for RNNG.
 
 For LM evaluation:
 ```
@@ -85,13 +85,13 @@ punctuation and evaluate on unlabeled F1.
 Note that some of the details regarding the preprocessing are slightly different
 from the original paper. In particular, in this implementation we replace singleton
 words with a single `<unk>` token instead of using the Berkeley parser's mapping
 rules. This results in slightly lower perplexity
-for all models, since the vocabulary size is smaller. Here are the results I get
+for all models, since the vocabulary size is smaller. Here are the perplexity numbers I get
 in this setting:
-- RNNLM: 89.2
-- RNNG: 83.7
-- URNNG: 85.1 (F1: 38.4)
-
+- RNNLM: 89.2
+- RNNG: 83.7
+- URNNG: 85.1 (F1: 38.4)
+- RNNG --> URNNG: 82.5
 
 ## Acknowledgements
 Some of our preprocessing and evaluation code is based on the following repositories:
diff --git a/data.py b/data.py
index 4199fde..9ea154f 100755
--- a/data.py
+++ b/data.py
@@ -35,6 +35,8 @@ def __getitem__(self, idx):
     binary_tree = [d[3] for d in other_data]
     spans = [d[5] for d in other_data]
     batch_size = self.batch_size[idx].item()
+    # by default, we return sents with <s> and </s> tokens
+    # hence we subtract 2 from length, as these are (by default) not counted for evaluation
     data_batch = [sents[:, :length], length-2, batch_size, actions, spans, binary_tree, other_data]
     return data_batch
 
diff --git a/eval_ppl.py b/eval_ppl.py
index 66ffe39..b09af24 100755
--- a/eval_ppl.py
+++ b/eval_ppl.py
@@ -28,7 +28,7 @@
 parser.add_argument('--test_file', default='data/ptb-test.pkl')
 parser.add_argument('--model_file', default='')
 parser.add_argument('--is_temp', default=2., type=float, help='divide scores by is_temp before CRF')
-parser.add_argument('--samples', default=1000, type=int, help='samples for IWAE calculation')
+parser.add_argument('--samples', default=1000, type=int, help='samples for IS calculation')
 parser.add_argument('--count_eos_ppl', default=0, type=int, help='whether to count eos in val PPL')
 parser.add_argument('--gpu', default=2, type=int, help='which gpu to use')
 parser.add_argument('--seed', default=3435, type=int)
@@ -57,6 +57,7 @@ def main(args):
   for i in list(reversed(range(len(data)))):
     sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = data[i]
     if length == 1:
+      # length-1 sents are ignored since our generative model requires sents of length >= 2
      continue
     if args.count_eos_ppl == 1:
      length += 1
diff --git a/models.py b/models.py
index 30de169..207fc4a 100644
--- a/models.py
+++ b/models.py
@@ -50,7 +50,7 @@ def __init__(self, i_dim = 200,
                h_dim = 0,
                num_layers = 1,
                dropout = 0):
-    super(SeqLSTM, self).__init__()    
+    super(SeqLSTM, self).__init__()
     self.i_dim = i_dim
     self.h_dim = h_dim
     self.num_layers = num_layers
@@ -60,8 +60,6 @@ def __init__(self, i_dim = 200,
     self.dropout_layer = nn.Dropout(dropout)
 
   def forward(self, x, prev_h = None):
-    #x = b x i_dim
-    #prev_h = [(h_l, c_l) for l layers]
     if prev_h is None:
       prev_h = [(x.new(x.size(0), self.h_dim).fill_(0),
                  x.new(x.size(0), self.h_dim).fill_(0)) for _ in range(self.num_layers)]
@@ -79,17 +77,12 @@
     return curr_h
 
 class TreeLSTM(nn.Module):
-  def __init__(self, dim = 200,
-               e_dim = 0,
-               dropout = 0):
+  def __init__(self, dim = 200):
     super(TreeLSTM, self).__init__()
     self.dim = dim
-    self.e_dim = e_dim
-    self.linear = nn.Linear(dim*2 + e_dim, dim*5)
+    self.linear = nn.Linear(dim*2, dim*5)
 
   def forward(self, x1, x2, e=None):
-    #x = (h, c). h, c = b x dim. hidden/cell states of children
-    #e = b x e_dim. external information vector
     if not isinstance(x1, tuple):
       x1 = (x1, None)
     h1, c1 = x1
@@ -102,13 +95,7 @@ def forward(self, x1, x2, e=None):
     if c1 is None:
       c1 = torch.zeros_like(h1)
     if c2 is None:
       c2 = torch.zeros_like(h2)
-    if self.e_dim == 0:
-      concat = torch.cat([h1, h2], 1)
-    else:
-      if e is None:
-        concat = torch.cat([h1, h2, torch.zeros_like(h1)], 1)
-      else:
-        concat = torch.cat([h1, h2, e], 1)
+    concat = torch.cat([h1, h2], 1)
     all_sum = self.linear(concat)
     i, f1, f2, o, g = all_sum.split(self.dim, 1)
diff --git a/pred-test.txt b/pred-test.txt
new file mode 100644
index 0000000..e69de29
diff --git a/train.py b/train.py
index bc55e40..914e006 100755
--- a/train.py
+++ b/train.py
@@ -138,6 +138,7 @@ def main(args):
       kl_pen = min(1., kl_pen + kl_warmup_batch)
       sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = train_data[i]
       if length == 1:
+        # we ignore length-1 sents during training/eval since we work with binary trees only
        continue
       sents = sents.cuda()
       b += 1
@@ -250,7 +251,7 @@ def eval(data, model, samples = 0, count_eos_ppl = 0):
   with torch.no_grad():
     for i in list(reversed(range(len(data)))):
       sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = data[i]
-      if length == 1:
+      if length == 1: # length-1 sents are ignored since URNNG needs sents of length >= 2
        continue
       if args.count_eos_ppl == 1:
        tree_length = length
diff --git a/train_lm.py b/train_lm.py
index b65a49e..7c6dc60 100755
--- a/train_lm.py
+++ b/train_lm.py
@@ -147,7 +147,7 @@ def eval(data, model, count_eos_ppl = 0):
   with torch.no_grad():
     for i in list(reversed(range(len(data)))):
       sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = data[i]
-      if length == 1:
+      if length == 1: # we ignore length-1 sents in URNNG eval, so do the same for the LM
        continue
       if args.count_eos_ppl == 1:
        length += 1
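
A note on the models.py hunks above: after this patch, `TreeLSTM` is a plain binary composition with no external input vector. The sketch below shows what the patched class amounts to. The `__init__` and the concat/split mirror the hunks; the gate update after the split is not visible in the patch, so the standard binary tree-LSTM equations are assumed here, and the tuple/`None` handling in `forward` is elided.

```python
import torch
import torch.nn as nn

class TreeLSTM(nn.Module):
    """Binary TreeLSTM composition (sketch of the post-patch class)."""
    def __init__(self, dim=200):
        super(TreeLSTM, self).__init__()
        self.dim = dim
        # one projection of both children's hidden states to the 5 gates
        self.linear = nn.Linear(dim * 2, dim * 5)

    def forward(self, x1, x2):
        # x1, x2: (h, c) pairs for the left/right children, each of shape (b, dim)
        h1, c1 = x1
        h2, c2 = x2
        concat = torch.cat([h1, h2], 1)  # (b, 2*dim), as in the patched forward
        i, f1, f2, o, g = self.linear(concat).split(self.dim, 1)
        # standard binary tree-LSTM cell update (assumed; not shown in the hunks)
        c = torch.sigmoid(f1) * c1 + torch.sigmoid(f2) * c2 \
            + torch.sigmoid(i) * torch.tanh(g)
        h = torch.sigmoid(o) * torch.tanh(c)
        return (h, c)

# quick shape check: compose two child states into a parent state
tree = TreeLSTM(dim=200)
left = (torch.randn(4, 200), torch.randn(4, 200))
right = (torch.randn(4, 200), torch.randn(4, 200))
h, c = tree(left, right)  # each (4, 200)
```

Dropping `e_dim` means the linear layer always sees exactly `2*dim` inputs, which is why the whole `e is None` branching could be deleted in the second models.py hunk.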
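On the eval_ppl.py change: per the README, `--samples` sets the number K of importance-weighted samples used to estimate each sentence's log marginal likelihood, with trees sampled from the inference network (optionally flattened by `--is_temp`). Below is a minimal sketch of that estimator; `log_marginal_is` and its argument names are illustrative only, not the repo's API.

```python
import math
import torch

def log_marginal_is(log_joint, log_q):
    # log_joint: (K,) log p(x, z_k) under the generative model
    # log_q:     (K,) log q(z_k | x) under the inference network,
    #            for K trees z_k sampled from q
    # returns a stable estimate of log (1/K) * sum_k p(x, z_k) / q(z_k | x)
    log_w = log_joint - log_q  # log importance weights
    return torch.logsumexp(log_w, dim=0) - math.log(log_w.size(0))
```

Perplexity then comes from exponentiating the negative of these per-sentence estimates summed over the corpus and divided by the token count, which is why data.py subtracts the two `<s>`/`</s>` tokens from `length` unless `--count_eos_ppl` is set.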