Commit

Speed up CRF-layer decoding (加速crf层解码速度)
dqqcasia committed Apr 26, 2020
1 parent da6631c commit b659726
Showing 74 changed files with 1,008 additions and 324 deletions.
106 changes: 13 additions & 93 deletions pytorch_version/README.md
100644 → 100755
@@ -1,82 +1,33 @@
# CLUE_NER_PyTorch

CLUENER: Fine-Grained Named Entity Recognition

## Dataset description
### Dataset description

Detailed dataset description: https://www.cluebenchmarks.com/introduce.html

## Repository layout

```text
├── callback              # custom callbacks
| └── lr_scheduler.py
| └── ...
├── losses                # custom loss functions
| └── focal_loss.py
| └── ...
├── CLUEdatasets          # data storage
| └── cluener
├── metrics               # metric computation
| └── ner_metrics.py
├── outputs               # saved model outputs
| └── cluener_output
├── prev_trained_model    # pretrained models
| └── albert_base
| └── bert-wwm
| └── ...
├── processors            # data processing
| └── ner_seq.py
| └── ...
├── tools                 # utility scripts
| └── common.py
| └── download_clue_data.py
| └── ...
├── models                # main models
| └── transformers
| └── bert_for_ner.py
| └── ...
├── run_ner_span.py       # main entry point
├── run_ner_span.sh       # task launcher script
```
## Dependencies

* pytorch=1.1.0
* boto3=1.9
* regex
* sacremoses
* sentencepiece
* python3.7+

## Usage

### 1. Download the CLUE_NER dataset by running:
```python
python tools/download_clue_data.py --data_dir=./CLUEdatasets --tasks=cluener
### Usage
1. Download the CLUE_NER dataset by running:
```shell
python tools/download_clue_data.py --data_dir=./datasets --tasks=cluener
```
### 2. Pretrained model file layout, for example:
2. Pretrained model file layout, for example:
```text
├── prev_trained_model # pretrained models
| └── bert-base
| | └── vocab.txt
| | └── config.json
| | └── pytorch_model.bin
```
### 3. Training:
3. Training:

Run the corresponding shell script directly, e.g.:
```shell
sh run_ner_span.sh
sh scripts/run_ner_crf.sh
```

### 4. Prediction
4. Prediction

By default the last checkpoint is used as the prediction model; you can also set the --predict_checkpoints argument to predict with a specific checkpoint, for example:

```shell
CURRENT_DIR=`pwd`
export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base
export GLUE_DIR=$CURRENT_DIR/CLUEdatasets
export CLUE_DIR=$CURRENT_DIR/datasets
export OUTPUR_DIR=$CURRENT_DIR/outputs
TASK_NAME="cluener"

@@ -87,45 +38,14 @@ python run_ner_span.py \
--do_predict \
--predict_checkpoints=100 \
--do_lower_case \
--loss_type=ce \
 ...
```
### Model list

model_type currently supports **bert** and **albert**.

**Note**: models such as bert, ernie, bert_wwm, and bert_wwm_ext differ only in their weights, not in the underlying architecture, so use model_type=bert for all of them; the same applies to the other model families.

### Input tagging scheme

The default tagging scheme is BIOS, for example:

```text
美 B-LOC
国 I-LOC
的 O
华 B-PER
莱 I-PER
士 I-PER
我 O
跟 O
他 O
谈 O
笑 O
风 O
生 O
```
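When these tags are consumed downstream, entities are recovered by grouping B-/I-/S- runs. A minimal sketch of such a decoder (the function name and return format are illustrative, not the repo's ner_metrics implementation):

```python
def bios_to_spans(tags):
    """Group a BIOS tag sequence into (label, start, end) entity spans.

    Illustrative helper: 'B-X' starts a multi-token entity, 'I-X'
    continues it, and 'S-X' marks a single-token entity.
    """
    spans, start, label = [], None, None
    for i, tag in enumerate(tags):
        if start is not None and not tag.startswith("I-"):
            spans.append((label, start, i - 1))  # close the open entity
            start, label = None, None
        if tag.startswith("B-"):
            start, label = i, tag[2:]
        elif tag.startswith("S-"):
            spans.append((tag[2:], i, i))
    if start is not None:
        spans.append((label, start, len(tags) - 1))
    return spans

# ['B-LOC','I-LOC','O','B-PER','I-PER','I-PER'] -> [('LOC', 0, 1), ('PER', 3, 5)]
```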

## Results

Results of each model on the **dev** set:
**Note:** models such as bert, ernie, bert_wwm, and bert_wwm_ext differ only in their weights, not in the underlying architecture, so use model_type=bert for all of them; the same applies to the other model families.

| Model | Accuracy (entity) | Recall (entity) | F1 score (entity) | Settings |
| ------------ | ------------------ | ------------------ | ------------------ | ------------------------------------------------------------ |
| BERT+Softmax | 0.7916 | 0.7962 | 0.7939 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 |
| BERT+CRF | 0.7877 | 0.8008 | 0.7942 | train_max_length=128 eval_max_length=512 epoch=5 lr=3e-5 batch_size=24 |
| BERT+Span | 0.8132 | **0.8092** | **0.8112** | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 |
| BERT+Span+focal_loss | 0.8121 | 0.8008 | 0.8064 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 loss_type=focal |
| BERT+Span+label_smoothing | **0.8235** | 0.7946 | 0.8088 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 loss_type=lsr |
### Results

F1 score on the dev set: 0.8076.
Empty file modified pytorch_version/__init__.py
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions pytorch_version/callback/__init__.py
@@ -0,0 +1,2 @@


9 changes: 0 additions & 9 deletions pytorch_version/callback/lr_scheduler.py
100644 → 100755
@@ -4,20 +4,11 @@
from torch.optim.optimizer import Optimizer
from torch.optim.lr_scheduler import LambdaLR

__all__ = ['CustomDecayLR',
'BertLR',
'CyclicLR',
'ReduceLROnPlateau',
'ReduceLRWDOnPlateau',
'CosineLRWithRestarts',
]

def get_constant_schedule(optimizer, last_epoch=-1):
""" Create a schedule with a constant learning rate.
"""
return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)


def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
""" Create a schedule with a constant learning rate preceded by a warmup
period during which the learning rate increases linearly between 0 and 1.
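For context, these factory functions return standard LambdaLR schedulers. A minimal usage sketch of get_constant_schedule_with_warmup as shown above (the toy optimizer and step counts are placeholders, not values from this repo):

```python
import torch

# Placeholder parameter and hyperparameters, purely for illustration.
params = [torch.nn.Parameter(torch.zeros(2, 2))]
optimizer = torch.optim.Adam(params, lr=3e-5)

# LR ramps linearly from 0 to the base LR over num_warmup_steps,
# then stays constant for the rest of training.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=100)

for step in range(1000):
    optimizer.step()   # update the weights first...
    scheduler.step()   # ...then advance the schedule by one step
```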
2 changes: 2 additions & 0 deletions pytorch_version/callback/optimizater/__init__.py
@@ -0,0 +1,2 @@


Empty file modified pytorch_version/callback/optimizater/adamw.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/callback/optimizater/lamb.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/callback/optimizater/lars.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion pytorch_version/callback/optimizater/lookahead.py
100644 → 100755
@@ -1,5 +1,5 @@
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import Optimizer
from collections import defaultdict

class Lookahead(Optimizer):
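The change above only sources Optimizer from its canonical module; usage is unchanged. A minimal sketch of wrapping a base optimizer with Lookahead (the k/alpha argument names follow common Lookahead implementations and are assumptions about this one):

```python
import torch
from callback.optimizater.lookahead import Lookahead  # repo path as-is

params = [torch.nn.Parameter(torch.zeros(2, 2))]  # placeholder parameters
base_optimizer = torch.optim.Adam(params, lr=3e-5)

# Lookahead keeps a copy of "slow" weights and pulls them toward the
# fast weights every k steps: slow += alpha * (fast - slow).
optimizer = Lookahead(base_optimizer, k=5, alpha=0.5)

loss = (params[0] ** 2).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```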
Empty file modified pytorch_version/callback/optimizater/nadam.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/callback/optimizater/novograd.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/callback/optimizater/planradam.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/callback/optimizater/radam.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/callback/optimizater/ralamb.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/callback/progressbar.py
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions pytorch_version/datasets/cluener/__init__.py
@@ -0,0 +1,2 @@


Empty file modified pytorch_version/losses/__init__.py
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions pytorch_version/metrics/__init__.py
@@ -0,0 +1,2 @@


Empty file modified pytorch_version/metrics/ner_metrics.py
100644 → 100755
Empty file.
Empty file modified pytorch_version/models/__init__.py
100644 → 100755
Empty file.
25 changes: 11 additions & 14 deletions pytorch_version/models/albert_for_ner.py
100644 → 100755
@@ -1,10 +1,10 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .crf import CRF
from .layers.crf import CRF
from .transformers.modeling_albert import AlbertPreTrainedModel
from .transformers.modeling_albert import AlbertModel
from .linears import PoolerEndLogits, PoolerStartLogits
from .layers.linears import PoolerEndLogits, PoolerStartLogits
from torch.nn import CrossEntropyLoss
from losses.focal_loss import FocalLoss
from losses.label_smoothing import LabelSmoothingCrossEntropy
@@ -21,11 +21,8 @@ def __init__(self, config):

def forward(self, input_ids, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, labels=None):
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids,
position_ids=position_ids,head_mask=head_mask)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
@@ -50,23 +47,23 @@ def forward(self, input_ids, attention_mask=None, token_type_ids=None,
return outputs # (loss), scores, (hidden_states), (attentions)

class AlbertCrfForNer(AlbertPreTrainedModel):
def __init__(self, config, label2id, device):
def __init__(self, config):
super(AlbertCrfForNer, self).__init__(config)
self.bert = AlbertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, len(label2id))
self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.crf = CRF(num_tags=config.num_labels, batch_first=True)
self.init_weights()

def forward(self, input_ids, token_type_ids=None, attention_mask=None,labels=None,input_lens=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask)
outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
outputs = (logits,)
if labels is not None:
loss = self.crf.calculate_loss(logits, tag_list=labels, lengths=input_lens)
outputs =(loss,)+outputs
loss = self.crf(emissions = logits, tags=labels, mask=attention_mask)
outputs =(-1*loss,)+outputs
return outputs # (loss), scores

class AlbertSpanForNer(AlbertPreTrainedModel):
@@ -85,7 +82,7 @@ def __init__(self, config,):
self.init_weights()

def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,end_positions=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask)
outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
start_logits = self.start_fc(sequence_output)
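The substantive change in this file: the old CRF was built from a tag dictionary and scored via calculate_loss, while the new layers.crf.CRF exposes a pytorch-crf-style interface whose forward() returns a log-likelihood (hence the `-1*loss` above) and whose decode() runs Viterbi over a whole batch at once, which is where the decoding speedup comes from. A minimal sketch of that interface, using torchcrf as a stand-in for the repo's models/layers/crf.py (shapes and values are illustrative):

```python
import torch
from torchcrf import CRF  # assumed equivalent of models.layers.crf.CRF

num_tags, batch_size, seq_len = 5, 2, 7
crf = CRF(num_tags=num_tags, batch_first=True)

emissions = torch.randn(batch_size, seq_len, num_tags)     # classifier logits
tags = torch.randint(num_tags, (batch_size, seq_len))      # gold label ids
mask = torch.ones(batch_size, seq_len, dtype=torch.uint8)  # attention mask

# forward() scores the gold path: it returns a log-likelihood, so the
# training loss is its negation.
log_likelihood = crf(emissions=emissions, tags=tags, mask=mask)
loss = -1 * log_likelihood

# decode() performs batched Viterbi decoding in one call.
best_paths = crf.decode(emissions, mask=mask)  # list of tag-id sequences
```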
24 changes: 10 additions & 14 deletions pytorch_version/models/bert_for_ner.py
100644 → 100755
@@ -1,10 +1,10 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .crf import CRF
from .layers.crf import CRF
from .transformers.modeling_bert import BertPreTrainedModel
from .transformers.modeling_bert import BertModel
from .linears import PoolerEndLogits, PoolerStartLogits
from .layers.linears import PoolerEndLogits, PoolerStartLogits
from torch.nn import CrossEntropyLoss
from losses.focal_loss import FocalLoss
from losses.label_smoothing import LabelSmoothingCrossEntropy
@@ -21,11 +21,7 @@ def __init__(self, config):

def forward(self, input_ids, attention_mask=None, token_type_ids=None,
position_ids=None, head_mask=None, labels=None):
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask)
outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
@@ -50,23 +46,23 @@ def forward(self, input_ids, attention_mask=None, token_type_ids=None,
return outputs # (loss), scores, (hidden_states), (attentions)

class BertCrfForNer(BertPreTrainedModel):
def __init__(self, config, label2id, device):
def __init__(self, config):
super(BertCrfForNer, self).__init__(config)
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, len(label2id))
self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.crf = CRF(num_tags=config.num_labels, batch_first=True)
self.init_weights()

def forward(self, input_ids, token_type_ids=None, attention_mask=None,labels=None,input_lens=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask)
outputs =self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
outputs = (logits,)
if labels is not None:
loss = self.crf.calculate_loss(logits, tag_list=labels, lengths=input_lens)
outputs =(loss,)+outputs
loss = self.crf(emissions = logits, tags=labels, mask=attention_mask)
outputs =(-1*loss,)+outputs
return outputs # (loss), scores

class BertSpanForNer(BertPreTrainedModel):
@@ -85,7 +81,7 @@ def __init__(self, config,):
self.init_weights()

def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,end_positions=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask)
outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
start_logits = self.start_fc(sequence_output)
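BertCrfForNer receives the identical treatment as the Albert variant above. At prediction time, the batched decode replaces per-sequence Viterbi; an illustrative sketch of the call pattern (the helper and its arguments are placeholders, not the repo's evaluate loop):

```python
import torch

def predict_tags(model, input_ids, attention_mask, token_type_ids=None):
    """Batched CRF prediction for a model like BertCrfForNer above."""
    model.eval()
    with torch.no_grad():
        # Without labels, forward() returns (logits,) per the code above.
        logits = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)[0]
        # One Viterbi pass over the whole batch -- the CRF decoding
        # speedup this commit's title refers to.
        return model.crf.decode(logits, mask=attention_mask.byte())
```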
2 changes: 2 additions & 0 deletions pytorch_version/models/layers/__init__.py
@@ -0,0 +1,2 @@


