From b6597268c000e06aa95bcdc59ef122805254cab6 Mon Sep 17 00:00:00 2001 From: dongqianqian Date: Sun, 26 Apr 2020 10:03:39 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E9=80=9Fcrf=E5=B1=82=E8=A7=A3?= =?UTF-8?q?=E7=A0=81=E9=80=9F=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pytorch_version/README.md | 106 +---- pytorch_version/__init__.py | 0 pytorch_version/callback/__init__.py | 2 + pytorch_version/callback/lr_scheduler.py | 9 - .../callback/optimizater/__init__.py | 2 + pytorch_version/callback/optimizater/adamw.py | 0 pytorch_version/callback/optimizater/lamb.py | 0 pytorch_version/callback/optimizater/lars.py | 0 .../callback/optimizater/lookahead.py | 2 +- pytorch_version/callback/optimizater/nadam.py | 0 .../callback/optimizater/novograd.py | 0 .../callback/optimizater/planradam.py | 0 pytorch_version/callback/optimizater/radam.py | 0 .../callback/optimizater/ralamb.py | 0 pytorch_version/callback/progressbar.py | 0 pytorch_version/datasets/cluener/__init__.py | 2 + pytorch_version/losses/__init__.py | 0 pytorch_version/metrics/__init__.py | 2 + pytorch_version/metrics/ner_metrics.py | 0 pytorch_version/models/__init__.py | 0 pytorch_version/models/albert_for_ner.py | 25 +- pytorch_version/models/bert_for_ner.py | 24 +- pytorch_version/models/layers/__init__.py | 2 + pytorch_version/models/layers/crf.py | 411 ++++++++++++++++++ pytorch_version/models/layers/linears.py | 40 ++ .../models/transformers/__init__.py | 0 .../models/transformers/__main__.py | 0 .../transformers/configuration_albert.py | 0 .../models/transformers/configuration_auto.py | 0 .../models/transformers/configuration_bert.py | 0 .../models/transformers/configuration_ctrl.py | 0 .../transformers/configuration_distilbert.py | 0 .../models/transformers/configuration_gpt2.py | 0 .../transformers/configuration_openai.py | 0 .../transformers/configuration_roberta.py | 0 .../transformers/configuration_transfo_xl.py | 0 .../transformers/configuration_utils.py | 0 .../models/transformers/configuration_xlm.py | 0 .../transformers/configuration_xlnet.py | 0 .../models/transformers/file_utils.py | 0 .../models/transformers/modeling_albert.py | 0 .../transformers/modeling_albert_bright.py | 0 .../models/transformers/modeling_auto.py | 0 .../models/transformers/modeling_bert.py | 0 .../models/transformers/modeling_ctrl.py | 0 .../transformers/modeling_distilbert.py | 0 .../models/transformers/modeling_gpt2.py | 0 .../models/transformers/modeling_openai.py | 0 .../models/transformers/modeling_roberta.py | 0 .../transformers/modeling_transfo_xl.py | 0 .../modeling_transfo_xl_utilities.py | 0 .../models/transformers/modeling_utils.py | 0 .../models/transformers/modeling_xlm.py | 0 .../models/transformers/modeling_xlnet.py | 0 .../transformers/tokenization_albert.py | 0 .../models/transformers/tokenization_auto.py | 0 .../models/transformers/tokenization_bert.py | 0 .../models/transformers/tokenization_ctrl.py | 0 .../transformers/tokenization_distilbert.py | 0 .../models/transformers/tokenization_gpt2.py | 0 .../models/transformers/tokenization_utils.py | 78 ++-- .../outputs/cluener_output/__init__.py | 2 + .../prev_trained_model/bert-base/.gitignore | 104 +++++ .../prev_trained_model/bert-base/__init__.py | 0 pytorch_version/processors/__init__.py | 0 pytorch_version/processors/ner_seq.py | 52 ++- pytorch_version/processors/utils_ner.py | 0 pytorch_version/run_ner_crf.py | 273 +++++------- pytorch_version/scripts/run_ner_crf.sh | 26 ++ pytorch_version/tools/__init__.py | 0 
pytorch_version/tools/common.py | 0 pytorch_version/tools/download_clue_data.py | 6 +- pytorch_version/tools/finetuning_argparse.py | 96 ++++ pytorch_version/tools/plot.py | 68 +++ 74 files changed, 1008 insertions(+), 324 deletions(-) mode change 100644 => 100755 pytorch_version/README.md mode change 100644 => 100755 pytorch_version/__init__.py create mode 100755 pytorch_version/callback/__init__.py mode change 100644 => 100755 pytorch_version/callback/lr_scheduler.py create mode 100755 pytorch_version/callback/optimizater/__init__.py mode change 100644 => 100755 pytorch_version/callback/optimizater/adamw.py mode change 100644 => 100755 pytorch_version/callback/optimizater/lamb.py mode change 100644 => 100755 pytorch_version/callback/optimizater/lars.py mode change 100644 => 100755 pytorch_version/callback/optimizater/lookahead.py mode change 100644 => 100755 pytorch_version/callback/optimizater/nadam.py mode change 100644 => 100755 pytorch_version/callback/optimizater/novograd.py mode change 100644 => 100755 pytorch_version/callback/optimizater/planradam.py mode change 100644 => 100755 pytorch_version/callback/optimizater/radam.py mode change 100644 => 100755 pytorch_version/callback/optimizater/ralamb.py mode change 100644 => 100755 pytorch_version/callback/progressbar.py create mode 100755 pytorch_version/datasets/cluener/__init__.py mode change 100644 => 100755 pytorch_version/losses/__init__.py create mode 100755 pytorch_version/metrics/__init__.py mode change 100644 => 100755 pytorch_version/metrics/ner_metrics.py mode change 100644 => 100755 pytorch_version/models/__init__.py mode change 100644 => 100755 pytorch_version/models/albert_for_ner.py mode change 100644 => 100755 pytorch_version/models/bert_for_ner.py create mode 100755 pytorch_version/models/layers/__init__.py create mode 100755 pytorch_version/models/layers/crf.py create mode 100755 pytorch_version/models/layers/linears.py mode change 100644 => 100755 pytorch_version/models/transformers/__init__.py mode change 100644 => 100755 pytorch_version/models/transformers/__main__.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_albert.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_auto.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_bert.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_ctrl.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_distilbert.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_gpt2.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_openai.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_roberta.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_transfo_xl.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_utils.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_xlm.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_xlnet.py mode change 100644 => 100755 pytorch_version/models/transformers/file_utils.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_albert.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_albert_bright.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_auto.py mode change 100644 => 100755 
pytorch_version/models/transformers/modeling_bert.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_ctrl.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_distilbert.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_gpt2.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_openai.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_roberta.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_transfo_xl.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_transfo_xl_utilities.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_utils.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_xlm.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_xlnet.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_albert.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_auto.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_bert.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_ctrl.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_distilbert.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_gpt2.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_utils.py create mode 100755 pytorch_version/outputs/cluener_output/__init__.py create mode 100755 pytorch_version/prev_trained_model/bert-base/.gitignore mode change 100644 => 100755 pytorch_version/prev_trained_model/bert-base/__init__.py mode change 100644 => 100755 pytorch_version/processors/__init__.py mode change 100644 => 100755 pytorch_version/processors/ner_seq.py mode change 100644 => 100755 pytorch_version/processors/utils_ner.py mode change 100644 => 100755 pytorch_version/run_ner_crf.py create mode 100755 pytorch_version/scripts/run_ner_crf.sh mode change 100644 => 100755 pytorch_version/tools/__init__.py mode change 100644 => 100755 pytorch_version/tools/common.py mode change 100644 => 100755 pytorch_version/tools/download_clue_data.py create mode 100755 pytorch_version/tools/finetuning_argparse.py create mode 100755 pytorch_version/tools/plot.py diff --git a/pytorch_version/README.md b/pytorch_version/README.md old mode 100644 new mode 100755 index 983a1e7c..436502f0 --- a/pytorch_version/README.md +++ b/pytorch_version/README.md @@ -1,60 +1,13 @@ -# CLUE_NER_PyTorch - -CLUENER fine-grained named entity recognition - -## Dataset description +### Dataset description Detailed data description: https://www.cluebenchmarks.com/introduce.html -## Code directory layout - -```text -├── callback # custom callbacks -| └── lr_scheduler.py -| └── ... -├── losses # custom loss functions -| └── focal_loss.py -| └── ... -├── CLUEdatasets # data storage -| └── cluener -├── metrics # metric computation -| └── ner_metrics.py -├── outputs # saved model outputs -| └── cluener_output -├── prev_trained_model # pretrained models -| └── albert_base -| └── bert-wwm -| └── ... -├── processors # data processing -| └── ner_seq.py -| └── ... -├── tools # common scripts -| └── common.py -| └── download_clue_data.py -| └── ... -├── models # main models -| └── transformers -| └── bert_for_ner.py -| └── ... -├── run_ner_span.py # main entry point -├── run_ner_span.sh # task launch script -``` -## Requirements - -* pytorch=1.1.0 -* boto3=1.9 -* regex -* sacremoses -* sentencepiece -* python3.7+ - -## How to run - -### 1. Download the CLUE_NER dataset by running: -```python -python tools/download_clue_data.py --data_dir=./CLUEdatasets --tasks=cluener +### How to run +1. Download the CLUE_NER dataset by running: +```shell +python tools/download_clue_data.py --data_dir=./datasets --tasks=cluener ``` -### 2. Pretrained model file layout, for example: +2. Pretrained model file layout, for example: ```text ├── prev_trained_model # pretrained models | └── bert-base | | └── vocab.txt | | └── config.json | | └── pytorch_model.bin ``` -### 3. Training: +3. Training: Run the corresponding shell script directly, e.g.: ```shell -sh run_ner_span.sh +sh scripts/run_ner_crf.sh ``` - -### 4. Prediction +4. Prediction By default the last checkpoint is used as the prediction model; you can also set the --predict_checkpoints argument to predict with a specific checkpoint, for example: - ```python CURRENT_DIR=`pwd` export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base -export GLUE_DIR=$CURRENT_DIR/CLUEdatasets +export CLUE_DIR=$CURRENT_DIR/datasets export OUTPUR_DIR=$CURRENT_DIR/outputs TASK_NAME="cluener" @@ -87,45 +38,14 @@ python run_ner_span.py \ --do_predict \ --predict_checkpoints=100 \ --do_lower_case \ - --loss_type=ce \ ... ``` ### Model list model_type currently supports **bert** and **albert** -**Note**: bert, ernie, bert_wwm, bert_wwm_ext and similar models differ only in their weights, not in the underlying architecture, so model_type=bert covers them; the same applies to the other model types. - -### Input tagging scheme - -The default tagging scheme is BIOS, for example: - -```text -美 B-LOC -国 I-LOC -的 O -华 B-PER -莱 I-PER -士 I-PER - -我 O -跟 O -他 O -谈 O -笑 O -风 O -生 O - -``` - -## Results - -The following are test results on the **dev** set: +**Note:** bert, ernie, bert_wwm, bert_wwm_ext and similar models differ only in their weights, not in the underlying architecture, so model_type=bert covers them; the same applies to the other model types. -| | Accuracy (entity) | Recall (entity) | F1 score (entity) | | -| ------------ | ------------------ | ------------------ | ------------------ | ------------------------------------------------------------ | -| BERT+Softmax | 0.7916 | 0.7962 | 0.7939 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 | -| BERT+CRF | 0.7877 | 0.8008 | 0.7942 | train_max_length=128 eval_max_length=512 epoch=5 lr=3e-5 batch_size=24 | -| BERT+Span | 0.8132 | **0.8092** | **0.8112** | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 | -| BERT+Span+focal_loss | 0.8121 | 0.8008 | 0.8064 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 loss_type=focal | -| BERT+Span+label_smoothing | **0.8235** | 0.7946 | 0.8088 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 loss_type=lsr | +### Results +The F1 score on the dev set is 0.8076 \ No newline at end of file diff --git a/pytorch_version/__init__.py b/pytorch_version/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/__init__.py b/pytorch_version/callback/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/callback/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/callback/lr_scheduler.py b/pytorch_version/callback/lr_scheduler.py old mode 100644 new mode 100755 index 57543c34..2ee2ad3f --- a/pytorch_version/callback/lr_scheduler.py +++ b/pytorch_version/callback/lr_scheduler.py @@ -4,20 +4,11 @@ from torch.optim.optimizer import Optimizer from torch.optim.lr_scheduler import LambdaLR -__all__ = ['CustomDecayLR', - 'BertLR', - 'CyclicLR', - 'ReduceLROnPlateau', - 'ReduceLRWDOnPlateau', - 'CosineLRWithRestarts', - ] - def get_constant_schedule(optimizer, last_epoch=-1): """ Create a schedule with a constant learning rate.
""" return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) - def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and 1. diff --git a/pytorch_version/callback/optimizater/__init__.py b/pytorch_version/callback/optimizater/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/callback/optimizater/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/callback/optimizater/adamw.py b/pytorch_version/callback/optimizater/adamw.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/lamb.py b/pytorch_version/callback/optimizater/lamb.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/lars.py b/pytorch_version/callback/optimizater/lars.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/lookahead.py b/pytorch_version/callback/optimizater/lookahead.py old mode 100644 new mode 100755 index affb3fa0..cd0d7beb --- a/pytorch_version/callback/optimizater/lookahead.py +++ b/pytorch_version/callback/optimizater/lookahead.py @@ -1,5 +1,5 @@ import torch -from torch.optim import Optimizer +from torch.optim.optimizer import Optimizer from collections import defaultdict class Lookahead(Optimizer): diff --git a/pytorch_version/callback/optimizater/nadam.py b/pytorch_version/callback/optimizater/nadam.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/novograd.py b/pytorch_version/callback/optimizater/novograd.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/planradam.py b/pytorch_version/callback/optimizater/planradam.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/radam.py b/pytorch_version/callback/optimizater/radam.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/ralamb.py b/pytorch_version/callback/optimizater/ralamb.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/progressbar.py b/pytorch_version/callback/progressbar.py old mode 100644 new mode 100755 diff --git a/pytorch_version/datasets/cluener/__init__.py b/pytorch_version/datasets/cluener/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/datasets/cluener/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/losses/__init__.py b/pytorch_version/losses/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/metrics/__init__.py b/pytorch_version/metrics/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/metrics/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/metrics/ner_metrics.py b/pytorch_version/metrics/ner_metrics.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/__init__.py b/pytorch_version/models/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/albert_for_ner.py b/pytorch_version/models/albert_for_ner.py old mode 100644 new mode 100755 index 6fbd24b2..639a5591 --- a/pytorch_version/models/albert_for_ner.py +++ b/pytorch_version/models/albert_for_ner.py @@ -1,10 +1,10 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .crf import CRF +from .layers.crf import CRF from .transformers.modeling_albert import AlbertPreTrainedModel from .transformers.modeling_albert import 
AlbertModel -from .linears import PoolerEndLogits, PoolerStartLogits +from .layers.linears import PoolerEndLogits, PoolerStartLogits from torch.nn import CrossEntropyLoss from losses.focal_loss import FocalLoss from losses.label_smoothing import LabelSmoothingCrossEntropy @@ -21,11 +21,8 @@ def __init__(self, config): def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids, + position_ids=position_ids,head_mask=head_mask) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -50,23 +47,23 @@ def forward(self, input_ids, attention_mask=None, token_type_ids=None, return outputs # (loss), scores, (hidden_states), (attentions) class AlbertCrfForNer(AlbertPreTrainedModel): - def __init__(self, config, label2id, device): + def __init__(self, config): super(AlbertCrfForNer, self).__init__(config) self.bert = AlbertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, len(label2id)) - self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.crf = CRF(num_tags=config.num_labels, batch_first=True) self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None,labels=None,input_lens=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) if labels is not None: - loss = self.crf.calculate_loss(logits, tag_list=labels, lengths=input_lens) - outputs =(loss,)+outputs + loss = self.crf(emissions = logits, tags=labels, mask=attention_mask) + outputs =(-1*loss,)+outputs return outputs # (loss), scores class AlbertSpanForNer(AlbertPreTrainedModel): @@ -85,7 +82,7 @@ def __init__(self, config,): self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,end_positions=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) start_logits = self.start_fc(sequence_output) diff --git a/pytorch_version/models/bert_for_ner.py b/pytorch_version/models/bert_for_ner.py old mode 100644 new mode 100755 index e7cb44eb..2d867fce --- a/pytorch_version/models/bert_for_ner.py +++ b/pytorch_version/models/bert_for_ner.py @@ -1,10 +1,10 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .crf import CRF +from .layers.crf import CRF from .transformers.modeling_bert import BertPreTrainedModel from .transformers.modeling_bert import BertModel -from .linears import PoolerEndLogits, PoolerStartLogits +from .layers.linears import PoolerEndLogits, PoolerStartLogits from torch.nn import CrossEntropyLoss from losses.focal_loss import FocalLoss from losses.label_smoothing import LabelSmoothingCrossEntropy @@ -21,11 +21,7 @@ def 
__init__(self, config): def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -50,23 +46,23 @@ def forward(self, input_ids, attention_mask=None, token_type_ids=None, return outputs # (loss), scores, (hidden_states), (attentions) class BertCrfForNer(BertPreTrainedModel): - def __init__(self, config, label2id, device): + def __init__(self, config): super(BertCrfForNer, self).__init__(config) self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, len(label2id)) - self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.crf = CRF(num_tags=config.num_labels, batch_first=True) self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None,labels=None,input_lens=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs =self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) if labels is not None: - loss = self.crf.calculate_loss(logits, tag_list=labels, lengths=input_lens) - outputs =(loss,)+outputs + loss = self.crf(emissions = logits, tags=labels, mask=attention_mask) + outputs =(-1*loss,)+outputs return outputs # (loss), scores class BertSpanForNer(BertPreTrainedModel): @@ -85,7 +81,7 @@ def __init__(self, config,): self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,end_positions=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) start_logits = self.start_fc(sequence_output) diff --git a/pytorch_version/models/layers/__init__.py b/pytorch_version/models/layers/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/models/layers/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/models/layers/crf.py b/pytorch_version/models/layers/crf.py new file mode 100755 index 00000000..d8b3adcc --- /dev/null +++ b/pytorch_version/models/layers/crf.py @@ -0,0 +1,411 @@ +import torch +import torch.nn as nn +from typing import List, Optional + +class CRF(nn.Module): + """Conditional random field. + This module implements a conditional random field [LMP01]_. The forward computation + of this class computes the log likelihood of the given sequence of tags and + emission score tensor. This class also has `~CRF.decode` method which finds + the best tag sequence given an emission score tensor using `Viterbi algorithm`_. + Args: + num_tags: Number of tags. + batch_first: Whether the first dimension corresponds to the size of a minibatch. + Attributes: + start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size + ``(num_tags,)``. 
+ end_transitions (`~torch.nn.Parameter`): End transition score tensor of size + ``(num_tags,)``. + transitions (`~torch.nn.Parameter`): Transition score tensor of size + ``(num_tags, num_tags)``. + .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001). + "Conditional random fields: Probabilistic models for segmenting and + labeling sequence data". *Proc. 18th International Conf. on Machine + Learning*. Morgan Kaufmann. pp. 282–289. + .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm + """ + + def __init__(self, num_tags: int, batch_first: bool = False) -> None: + if num_tags <= 0: + raise ValueError(f'invalid number of tags: {num_tags}') + super().__init__() + self.num_tags = num_tags + self.batch_first = batch_first + self.start_transitions = nn.Parameter(torch.empty(num_tags)) + self.end_transitions = nn.Parameter(torch.empty(num_tags)) + self.transitions = nn.Parameter(torch.empty(num_tags, num_tags)) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """Initialize the transition parameters. + The parameters will be initialized randomly from a uniform distribution + between -0.1 and 0.1. + """ + nn.init.uniform_(self.start_transitions, -0.1, 0.1) + nn.init.uniform_(self.end_transitions, -0.1, 0.1) + nn.init.uniform_(self.transitions, -0.1, 0.1) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(num_tags={self.num_tags})' + + def forward(self, emissions: torch.Tensor, + tags: torch.LongTensor, + mask: Optional[torch.ByteTensor] = None, + reduction: str = 'mean') -> torch.Tensor: + """Compute the conditional log likelihood of a sequence of tags given emission scores. + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. + tags (`~torch.LongTensor`): Sequence of tags tensor of size + ``(seq_length, batch_size)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length)`` otherwise. + mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + reduction: Specifies the reduction to apply to the output: + ``none|sum|mean|token_mean``. ``none``: no reduction will be applied. + ``sum``: the output will be summed over batches. ``mean``: the output will be + averaged over batches. ``token_mean``: the output will be averaged over tokens. + Returns: + `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if + reduction is ``none``, ``()`` otherwise. 
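(Editor's sketch, not part of the patch: a minimal usage example of this CRF layer as the NER models in this commit wire it up. The tag count, tensor shapes, and import path are illustrative, and the byte mask assumes the older PyTorch (~1.1) pinned by the original README.)

```python
import torch
from models.layers.crf import CRF  # path as added by this patch; adjust if the package root differs

# toy setup: batch of 2 sequences, length 4, 5 tag types (all values are made up)
crf = CRF(num_tags=5, batch_first=True)
emissions = torch.randn(2, 4, 5)                      # (batch, seq_len, num_tags), e.g. classifier logits
tags = torch.randint(0, 5, (2, 4), dtype=torch.long)  # gold tag ids
mask = torch.tensor([[1, 1, 1, 1],
                     [1, 1, 1, 0]], dtype=torch.uint8)  # first timestep must be unmasked

log_likelihood = crf(emissions, tags, mask=mask)  # scalar, reduction='mean' by default
loss = -log_likelihood                            # the NER models negate it to obtain a training loss
best_paths = crf.decode(emissions, mask=mask)     # LongTensor of shape (nbest=1, batch, seq_len)
```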
+ """ + if reduction not in ('none', 'sum', 'mean', 'token_mean'): + raise ValueError(f'invalid reduction: {reduction}') + if mask is None: + mask = torch.ones_like(tags, dtype=torch.uint8, device=tags.device) + if mask.dtype != torch.uint8: + mask = mask.byte() + self._validate(emissions, tags=tags, mask=mask) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + tags = tags.transpose(0, 1) + mask = mask.transpose(0, 1) + + # shape: (batch_size,) + numerator = self._compute_score(emissions, tags, mask) + # shape: (batch_size,) + denominator = self._compute_normalizer(emissions, mask) + # shape: (batch_size,) + llh = numerator - denominator + + if reduction == 'none': + return llh + if reduction == 'sum': + return llh.sum() + if reduction == 'mean': + return llh.mean() + return llh.sum() / mask.float().sum() + + def decode(self, emissions: torch.Tensor, + mask: Optional[torch.ByteTensor] = None, + nbest: Optional[int] = None, + pad_tag: Optional[int] = None) -> List[List[List[int]]]: + """Find the most likely tag sequence using Viterbi algorithm. + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. + mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + nbest (`int`): Number of most probable paths for each sequence + pad_tag (`int`): Tag at padded positions. Often input varies in length and + the length will be padded to the maximum length in the batch. Tags at + the padded positions will be assigned with a padding tag, i.e. `pad_tag` + Returns: + A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + """ + if nbest is None: + nbest = 1 + if mask is None: + mask = torch.ones(emissions.shape[:2], dtype=torch.uint8, + device=emissions.device) + if mask.dtype != torch.uint8: + mask = mask.byte() + self._validate(emissions, mask=mask) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + mask = mask.transpose(0, 1) + + if nbest == 1: + return self._viterbi_decode(emissions, mask, pad_tag).unsqueeze(0) + return self._viterbi_decode_nbest(emissions, mask, nbest, pad_tag) + + def _validate(self, emissions: torch.Tensor, + tags: Optional[torch.LongTensor] = None, + mask: Optional[torch.ByteTensor] = None) -> None: + if emissions.dim() != 3: + raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') + if emissions.size(2) != self.num_tags: + raise ValueError( + f'expected last dimension of emissions is {self.num_tags}, ' + f'got {emissions.size(2)}') + + if tags is not None: + if emissions.shape[:2] != tags.shape: + raise ValueError( + 'the first two dimensions of emissions and tags must match, ' + f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}') + + if mask is not None: + if emissions.shape[:2] != mask.shape: + raise ValueError( + 'the first two dimensions of emissions and mask must match, ' + f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}') + no_empty_seq = not self.batch_first and mask[0].all() + no_empty_seq_bf = self.batch_first and mask[:, 0].all() + if not no_empty_seq and not no_empty_seq_bf: + raise ValueError('mask of the first timestep must all be on') + + def _compute_score(self, emissions: torch.Tensor, + tags: torch.LongTensor, + mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # tags: 
(seq_length, batch_size) + # mask: (seq_length, batch_size) + seq_length, batch_size = tags.shape + mask = mask.float() + + # Start transition score and first emission + # shape: (batch_size,) + score = self.start_transitions[tags[0]] + score += emissions[0, torch.arange(batch_size), tags[0]] + + for i in range(1, seq_length): + # Transition score to next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += self.transitions[tags[i - 1], tags[i]] * mask[i] + + # Emission score for next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i] + + # End transition score + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + # shape: (batch_size,) + last_tags = tags[seq_ends, torch.arange(batch_size)] + # shape: (batch_size,) + score += self.end_transitions[last_tags] + + return score + + def _compute_normalizer(self, emissions: torch.Tensor, + mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + seq_length = emissions.size(0) + + # Start transition score and first emission; score has size of + # (batch_size, num_tags) where for each batch, the j-th column stores + # the score that the first timestep has tag j + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + + for i in range(1, seq_length): + # Broadcast score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emissions = emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the sum of scores of all + # possible tag sequences so far that end with transitioning from tag i to tag j + # and emitting + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emissions + + # Sum over all possible current tags, but we're in score space, so a sum + # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of + # all possible tag sequences so far, that end in tag i + # shape: (batch_size, num_tags) + next_score = torch.logsumexp(next_score, dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(1), next_score, score) + + # End transition score + # shape: (batch_size, num_tags) + score += self.end_transitions + + # Sum (log-sum-exp) over all possible tags + # shape: (batch_size,) + return torch.logsumexp(score, dim=1) + + def _viterbi_decode(self, emissions: torch.FloatTensor, + mask: torch.ByteTensor, + pad_tag: Optional[int] = None) -> List[List[int]]: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + # return: (batch_size, seq_length) + if pad_tag is None: + pad_tag = 0 + + device = emissions.device + seq_length, batch_size = mask.shape + + # Start transition and first emission + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + history_idx = torch.zeros((seq_length, batch_size, self.num_tags), + dtype=torch.long, device=device) + oor_idx = torch.zeros((batch_size, self.num_tags), + dtype=torch.long, device=device) + oor_tag = torch.full((seq_length, batch_size), pad_tag, + dtype=torch.long, device=device) + + # 
- score is a tensor of size (batch_size, num_tags) where for every batch, + # value at column j stores the score of the best tag sequence so far that ends + # with tag j + # - history_idx saves where the best tags candidate transitioned from; this is used + # when we trace back the best tag sequence + # - oor_idx saves the best tags candidate transitioned from at the positions + # where mask is 0, i.e. out of range (oor) + + # Viterbi algorithm recursive case: we compute the score of the best tag sequence + # for every possible next tag + for i in range(1, seq_length): + # Broadcast viterbi score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emission = emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the score of the best + # tag sequence so far that ends with transitioning from tag i to tag j and emitting + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emission + + # Find the maximum score over all possible current tag + # shape: (batch_size, num_tags) + next_score, indices = next_score.max(dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # and save the index that produces the next score + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(-1), next_score, score) + indices = torch.where(mask[i].unsqueeze(-1), indices, oor_idx) + history_idx[i - 1] = indices + + # End transition score + # shape: (batch_size, num_tags) + end_score = score + self.end_transitions + _, end_tag = end_score.max(dim=1) + + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + + # insert the best tag at each sequence end (last position with mask == 1) + history_idx = history_idx.transpose(1, 0).contiguous() + history_idx.scatter_(1, seq_ends.view(-1, 1, 1).expand(-1, 1, self.num_tags), + end_tag.view(-1, 1, 1).expand(-1, 1, self.num_tags)) + history_idx = history_idx.transpose(1, 0).contiguous() + + # The most probable path for each sequence + best_tags_arr = torch.zeros((seq_length, batch_size), + dtype=torch.long, device=device) + best_tags = torch.zeros(batch_size, 1, dtype=torch.long, device=device) + for idx in range(seq_length - 1, -1, -1): + best_tags = torch.gather(history_idx[idx], 1, best_tags) + best_tags_arr[idx] = best_tags.data.view(batch_size) + + return torch.where(mask, best_tags_arr, oor_tag).transpose(0, 1) + + def _viterbi_decode_nbest(self, emissions: torch.FloatTensor, + mask: torch.ByteTensor, + nbest: int, + pad_tag: Optional[int] = None) -> List[List[List[int]]]: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + # return: (nbest, batch_size, seq_length) + if pad_tag is None: + pad_tag = 0 + + device = emissions.device + seq_length, batch_size = mask.shape + + # Start transition and first emission + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + history_idx = torch.zeros((seq_length, batch_size, self.num_tags, nbest), + dtype=torch.long, device=device) + oor_idx = torch.zeros((batch_size, self.num_tags, nbest), + dtype=torch.long, device=device) + oor_tag = torch.full((seq_length, batch_size, nbest), pad_tag, + dtype=torch.long, device=device) + + # + score is a tensor of size (batch_size, num_tags) where for every batch, + # 
value at column j stores the score of the best tag sequence so far that ends + # with tag j + # + history_idx saves where the best tags candidate transitioned from; this is used + # when we trace back the best tag sequence + # - oor_idx saves the best tags candidate transitioned from at the positions + # where mask is 0, i.e. out of range (oor) + + # Viterbi algorithm recursive case: we compute the score of the best tag sequence + # for every possible next tag + for i in range(1, seq_length): + if i == 1: + broadcast_score = score.unsqueeze(-1) + broadcast_emission = emissions[i].unsqueeze(1) + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emission + else: + broadcast_score = score.unsqueeze(-1) + broadcast_emission = emissions[i].unsqueeze(1).unsqueeze(2) + # shape: (batch_size, num_tags, nbest, num_tags) + next_score = broadcast_score + self.transitions.unsqueeze(1) + broadcast_emission + + # Find the top `nbest` maximum score over all possible current tag + # shape: (batch_size, nbest, num_tags) + next_score, indices = next_score.view(batch_size, -1, self.num_tags).topk(nbest, dim=1) + + if i == 1: + score = score.unsqueeze(-1).expand(-1, -1, nbest) + indices = indices * nbest + + # convert to shape: (batch_size, num_tags, nbest) + next_score = next_score.transpose(2, 1) + indices = indices.transpose(2, 1) + + # Set score to the next score if this timestep is valid (mask == 1) + # and save the index that produces the next score + # shape: (batch_size, num_tags, nbest) + score = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1), next_score, score) + indices = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1), indices, oor_idx) + history_idx[i - 1] = indices + + # End transition score shape: (batch_size, num_tags, nbest) + end_score = score + self.end_transitions.unsqueeze(-1) + _, end_tag = end_score.view(batch_size, -1).topk(nbest, dim=1) + + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + + # insert the best tag at each sequence end (last position with mask == 1) + history_idx = history_idx.transpose(1, 0).contiguous() + history_idx.scatter_(1, seq_ends.view(-1, 1, 1, 1).expand(-1, 1, self.num_tags, nbest), + end_tag.view(-1, 1, 1, nbest).expand(-1, 1, self.num_tags, nbest)) + history_idx = history_idx.transpose(1, 0).contiguous() + + # The most probable path for each sequence + best_tags_arr = torch.zeros((seq_length, batch_size, nbest), + dtype=torch.long, device=device) + best_tags = torch.arange(nbest, dtype=torch.long, device=device) \ + .view(1, -1).expand(batch_size, -1) + for idx in range(seq_length - 1, -1, -1): + best_tags = torch.gather(history_idx[idx].view(batch_size, -1), 1, best_tags) + best_tags_arr[idx] = best_tags.data.view(batch_size, -1) // nbest + + return torch.where(mask.unsqueeze(-1), best_tags_arr, oor_tag).permute(2, 1, 0) \ No newline at end of file diff --git a/pytorch_version/models/layers/linears.py b/pytorch_version/models/layers/linears.py new file mode 100755 index 00000000..9437ae3b --- /dev/null +++ b/pytorch_version/models/layers/linears.py @@ -0,0 +1,40 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class FeedForwardNetwork(nn.Module): + def __init__(self, input_size, hidden_size, output_size, dropout_rate=0): + super(FeedForwardNetwork, self).__init__() + self.dropout_rate = dropout_rate + self.linear1 = nn.Linear(input_size, hidden_size) + self.linear2 = nn.Linear(hidden_size, output_size) + + def forward(self, x): + x_proj = 
F.dropout(F.relu(self.linear1(x)), p=self.dropout_rate, training=self.training) + x_proj = self.linear2(x_proj) + return x_proj + + +class PoolerStartLogits(nn.Module): + def __init__(self, hidden_size, num_classes): + super(PoolerStartLogits, self).__init__() + self.dense = nn.Linear(hidden_size, num_classes) + + def forward(self, hidden_states, p_mask=None): + x = self.dense(hidden_states) + return x + +class PoolerEndLogits(nn.Module): + def __init__(self, hidden_size, num_classes): + super(PoolerEndLogits, self).__init__() + self.dense_0 = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + self.LayerNorm = nn.LayerNorm(hidden_size) + self.dense_1 = nn.Linear(hidden_size, num_classes) + + def forward(self, hidden_states, start_positions=None, p_mask=None): + x = self.dense_0(torch.cat([hidden_states, start_positions], dim=-1)) + x = self.activation(x) + x = self.LayerNorm(x) + x = self.dense_1(x) + return x diff --git a/pytorch_version/models/transformers/__init__.py b/pytorch_version/models/transformers/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/__main__.py b/pytorch_version/models/transformers/__main__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_albert.py b/pytorch_version/models/transformers/configuration_albert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_auto.py b/pytorch_version/models/transformers/configuration_auto.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_bert.py b/pytorch_version/models/transformers/configuration_bert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_ctrl.py b/pytorch_version/models/transformers/configuration_ctrl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_distilbert.py b/pytorch_version/models/transformers/configuration_distilbert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_gpt2.py b/pytorch_version/models/transformers/configuration_gpt2.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_openai.py b/pytorch_version/models/transformers/configuration_openai.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_roberta.py b/pytorch_version/models/transformers/configuration_roberta.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_transfo_xl.py b/pytorch_version/models/transformers/configuration_transfo_xl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_utils.py b/pytorch_version/models/transformers/configuration_utils.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_xlm.py b/pytorch_version/models/transformers/configuration_xlm.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_xlnet.py b/pytorch_version/models/transformers/configuration_xlnet.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/file_utils.py b/pytorch_version/models/transformers/file_utils.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_albert.py b/pytorch_version/models/transformers/modeling_albert.py old mode 100644 new mode 100755 diff --git 
a/pytorch_version/models/transformers/modeling_albert_bright.py b/pytorch_version/models/transformers/modeling_albert_bright.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_auto.py b/pytorch_version/models/transformers/modeling_auto.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_bert.py b/pytorch_version/models/transformers/modeling_bert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_ctrl.py b/pytorch_version/models/transformers/modeling_ctrl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_distilbert.py b/pytorch_version/models/transformers/modeling_distilbert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_gpt2.py b/pytorch_version/models/transformers/modeling_gpt2.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_openai.py b/pytorch_version/models/transformers/modeling_openai.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_roberta.py b/pytorch_version/models/transformers/modeling_roberta.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_transfo_xl.py b/pytorch_version/models/transformers/modeling_transfo_xl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_transfo_xl_utilities.py b/pytorch_version/models/transformers/modeling_transfo_xl_utilities.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_utils.py b/pytorch_version/models/transformers/modeling_utils.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_xlm.py b/pytorch_version/models/transformers/modeling_xlm.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_xlnet.py b/pytorch_version/models/transformers/modeling_xlnet.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_albert.py b/pytorch_version/models/transformers/tokenization_albert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_auto.py b/pytorch_version/models/transformers/tokenization_auto.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_bert.py b/pytorch_version/models/transformers/tokenization_bert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_ctrl.py b/pytorch_version/models/transformers/tokenization_ctrl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_distilbert.py b/pytorch_version/models/transformers/tokenization_distilbert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_gpt2.py b/pytorch_version/models/transformers/tokenization_gpt2.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_utils.py b/pytorch_version/models/transformers/tokenization_utils.py old mode 100644 new mode 100755 index 3d91067b..5e5be872 --- a/pytorch_version/models/transformers/tokenization_utils.py +++ b/pytorch_version/models/transformers/tokenization_utils.py @@ -739,7 +739,6 @@ def encode(self, def encode_plus(self, text, text_pair=None, - text_third=None, add_special_tokens=False, max_length=None, stride=0, @@ -786,18 +785,16 @@ def 
get_input_ids(text): first_ids = get_input_ids(text) second_ids = get_input_ids(text_pair) if text_pair is not None else None - third_ids = get_input_ids(text_third) if text_third is not None else None return self.prepare_for_model(first_ids, pair_ids=second_ids, - third_ids=third_ids, max_length=max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, return_tensors=return_tensors) - def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, add_special_tokens=False, stride=0, + def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, truncation_strategy='longest_first', return_tensors=None): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. @@ -844,18 +841,17 @@ def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 - len_third_ids = len(third_ids) if pair else 0 encoded_inputs = {} total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) - if len_third_ids>0: - total_len+= 1 if max_length and total_len > max_length: - ids, pair_ids,third_ids = self.truncate_sequences(ids, pair_ids=pair_ids,third_ids=third_ids, - num_tokens_to_remove=total_len-max_length) + ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids, + num_tokens_to_remove=total_len-max_length, + truncation_strategy=truncation_strategy, + stride=stride) + encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length - if len(third_ids) >0: - pair_ids = pair_ids+[self.sep_token_id]+third_ids + if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) @@ -864,8 +860,14 @@ def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) - sequence = torch.tensor([sequence]) - token_type_ids = torch.tensor([token_type_ids]) + if return_tensors == 'tf' and is_tf_available(): + sequence = tf.constant([sequence]) + token_type_ids = tf.constant([token_type_ids]) + elif return_tensors == 'pt' and is_torch_available(): + sequence = torch.tensor([sequence]) + token_type_ids = torch.tensor([token_type_ids]) + elif return_tensors is not None: + logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) encoded_inputs["input_ids"] = sequence encoded_inputs["token_type_ids"] = token_type_ids @@ -877,23 +879,45 @@ def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, return encoded_inputs - def truncate_sequences(self, ids, pair_ids=None, third_ids=None,num_tokens_to_remove=0): + def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): """Truncates a sequence pair in place to the maximum length. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. 
+ - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) """ - if pair_ids is None: - pair_ids = [] - if third_ids is None: - third_ids = [] if num_tokens_to_remove <= 0: - return ids, pair_ids, third_ids - for _ in range(num_tokens_to_remove): - if len(ids) > len(pair_ids) and len(ids) > len(third_ids): - ids = ids[:-1] - elif len(pair_ids) > len(ids) and len(pair_ids) > len(third_ids): - pair_ids = pair_ids[:-1] - else: - third_ids = third_ids[:-1] - return (ids, pair_ids, third_ids) + return ids, pair_ids, [] + + if truncation_strategy == 'longest_first': + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == 'only_first': + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == 'only_second': + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == 'do_not_truncate': + raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") + else: + raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']") + return (ids, pair_ids, overflowing_tokens) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): logger.warning("This tokenizer does not make use of special tokens.") diff --git a/pytorch_version/outputs/cluener_output/__init__.py b/pytorch_version/outputs/cluener_output/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/outputs/cluener_output/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/prev_trained_model/bert-base/.gitignore b/pytorch_version/prev_trained_model/bert-base/.gitignore new file mode 100755 index 00000000..894a44cc --- /dev/null +++ b/pytorch_version/prev_trained_model/bert-base/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/pytorch_version/prev_trained_model/bert-base/__init__.py b/pytorch_version/prev_trained_model/bert-base/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/processors/__init__.py b/pytorch_version/processors/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/processors/ner_seq.py b/pytorch_version/processors/ner_seq.py old mode 100644 new mode 100755 index 6a5c3ebc..c4c45f1a --- a/pytorch_version/processors/ner_seq.py +++ b/pytorch_version/processors/ner_seq.py @@ -107,16 +107,16 @@ def convert_examples_to_features(examples,label_list,max_seq_length,tokenizer, # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens += [sep_token] - label_ids += [label_map[sep_token]] + label_ids += [label_map['O']] segment_ids = [sequence_a_segment_id] * len(tokens) if cls_token_at_end: tokens += [cls_token] - label_ids += [label_map[cls_token]] + label_ids += [label_map['O']] segment_ids += [cls_token_segment_id] else: tokens = [cls_token] + tokens - label_ids = [label_map[cls_token]] + label_ids + label_ids = [label_map['O']] + label_ids segment_ids = [cls_token_segment_id] + segment_ids input_ids = tokenizer.convert_tokens_to_ids(tokens) @@ -154,6 +154,48 @@ def convert_examples_to_features(examples,label_list,max_seq_length,tokenizer, segment_ids=segment_ids, label_ids=label_ids)) return features + +class CnerProcessor(DataProcessor): + """Processor for the chinese ner data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_text(os.path.join(data_dir, "train.char.bmes")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_text(os.path.join(data_dir, "dev.char.bmes")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_text(os.path.join(data_dir, "test.char.bmes")), "test") + + def get_labels(self): + """See base class.""" + return ["X",'B-CONT','B-EDU','B-LOC','B-NAME','B-ORG','B-PRO','B-RACE','B-TITLE', + 'I-CONT','I-EDU','I-LOC','I-NAME','I-ORG','I-PRO','I-RACE','I-TITLE', + 'O','S-NAME','S-ORG','S-RACE',"[START]", "[END]"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a= line['words'] + # BIOS + labels = [] + for x in line['labels']: + if 'M-' in x: + labels.append(x.replace('M-','I-')) + elif 'E-' in x: + labels.append(x.replace('E-', 'I-')) + else: + labels.append(x) + 
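# (editor's note, not in the original patch) the loop above folds BMES 'M-'/'E-' prefixes into 'I-' so CNER labels match the BIO/S-style tag set returned by get_labels()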
examples.append(InputExample(guid=guid, text_a=text_a, labels=labels)) + return examples + class CluenerProcessor(DataProcessor): """Processor for the chinese ner data set.""" @@ -177,7 +219,7 @@ def get_labels(self): 'I-organization', 'I-position','I-scene', "S-address", "S-book", "S-company", 'S-game', 'S-government', 'S-movie', 'S-name', 'S-organization', 'S-position', - 'S-scene','O',"[CLS]", "[SEP]"] + 'S-scene','O',"[START]", "[END]"] def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" @@ -189,6 +231,8 @@ def _create_examples(self, lines, set_type): labels = line['labels'] examples.append(InputExample(guid=guid, text_a=text_a, labels=labels)) return examples + ner_processors = { + "cner": CnerProcessor, 'cluener':CluenerProcessor } diff --git a/pytorch_version/processors/utils_ner.py b/pytorch_version/processors/utils_ner.py old mode 100644 new mode 100755 diff --git a/pytorch_version/run_ner_crf.py b/pytorch_version/run_ner_crf.py old mode 100644 new mode 100755 index bdb3ba2a..7e5bdac3 --- a/pytorch_version/run_ner_crf.py +++ b/pytorch_version/run_ner_crf.py @@ -1,12 +1,11 @@ -import argparse import glob import logging import os import json +import time -import numpy as np import torch -from torch.nn import CrossEntropyLoss +import torch.nn as nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler from callback.optimizater.adamw import AdamW @@ -15,21 +14,20 @@ from tools.common import seed_everything,json_to_text from tools.common import init_logger, logger -from models.transformers import WEIGHTS_NAME,BertConfig,AlbertConfig +from models.transformers import WEIGHTS_NAME, BertConfig, AlbertConfig from models.bert_for_ner import BertCrfForNer from models.albert_for_ner import AlbertCrfForNer -from processors.utils_ner import CNerTokenizer,get_entities +from processors.utils_ner import CNerTokenizer, get_entities from processors.ner_seq import convert_examples_to_features from processors.ner_seq import ner_processors as processors from processors.ner_seq import collate_fn from metrics.ner_metrics import SeqEntityScore - -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig,)), ()) +from tools.finetuning_argparse import get_argparse MODEL_CLASSES = { - ## bert ernie bert_wwm bert_wwwm_ext,roberta + ## bert ernie bert_wwm bert_wwwm_ext 'bert': (BertConfig, BertCrfForNer, CNerTokenizer), - 'albert':(AlbertConfig,AlbertCrfForNer,CNerTokenizer) + 'albert': (AlbertConfig, AlbertCrfForNer, CNerTokenizer) } def train(args, train_dataset, model, tokenizer): @@ -43,21 +41,34 @@ def train(args, train_dataset, model, tokenizer): args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] + bert_param_optimizer = list(model.bert.named_parameters()) + crf_param_optimizer = list(model.crf.named_parameters()) + linear_param_optimizer = list(model.classifier.named_parameters()) optimizer_grouped_parameters = [ - {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay,}, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + {'params': [p for n, p in 
bert_param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay, 'lr': args.learning_rate}, + {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, + 'lr': args.learning_rate}, + + {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate}, + {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, + 'lr': args.crf_learning_rate}, + + {'params': [p for n, p in linear_param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate}, + {'params': [p for n, p in linear_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, + 'lr': args.crf_learning_rate} ] + args.warmup_steps = int(t_total * args.warmup_proportion) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) - # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( - os.path.join(args.model_name_or_path, "scheduler.pt")): + os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) @@ -81,10 +92,10 @@ def train(args, train_dataset, model, tokenizer): logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size - * args.gradient_accumulation_steps - * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), - ) + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -96,7 +107,6 @@ def train(args, train_dataset, model, tokenizer): global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) - logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) @@ -114,7 +124,7 @@ def train(args, train_dataset, model, tokenizer): continue model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3],'input_lens':batch[4]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], 'input_lens': batch[4]} if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) @@ -161,17 +171,18 @@ def train(args, train_dataset, model, tokenizer): torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) - print(" ") + logger.info("\n") if 'cuda' in str(args.device): torch.cuda.empty_cache() return global_step, tr_loss / global_step + def evaluate(args, model, tokenizer, prefix=""): - metric = SeqEntityScore(args.id2label,markup=args.markup) + metric = SeqEntityScore(args.id2label, markup=args.markup) eval_output_dir = args.output_dir if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: os.makedirs(eval_output_dir) - eval_dataset = load_and_cache_examples(args, args.task_name,tokenizer, data_type='dev') + eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='dev') args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) @@ -184,36 +195,40 @@ def evaluate(args, model, tokenizer, prefix=""): eval_loss = 0.0 nb_eval_steps = 0 pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating") + if isinstance(model, nn.DataParallel): + model = model.module for step, batch in enumerate(eval_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3],'input_lens':batch[4]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], 'input_lens': batch[4]} if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] - tags,_ = model.crf._obtain_labels(logits, args.id2label, 
inputs['input_lens']) - if args.n_gpu > 1: - tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating - eval_loss += tmp_eval_loss.item() + tags = model.crf.decode(logits, inputs['attention_mask']) + if args.n_gpu > 1: + tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating + eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 out_label_ids = inputs['labels'].cpu().numpy().tolist() + input_lens = inputs['input_lens'].cpu().numpy().tolist() + tags = tags.squeeze(0).cpu().numpy().tolist() for i, label in enumerate(out_label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue - elif out_label_ids[i][j] == args.label2id['[SEP]']: + elif j == input_lens[i] - 1: metric.update(pred_paths=[temp_2], label_paths=[temp_1]) break else: temp_1.append(args.id2label[out_label_ids[i][j]]) - temp_2.append(tags[i][j]) + temp_2.append(args.id2label[tags[i][j]]) pbar(step) - print(' ') + logger.info("\n") eval_loss = eval_loss / nb_eval_steps eval_info, entity_info = metric.result() results = {f'{key}': value for key, value in eval_info.items()} @@ -221,80 +236,85 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info("***** Eval results %s *****", prefix) info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()]) logger.info(info) - logger.info("** Entity results %s **", prefix) + logger.info("***** Entity results %s *****", prefix) for key in sorted(entity_info.keys()): - logger.info("******* %s results ********"%key) + logger.info("******* %s results ********" % key) info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()]) logger.info(info) return results + def predict(args, model, tokenizer, prefix=""): pred_output_dir = args.output_dir if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]: os.makedirs(pred_output_dir) - test_dataset = load_and_cache_examples(args, args.task_name,tokenizer, data_type='test') + test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test') # Note that DistributedSampler samples randomly test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset) - test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1,collate_fn=collate_fn) + test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn) # Eval! 
logger.info("***** Running prediction %s *****", prefix) logger.info(" Num examples = %d", len(test_dataset)) logger.info(" Batch size = %d", 1) - results = [] + output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.json") pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting") + + if isinstance(model, nn.DataParallel): + model = model.module for step, batch in enumerate(test_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None,'input_lens':batch[4]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None, 'input_lens': batch[4]} if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) logits = outputs[0] - preds, _ = model.crf._obtain_labels(logits, args.id2label, inputs['input_lens']) - preds = preds[0][1:-1] # [CLS]XXXX[SEP] + tags = model.crf.decode(logits, inputs['attention_mask']) + tags = tags.squeeze(0).cpu().numpy().tolist() + preds = tags[0][1:-1] # [CLS]XXXX[SEP] label_entities = get_entities(preds, args.id2label, args.markup) json_d = {} json_d['id'] = step - json_d['tag_seq'] = " ".join(preds) + json_d['tag_seq'] = " ".join([args.id2label[x] for x in preds]) json_d['entities'] = label_entities results.append(json_d) pbar(step) - print(" ") - output_predic_file = os.path.join(pred_output_dir, prefix, "test_prediction.json") - output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json") - with open(output_predic_file, "w") as writer: + logger.info("\n") + with open(output_predict_file, "w") as writer: for record in results: writer.write(json.dumps(record) + '\n') - test_text = [] - with open(os.path.join(args.data_dir,"test.json"), 'r') as fr: - for line in fr: - test_text.append(json.loads(line)) - test_submit = [] - for x, y in zip(test_text, results): - json_d = {} - json_d['id'] = x['id'] - json_d['label'] = {} - entities = y['entities'] - words = list(x['text']) - if len(entities) != 0: - for subject in entities: - tag = subject[0] - start = subject[1] - end = subject[2] - word = "".join(words[start:end + 1]) - if tag in json_d['label']: - if word in json_d['label'][tag]: - json_d['label'][tag][word].append([start, end]) + if args.task_name == 'cluener': + output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json") + test_text = [] + with open(os.path.join(args.data_dir,"test.json"), 'r') as fr: + for line in fr: + test_text.append(json.loads(line)) + test_submit = [] + for x, y in zip(test_text, results): + json_d = {} + json_d['id'] = x['id'] + json_d['label'] = {} + entities = y['entities'] + words = list(x['text']) + if len(entities) != 0: + for subject in entities: + tag = subject[0] + start = subject[1] + end = subject[2] + word = "".join(words[start:end + 1]) + if tag in json_d['label']: + if word in json_d['label'][tag]: + json_d['label'][tag][word].append([start, end]) + else: + json_d['label'][tag][word] = [[start, end]] else: + json_d['label'][tag] = {} json_d['label'][tag][word] = [[start, end]] - else: - json_d['label'][tag] = {} - json_d['label'][tag][word] = [[start, end]] - test_submit.append(json_d) - json_to_text(output_submit_file,test_submit) + test_submit.append(json_d) + json_to_text(output_submit_file,test_submit) def load_and_cache_examples(args, task, tokenizer, data_type='train'): if args.local_rank not in 
[-1, 0] and not evaluate: @@ -304,7 +324,7 @@ def load_and_cache_examples(args, task, tokenizer, data_type='train'): cached_features_file = os.path.join(args.data_dir, 'cached_crf-{}_{}_{}_{}'.format( data_type, list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.train_max_seq_length if data_type=='train' else args.eval_max_seq_length), + str(args.train_max_seq_length if data_type == 'train' else args.eval_max_seq_length), str(task))) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) @@ -321,11 +341,11 @@ def load_and_cache_examples(args, task, tokenizer, data_type='train'): features = convert_examples_to_features(examples=examples, tokenizer=tokenizer, label_list=label_list, - max_seq_length=args.train_max_seq_length if data_type=='train' \ - else args.eval_max_seq_length, + max_seq_length=args.train_max_seq_length if data_type == 'train' \ + else args.eval_max_seq_length, cls_token_at_end=bool(args.model_type in ["xlnet"]), pad_on_left=bool(args.model_type in ['xlnet']), - cls_token = tokenizer.cls_token, + cls_token=tokenizer.cls_token, cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, sep_token=tokenizer.sep_token, # pad on the left for xlnet @@ -343,89 +363,20 @@ def load_and_cache_examples(args, task, tokenizer, data_type='train'): all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long) all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lens,all_label_ids) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lens, all_label_ids) return dataset -def main(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--data_dir",default=None,type=str,required=True, - help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",) - parser.add_argument("--model_type",default=None,type=str,required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),) - parser.add_argument("--model_name_or_path",default=None,type=str,required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),) - parser.add_argument("--output_dir",default=None,type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.", ) - - # Other parameters - parser.add_argument('--markup',default='bios',type=str,choices=['bios','bio']) - parser.add_argument( "--labels",default="",type=str, - help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",) - parser.add_argument( "--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name",default="",type=str, - help="Pretrained tokenizer name or path if not the same as model_name",) - parser.add_argument("--cache_dir",default="",type=str, - help="Where do you want to store the pre-trained models downloaded from s3", ) - parser.add_argument("--train_max_seq_length", default=128,type=int, - help="The maximum total input sequence length after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded.",) - parser.add_argument("--eval_max_seq_length",default=512,type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", ) - parser.add_argument("--do_train", action="store_true", help="Whether to run training.") - parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") - parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") - parser.add_argument("--evaluate_during_training",action="store_true", - help="Whether to run evaluation during training at each logging step.", ) - parser.add_argument("--do_lower_case", action="store_true", - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--gradient_accumulation_steps",type=int,default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.",) - parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument( "--max_steps", default=-1,type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.",) +def main(): + args = get_argparse().parse_args() - parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints",action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",) - parser.add_argument("--predict_checkpoints", type=int, default=0, - help="predict checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") - parser.add_argument("--overwrite_output_dir", action="store_true", - help="Overwrite the content of the output directory") - parser.add_argument("--overwrite_cache", action="store_true", - help="Overwrite the cached training and evaluation sets") - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument("--fp16",action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",) - parser.add_argument("--fp16_opt_level",type=str,default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html",) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") - parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") - args = parser.parse_args() if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) args.output_dir = args.output_dir + '{}'.format(args.model_type) if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) - init_logger(log_file=args.output_dir + '/{}-{}.log'.format(args.model_type, args.task_name)) + time_ = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()) + init_logger(log_file=args.output_dir + f'/{args.model_type}-{args.task_name}-{time_}.log') if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError( @@ -440,8 +391,8 @@ def main(): ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda:1" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = 1#torch.cuda.device_count() + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) @@ -449,8 +400,8 @@ def main(): args.n_gpu = 1 args.device = device logger.warning( - "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank,device,args.n_gpu, bool(args.local_rank != -1),args.fp16,) + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed seed_everything(args.seed) # Prepare NER task @@ -469,13 +420,12 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels,cache_dir=args.cache_dir if args.cache_dir else None,) + num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None,) - model = model_class.from_pretrained(args.model_name_or_path,from_tf=bool(".ckpt" in args.model_name_or_path), - config=config,cache_dir=args.cache_dir if args.cache_dir else None, - label2id=args.label2id,device=args.device) + cache_dir=args.cache_dir if args.cache_dir else None, ) + model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -483,7 +433,7 @@ def main(): logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: - train_dataset = load_and_cache_examples(args, args.task_name,tokenizer, data_type='train') + train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 
data_type='train') global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() @@ -515,9 +465,9 @@ def main(): for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - model = model_class.from_pretrained(checkpoint,config=config,label2id=args.label2id, device=args.device) + model = model_class.from_pretrained(checkpoint, config=config) model.to(args.device) - result= evaluate(args, model, tokenizer, prefix=prefix) + result = evaluate(args, model, tokenizer, prefix=prefix) if global_step: result = {"{}_{}".format(global_step, k): v for k, v in result.items()} results.update(result) @@ -537,9 +487,10 @@ def main(): logger.info("Predict the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - model = model_class.from_pretrained(checkpoint,config=config,label2id=args.label2id, device=args.device) + model = model_class.from_pretrained(checkpoint, config=config) model.to(args.device) predict(args, model, tokenizer, prefix=prefix) + if __name__ == "__main__": main() diff --git a/pytorch_version/scripts/run_ner_crf.sh b/pytorch_version/scripts/run_ner_crf.sh new file mode 100755 index 00000000..1f9d6e88 --- /dev/null +++ b/pytorch_version/scripts/run_ner_crf.sh @@ -0,0 +1,26 @@ +CURRENT_DIR=`pwd` +export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base +export CLUE_DIR=$CURRENT_DIR/datasets +export OUTPUR_DIR=$CURRENT_DIR/outputs +TASK_NAME="cluener" +# +python run_ner_crf.py \ + --model_type=bert \ + --model_name_or_path=$BERT_BASE_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir=$CLUE_DIR/${TASK_NAME}/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=512 \ + --per_gpu_train_batch_size=24 \ + --per_gpu_eval_batch_size=24 \ + --learning_rate=3e-5 \ + --crf_learning_rate=1e-3 \ + --num_train_epochs=4.0 \ + --logging_steps=448 \ + --save_steps=448 \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --overwrite_output_dir \ + --seed=42 diff --git a/pytorch_version/tools/__init__.py b/pytorch_version/tools/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/tools/common.py b/pytorch_version/tools/common.py old mode 100644 new mode 100755 diff --git a/pytorch_version/tools/download_clue_data.py b/pytorch_version/tools/download_clue_data.py old mode 100644 new mode 100755 index 7fccf789..cedc3088 --- a/pytorch_version/tools/download_clue_data.py +++ b/pytorch_version/tools/download_clue_data.py @@ -74,4 +74,8 @@ def main(arguments): download_and_extract(task, args.data_dir) if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) \ No newline at end of file + sys.exit(main(sys.argv[1:])) + +''' +python tools/download_clue_data.py --data_dir=./CLUEdatasets --tasks=cluener +''' \ No newline at end of file diff --git a/pytorch_version/tools/finetuning_argparse.py b/pytorch_version/tools/finetuning_argparse.py new file mode 100755 index 00000000..92d48c28 --- /dev/null +++ b/pytorch_version/tools/finetuning_argparse.py @@ -0,0 +1,96 @@ +import argparse + +def get_argparse(): + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument("--task_name", default=None, type=str, required=True,
+ help="The name of the task to train selected in the list: ") + parser.add_argument("--data_dir", default=None, type=str, required=True, + help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) + parser.add_argument("--model_type", default=None, type=str, required=True, + help="Model type selected in the list: ") + parser.add_argument("--model_name_or_path", default=None, type=str, required=True, + help="Path to pre-trained model or shortcut name selected in the list: " ) + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model predictions and checkpoints will be written.", ) + + # Other parameters + parser.add_argument('--markup', default='bios', type=str, + choices=['bios', 'bio']) + parser.add_argument('--loss_type', default='ce', type=str, + choices=['lsr', 'focal', 'ce']) + parser.add_argument("--config_name", default="", type=str, + help="Pretrained config name or path if not the same as model_name") + parser.add_argument("--tokenizer_name", default="", type=str, + help="Pretrained tokenizer name or path if not the same as model_name", ) + parser.add_argument("--cache_dir", default="", type=str, + help="Where do you want to store the pre-trained models downloaded from s3", ) + parser.add_argument("--train_max_seq_length", default=128, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + parser.add_argument("--eval_max_seq_length", default=512, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + parser.add_argument("--do_train", action="store_true", + help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", + help="Whether to run eval on the dev set.") + parser.add_argument("--do_predict", action="store_true", + help="Whether to run predictions on the test set.") + parser.add_argument("--evaluate_during_training", action="store_true", + help="Whether to run evaluation during training at each logging step.", ) + parser.add_argument("--do_lower_case", action="store_true", + help="Set this flag if you are using an uncased model.") + # adversarial training + parser.add_argument("--do_adv", action="store_true", + help="Whether to adversarial training.") + parser.add_argument('--adv_epsilon', default=1.0, type=float, + help="Epsilon for adversarial.") + parser.add_argument('--adv_name', default='word_embeddings', type=str, + help="name for adversarial layer.") + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for training.") + parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for evaluation.") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", ) + parser.add_argument("--learning_rate", default=5e-5, type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--crf_learning_rate", default=5e-5, type=float, + help="The initial learning rate for crf and linear layer.") + parser.add_argument("--weight_decay", default=0.01, type=float, + help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") + 
parser.add_argument("--max_grad_norm", default=1.0, type=float, + help="Max gradient norm.") + parser.add_argument("--num_train_epochs", default=3.0, type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--max_steps", default=-1, type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", ) + + parser.add_argument("--warmup_proportion", default=0.1, type=float, + help="Proportion of training to perform linear learning rate warmup for,E.g., 0.1 = 10% of training.") + parser.add_argument("--logging_steps", type=int, default=50, + help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument("--eval_all_checkpoints", action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) + parser.add_argument("--predict_checkpoints",type=int, default=0, + help="predict checkpoints starting with the same prefix as model_name ending and ending with step number") + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument("--overwrite_output_dir", action="store_true", + help="Overwrite the content of the output directory") + parser.add_argument("--overwrite_cache", action="store_true", + help="Overwrite the cached training and evaluation sets") + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + parser.add_argument("--fp16", action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) + parser.add_argument("--fp16_opt_level", type=str, default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") + return parser \ No newline at end of file diff --git a/pytorch_version/tools/plot.py b/pytorch_version/tools/plot.py new file mode 100755 index 00000000..08e34e04 --- /dev/null +++ b/pytorch_version/tools/plot.py @@ -0,0 +1,68 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.metrics import confusion_matrix +plt.switch_backend('agg') + +def plot_confusion_matrix(y_true, y_pred, classes, + save_path,normalize=False,title=None, + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. 
+ """ + if not title: + if normalize: + title = 'Normalized confusion matrix' + else: + title = 'Confusion matrix, without normalization' + # Compute confusion matrix + cm = confusion_matrix(y_true=y_true, y_pred=y_pred) + # Only use the labels that appear in the data + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + # --- plot--- # + plt.rcParams['savefig.dpi'] = 200 + plt.rcParams['figure.dpi'] = 200 + plt.rcParams['figure.figsize'] = [20, 20] # plot + plt.rcParams.update({'font.size': 10}) + fig, ax = plt.subplots() + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + # --- bar --- # + from mpl_toolkits.axes_grid1 import make_axes_locatable + divider = make_axes_locatable(ax) + cax = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(im, cax=cax) + # --- bar --- # + # ax.figure.colorbar(im, ax=ax) + # We want to show all ticks... + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + # ... and label them with the respective list entries + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", + rotation_mode="anchor") + # Loop over data dimensions and create text annotations. + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig.tight_layout() + plt.savefig(save_path) + +if __name__ == "__main__": + y_true = ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O'] + y_pred = ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O','B-PER', 'I-PER', 'O'] + classes = ['O','B-MISC', 'I-MISC','B-PER', 'I-PER'] + save_path = './ner_confusion_matrix.png' + plot_confusion_matrix(y_true,y_pred,classes,save_path) \ No newline at end of file