From b6597268c000e06aa95bcdc59ef122805254cab6 Mon Sep 17 00:00:00 2001 From: dongqianqian Date: Sun, 26 Apr 2020 10:03:39 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E9=80=9Fcrf=E5=B1=82=E8=A7=A3?= =?UTF-8?q?=E7=A0=81=E9=80=9F=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pytorch_version/README.md | 106 +---- pytorch_version/__init__.py | 0 pytorch_version/callback/__init__.py | 2 + pytorch_version/callback/lr_scheduler.py | 9 - .../callback/optimizater/__init__.py | 2 + pytorch_version/callback/optimizater/adamw.py | 0 pytorch_version/callback/optimizater/lamb.py | 0 pytorch_version/callback/optimizater/lars.py | 0 .../callback/optimizater/lookahead.py | 2 +- pytorch_version/callback/optimizater/nadam.py | 0 .../callback/optimizater/novograd.py | 0 .../callback/optimizater/planradam.py | 0 pytorch_version/callback/optimizater/radam.py | 0 .../callback/optimizater/ralamb.py | 0 pytorch_version/callback/progressbar.py | 0 pytorch_version/datasets/cluener/__init__.py | 2 + pytorch_version/losses/__init__.py | 0 pytorch_version/metrics/__init__.py | 2 + pytorch_version/metrics/ner_metrics.py | 0 pytorch_version/models/__init__.py | 0 pytorch_version/models/albert_for_ner.py | 25 +- pytorch_version/models/bert_for_ner.py | 24 +- pytorch_version/models/layers/__init__.py | 2 + pytorch_version/models/layers/crf.py | 411 ++++++++++++++++++ pytorch_version/models/layers/linears.py | 40 ++ .../models/transformers/__init__.py | 0 .../models/transformers/__main__.py | 0 .../transformers/configuration_albert.py | 0 .../models/transformers/configuration_auto.py | 0 .../models/transformers/configuration_bert.py | 0 .../models/transformers/configuration_ctrl.py | 0 .../transformers/configuration_distilbert.py | 0 .../models/transformers/configuration_gpt2.py | 0 .../transformers/configuration_openai.py | 0 .../transformers/configuration_roberta.py | 0 .../transformers/configuration_transfo_xl.py | 0 .../transformers/configuration_utils.py | 0 .../models/transformers/configuration_xlm.py | 0 .../transformers/configuration_xlnet.py | 0 .../models/transformers/file_utils.py | 0 .../models/transformers/modeling_albert.py | 0 .../transformers/modeling_albert_bright.py | 0 .../models/transformers/modeling_auto.py | 0 .../models/transformers/modeling_bert.py | 0 .../models/transformers/modeling_ctrl.py | 0 .../transformers/modeling_distilbert.py | 0 .../models/transformers/modeling_gpt2.py | 0 .../models/transformers/modeling_openai.py | 0 .../models/transformers/modeling_roberta.py | 0 .../transformers/modeling_transfo_xl.py | 0 .../modeling_transfo_xl_utilities.py | 0 .../models/transformers/modeling_utils.py | 0 .../models/transformers/modeling_xlm.py | 0 .../models/transformers/modeling_xlnet.py | 0 .../transformers/tokenization_albert.py | 0 .../models/transformers/tokenization_auto.py | 0 .../models/transformers/tokenization_bert.py | 0 .../models/transformers/tokenization_ctrl.py | 0 .../transformers/tokenization_distilbert.py | 0 .../models/transformers/tokenization_gpt2.py | 0 .../models/transformers/tokenization_utils.py | 78 ++-- .../outputs/cluener_output/__init__.py | 2 + .../prev_trained_model/bert-base/.gitignore | 104 +++++ .../prev_trained_model/bert-base/__init__.py | 0 pytorch_version/processors/__init__.py | 0 pytorch_version/processors/ner_seq.py | 52 ++- pytorch_version/processors/utils_ner.py | 0 pytorch_version/run_ner_crf.py | 273 +++++------- pytorch_version/scripts/run_ner_crf.sh | 26 ++ pytorch_version/tools/__init__.py | 0 
pytorch_version/tools/common.py | 0 pytorch_version/tools/download_clue_data.py | 6 +- pytorch_version/tools/finetuning_argparse.py | 96 ++++ pytorch_version/tools/plot.py | 68 +++ 74 files changed, 1008 insertions(+), 324 deletions(-) mode change 100644 => 100755 pytorch_version/README.md mode change 100644 => 100755 pytorch_version/__init__.py create mode 100755 pytorch_version/callback/__init__.py mode change 100644 => 100755 pytorch_version/callback/lr_scheduler.py create mode 100755 pytorch_version/callback/optimizater/__init__.py mode change 100644 => 100755 pytorch_version/callback/optimizater/adamw.py mode change 100644 => 100755 pytorch_version/callback/optimizater/lamb.py mode change 100644 => 100755 pytorch_version/callback/optimizater/lars.py mode change 100644 => 100755 pytorch_version/callback/optimizater/lookahead.py mode change 100644 => 100755 pytorch_version/callback/optimizater/nadam.py mode change 100644 => 100755 pytorch_version/callback/optimizater/novograd.py mode change 100644 => 100755 pytorch_version/callback/optimizater/planradam.py mode change 100644 => 100755 pytorch_version/callback/optimizater/radam.py mode change 100644 => 100755 pytorch_version/callback/optimizater/ralamb.py mode change 100644 => 100755 pytorch_version/callback/progressbar.py create mode 100755 pytorch_version/datasets/cluener/__init__.py mode change 100644 => 100755 pytorch_version/losses/__init__.py create mode 100755 pytorch_version/metrics/__init__.py mode change 100644 => 100755 pytorch_version/metrics/ner_metrics.py mode change 100644 => 100755 pytorch_version/models/__init__.py mode change 100644 => 100755 pytorch_version/models/albert_for_ner.py mode change 100644 => 100755 pytorch_version/models/bert_for_ner.py create mode 100755 pytorch_version/models/layers/__init__.py create mode 100755 pytorch_version/models/layers/crf.py create mode 100755 pytorch_version/models/layers/linears.py mode change 100644 => 100755 pytorch_version/models/transformers/__init__.py mode change 100644 => 100755 pytorch_version/models/transformers/__main__.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_albert.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_auto.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_bert.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_ctrl.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_distilbert.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_gpt2.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_openai.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_roberta.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_transfo_xl.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_utils.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_xlm.py mode change 100644 => 100755 pytorch_version/models/transformers/configuration_xlnet.py mode change 100644 => 100755 pytorch_version/models/transformers/file_utils.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_albert.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_albert_bright.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_auto.py mode change 100644 => 100755 
pytorch_version/models/transformers/modeling_bert.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_ctrl.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_distilbert.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_gpt2.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_openai.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_roberta.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_transfo_xl.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_transfo_xl_utilities.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_utils.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_xlm.py mode change 100644 => 100755 pytorch_version/models/transformers/modeling_xlnet.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_albert.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_auto.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_bert.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_ctrl.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_distilbert.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_gpt2.py mode change 100644 => 100755 pytorch_version/models/transformers/tokenization_utils.py create mode 100755 pytorch_version/outputs/cluener_output/__init__.py create mode 100755 pytorch_version/prev_trained_model/bert-base/.gitignore mode change 100644 => 100755 pytorch_version/prev_trained_model/bert-base/__init__.py mode change 100644 => 100755 pytorch_version/processors/__init__.py mode change 100644 => 100755 pytorch_version/processors/ner_seq.py mode change 100644 => 100755 pytorch_version/processors/utils_ner.py mode change 100644 => 100755 pytorch_version/run_ner_crf.py create mode 100755 pytorch_version/scripts/run_ner_crf.sh mode change 100644 => 100755 pytorch_version/tools/__init__.py mode change 100644 => 100755 pytorch_version/tools/common.py mode change 100644 => 100755 pytorch_version/tools/download_clue_data.py create mode 100755 pytorch_version/tools/finetuning_argparse.py create mode 100755 pytorch_version/tools/plot.py diff --git a/pytorch_version/README.md b/pytorch_version/README.md old mode 100644 new mode 100755 index 983a1e7c..436502f0 --- a/pytorch_version/README.md +++ b/pytorch_version/README.md @@ -1,60 +1,13 @@ -# CLUE_NER_PyTorch - -CLUENER fine-grained named entity recognition - -## Dataset description +### Dataset description Detailed data description: https://www.cluebenchmarks.com/introduce.html -## Code directory layout - -```text -├── callback # custom callbacks -| └── lr_scheduler.py -| └── ... -├── losses # custom loss functions -| └── focal_loss.py -| └── ... -├── CLUEdatasets # data storage -| └── cluener -├── metrics # metric computation -| └── ner_metrics.py -├── outputs # saved model outputs -| └── cluener_output -├── prev_trained_model # pretrained models -| └── albert_base -| └── bert-wwm -| └── ... -├── processors # data processing -| └── ner_seq.py -| └── ... -├── tools # common scripts -| └── common.py -| └── download_clue_data.py -| └── ... -├── models # main models -| └── transformers -| └── bert_for_ner.py -| └── ... -├── run_ner_span.py # main entry point -├── run_ner_span.sh # task launch script -``` -## Requirements - -* pytorch=1.1.0 -* boto3=1.9 -* regex -* sacremoses -* sentencepiece -* python3.7+ - -## How to run - -### 1. Download the CLUE_NER dataset by running: -```python -python tools/download_clue_data.py --data_dir=./CLUEdatasets --tasks=cluener +### How to run +1. Download the CLUE_NER dataset by running: +```shell +python tools/download_clue_data.py --data_dir=./datasets --tasks=cluener ``` -### 2. Pretrained model file layout, for example: +2. Pretrained model file layout, for example: ```text ├── prev_trained_model # pretrained models | └── bert-base | | └── vocab.txt | | └── config.json | | └── pytorch_model.bin ``` -### 3. Training: +3. Training: Run the corresponding shell script directly, e.g.: ```shell -sh run_ner_span.sh +sh scripts/run_ner_crf.sh ``` - -### 4. Prediction +4. Prediction By default the last checkpoint is used as the prediction model; you can also set the --predict_checkpoints argument to predict with a specific checkpoint, for example: - ```python CURRENT_DIR=`pwd` export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base -export GLUE_DIR=$CURRENT_DIR/CLUEdatasets +export CLUE_DIR=$CURRENT_DIR/datasets export OUTPUR_DIR=$CURRENT_DIR/outputs TASK_NAME="cluener" @@ -87,45 +38,14 @@ python run_ner_span.py \ --do_predict \ --predict_checkpoints=100 \ --do_lower_case \ - --loss_type=ce \ ... ``` ### Model list model_type currently supports **bert** and **albert** -**Note**: bert, ernie, bert_wwm, bert_wwm_ext and similar models differ only in their weights, not in the underlying architecture, so model_type=bert covers them; the same applies to the other model types. - -### Input tagging scheme - -The default tagging scheme is BIOS, for example: - -```text -美 B-LOC -国 I-LOC -的 O -华 B-PER -莱 I-PER -士 I-PER - -我 O -跟 O -他 O -谈 O -笑 O -风 O -生 O - -``` - -## Results - -The following are test results on the **dev** set: +**Note:** bert, ernie, bert_wwm, bert_wwm_ext and similar models differ only in their weights, not in the underlying architecture, so model_type=bert covers them; the same applies to the other model types. -| | Accuracy (entity) | Recall (entity) | F1 score (entity) | | -| ------------ | ------------------ | ------------------ | ------------------ | ------------------------------------------------------------ | -| BERT+Softmax | 0.7916 | 0.7962 | 0.7939 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 | -| BERT+CRF | 0.7877 | 0.8008 | 0.7942 | train_max_length=128 eval_max_length=512 epoch=5 lr=3e-5 batch_size=24 | -| BERT+Span | 0.8132 | **0.8092** | **0.8112** | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 | -| BERT+Span+focal_loss | 0.8121 | 0.8008 | 0.8064 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 loss_type=focal | -| BERT+Span+label_smoothing | **0.8235** | 0.7946 | 0.8088 | train_max_length=128 eval_max_length=512 epoch=4 lr=3e-5 batch_size=24 loss_type=lsr | +### Results +The F1 score on the dev set is 0.8076 \ No newline at end of file diff --git a/pytorch_version/__init__.py b/pytorch_version/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/__init__.py b/pytorch_version/callback/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/callback/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/callback/lr_scheduler.py b/pytorch_version/callback/lr_scheduler.py old mode 100644 new mode 100755 index 57543c34..2ee2ad3f --- a/pytorch_version/callback/lr_scheduler.py +++ b/pytorch_version/callback/lr_scheduler.py @@ -4,20 +4,11 @@ from torch.optim.optimizer import Optimizer from torch.optim.lr_scheduler import LambdaLR -__all__ = ['CustomDecayLR', - 'BertLR', - 'CyclicLR', - 'ReduceLROnPlateau', - 'ReduceLRWDOnPlateau', - 'CosineLRWithRestarts', - ] - def get_constant_schedule(optimizer, last_epoch=-1): """ Create a schedule with a constant learning rate.
""" return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) - def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and 1. diff --git a/pytorch_version/callback/optimizater/__init__.py b/pytorch_version/callback/optimizater/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/callback/optimizater/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/callback/optimizater/adamw.py b/pytorch_version/callback/optimizater/adamw.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/lamb.py b/pytorch_version/callback/optimizater/lamb.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/lars.py b/pytorch_version/callback/optimizater/lars.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/lookahead.py b/pytorch_version/callback/optimizater/lookahead.py old mode 100644 new mode 100755 index affb3fa0..cd0d7beb --- a/pytorch_version/callback/optimizater/lookahead.py +++ b/pytorch_version/callback/optimizater/lookahead.py @@ -1,5 +1,5 @@ import torch -from torch.optim import Optimizer +from torch.optim.optimizer import Optimizer from collections import defaultdict class Lookahead(Optimizer): diff --git a/pytorch_version/callback/optimizater/nadam.py b/pytorch_version/callback/optimizater/nadam.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/novograd.py b/pytorch_version/callback/optimizater/novograd.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/planradam.py b/pytorch_version/callback/optimizater/planradam.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/radam.py b/pytorch_version/callback/optimizater/radam.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/optimizater/ralamb.py b/pytorch_version/callback/optimizater/ralamb.py old mode 100644 new mode 100755 diff --git a/pytorch_version/callback/progressbar.py b/pytorch_version/callback/progressbar.py old mode 100644 new mode 100755 diff --git a/pytorch_version/datasets/cluener/__init__.py b/pytorch_version/datasets/cluener/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/datasets/cluener/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/losses/__init__.py b/pytorch_version/losses/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/metrics/__init__.py b/pytorch_version/metrics/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/metrics/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/metrics/ner_metrics.py b/pytorch_version/metrics/ner_metrics.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/__init__.py b/pytorch_version/models/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/albert_for_ner.py b/pytorch_version/models/albert_for_ner.py old mode 100644 new mode 100755 index 6fbd24b2..639a5591 --- a/pytorch_version/models/albert_for_ner.py +++ b/pytorch_version/models/albert_for_ner.py @@ -1,10 +1,10 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .crf import CRF +from .layers.crf import CRF from .transformers.modeling_albert import AlbertPreTrainedModel from .transformers.modeling_albert import 
AlbertModel -from .linears import PoolerEndLogits, PoolerStartLogits +from .layers.linears import PoolerEndLogits, PoolerStartLogits from torch.nn import CrossEntropyLoss from losses.focal_loss import FocalLoss from losses.label_smoothing import LabelSmoothingCrossEntropy @@ -21,11 +21,8 @@ def __init__(self, config): def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids, + position_ids=position_ids,head_mask=head_mask) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -50,23 +47,23 @@ def forward(self, input_ids, attention_mask=None, token_type_ids=None, return outputs # (loss), scores, (hidden_states), (attentions) class AlbertCrfForNer(AlbertPreTrainedModel): - def __init__(self, config, label2id, device): + def __init__(self, config): super(AlbertCrfForNer, self).__init__(config) self.bert = AlbertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, len(label2id)) - self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.crf = CRF(num_tags=config.num_labels, batch_first=True) self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None,labels=None,input_lens=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) if labels is not None: - loss = self.crf.calculate_loss(logits, tag_list=labels, lengths=input_lens) - outputs =(loss,)+outputs + loss = self.crf(emissions = logits, tags=labels, mask=attention_mask) + outputs =(-1*loss,)+outputs return outputs # (loss), scores class AlbertSpanForNer(AlbertPreTrainedModel): @@ -85,7 +82,7 @@ def __init__(self, config,): self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,end_positions=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) start_logits = self.start_fc(sequence_output) diff --git a/pytorch_version/models/bert_for_ner.py b/pytorch_version/models/bert_for_ner.py old mode 100644 new mode 100755 index e7cb44eb..2d867fce --- a/pytorch_version/models/bert_for_ner.py +++ b/pytorch_version/models/bert_for_ner.py @@ -1,10 +1,10 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .crf import CRF +from .layers.crf import CRF from .transformers.modeling_bert import BertPreTrainedModel from .transformers.modeling_bert import BertModel -from .linears import PoolerEndLogits, PoolerStartLogits +from .layers.linears import PoolerEndLogits, PoolerStartLogits from torch.nn import CrossEntropyLoss from losses.focal_loss import FocalLoss from losses.label_smoothing import LabelSmoothingCrossEntropy @@ -21,11 +21,7 @@ def 
__init__(self, config): def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, labels=None): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -50,23 +46,23 @@ def forward(self, input_ids, attention_mask=None, token_type_ids=None, return outputs # (loss), scores, (hidden_states), (attentions) class BertCrfForNer(BertPreTrainedModel): - def __init__(self, config, label2id, device): + def __init__(self, config): super(BertCrfForNer, self).__init__(config) self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, len(label2id)) - self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.crf = CRF(num_tags=config.num_labels, batch_first=True) self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None,labels=None,input_lens=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs =self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) if labels is not None: - loss = self.crf.calculate_loss(logits, tag_list=labels, lengths=input_lens) - outputs =(loss,)+outputs + loss = self.crf(emissions = logits, tags=labels, mask=attention_mask) + outputs =(-1*loss,)+outputs return outputs # (loss), scores class BertSpanForNer(BertPreTrainedModel): @@ -85,7 +81,7 @@ def __init__(self, config,): self.init_weights() def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,end_positions=None): - outputs = self.bert(input_ids, token_type_ids, attention_mask) + outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) start_logits = self.start_fc(sequence_output) diff --git a/pytorch_version/models/layers/__init__.py b/pytorch_version/models/layers/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/models/layers/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/models/layers/crf.py b/pytorch_version/models/layers/crf.py new file mode 100755 index 00000000..d8b3adcc --- /dev/null +++ b/pytorch_version/models/layers/crf.py @@ -0,0 +1,411 @@ +import torch +import torch.nn as nn +from typing import List, Optional + +class CRF(nn.Module): + """Conditional random field. + This module implements a conditional random field [LMP01]_. The forward computation + of this class computes the log likelihood of the given sequence of tags and + emission score tensor. This class also has `~CRF.decode` method which finds + the best tag sequence given an emission score tensor using `Viterbi algorithm`_. + Args: + num_tags: Number of tags. + batch_first: Whether the first dimension corresponds to the size of a minibatch. + Attributes: + start_transitions (`~torch.nn.Parameter`): Start transition score tensor of size + ``(num_tags,)``. 
+ end_transitions (`~torch.nn.Parameter`): End transition score tensor of size + ``(num_tags,)``. + transitions (`~torch.nn.Parameter`): Transition score tensor of size + ``(num_tags, num_tags)``. + .. [LMP01] Lafferty, J., McCallum, A., Pereira, F. (2001). + "Conditional random fields: Probabilistic models for segmenting and + labeling sequence data". *Proc. 18th International Conf. on Machine + Learning*. Morgan Kaufmann. pp. 282–289. + .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm + """ + + def __init__(self, num_tags: int, batch_first: bool = False) -> None: + if num_tags <= 0: + raise ValueError(f'invalid number of tags: {num_tags}') + super().__init__() + self.num_tags = num_tags + self.batch_first = batch_first + self.start_transitions = nn.Parameter(torch.empty(num_tags)) + self.end_transitions = nn.Parameter(torch.empty(num_tags)) + self.transitions = nn.Parameter(torch.empty(num_tags, num_tags)) + + self.reset_parameters() + + def reset_parameters(self) -> None: + """Initialize the transition parameters. + The parameters will be initialized randomly from a uniform distribution + between -0.1 and 0.1. + """ + nn.init.uniform_(self.start_transitions, -0.1, 0.1) + nn.init.uniform_(self.end_transitions, -0.1, 0.1) + nn.init.uniform_(self.transitions, -0.1, 0.1) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(num_tags={self.num_tags})' + + def forward(self, emissions: torch.Tensor, + tags: torch.LongTensor, + mask: Optional[torch.ByteTensor] = None, + reduction: str = 'mean') -> torch.Tensor: + """Compute the conditional log likelihood of a sequence of tags given emission scores. + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. + tags (`~torch.LongTensor`): Sequence of tags tensor of size + ``(seq_length, batch_size)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length)`` otherwise. + mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + reduction: Specifies the reduction to apply to the output: + ``none|sum|mean|token_mean``. ``none``: no reduction will be applied. + ``sum``: the output will be summed over batches. ``mean``: the output will be + averaged over batches. ``token_mean``: the output will be averaged over tokens. + Returns: + `~torch.Tensor`: The log likelihood. This will have size ``(batch_size,)`` if + reduction is ``none``, ``()`` otherwise. 
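(Editor's sketch, not part of the patch: a minimal usage example of this CRF layer as the NER models in this commit wire it up. The tag count, tensor shapes, and import path are illustrative, and the byte mask assumes the older PyTorch (~1.1) pinned by the original README.)

```python
import torch
from models.layers.crf import CRF  # path as added by this patch; adjust if the package root differs

# toy setup: batch of 2 sequences, length 4, 5 tag types (all values are made up)
crf = CRF(num_tags=5, batch_first=True)
emissions = torch.randn(2, 4, 5)                      # (batch, seq_len, num_tags), e.g. classifier logits
tags = torch.randint(0, 5, (2, 4), dtype=torch.long)  # gold tag ids
mask = torch.tensor([[1, 1, 1, 1],
                     [1, 1, 1, 0]], dtype=torch.uint8)  # first timestep must be unmasked

log_likelihood = crf(emissions, tags, mask=mask)  # scalar, reduction='mean' by default
loss = -log_likelihood                            # the NER models negate it to obtain a training loss
best_paths = crf.decode(emissions, mask=mask)     # LongTensor of shape (nbest=1, batch, seq_len)
```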
+ """ + if reduction not in ('none', 'sum', 'mean', 'token_mean'): + raise ValueError(f'invalid reduction: {reduction}') + if mask is None: + mask = torch.ones_like(tags, dtype=torch.uint8, device=tags.device) + if mask.dtype != torch.uint8: + mask = mask.byte() + self._validate(emissions, tags=tags, mask=mask) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + tags = tags.transpose(0, 1) + mask = mask.transpose(0, 1) + + # shape: (batch_size,) + numerator = self._compute_score(emissions, tags, mask) + # shape: (batch_size,) + denominator = self._compute_normalizer(emissions, mask) + # shape: (batch_size,) + llh = numerator - denominator + + if reduction == 'none': + return llh + if reduction == 'sum': + return llh.sum() + if reduction == 'mean': + return llh.mean() + return llh.sum() / mask.float().sum() + + def decode(self, emissions: torch.Tensor, + mask: Optional[torch.ByteTensor] = None, + nbest: Optional[int] = None, + pad_tag: Optional[int] = None) -> List[List[List[int]]]: + """Find the most likely tag sequence using Viterbi algorithm. + Args: + emissions (`~torch.Tensor`): Emission score tensor of size + ``(seq_length, batch_size, num_tags)`` if ``batch_first`` is ``False``, + ``(batch_size, seq_length, num_tags)`` otherwise. + mask (`~torch.ByteTensor`): Mask tensor of size ``(seq_length, batch_size)`` + if ``batch_first`` is ``False``, ``(batch_size, seq_length)`` otherwise. + nbest (`int`): Number of most probable paths for each sequence + pad_tag (`int`): Tag at padded positions. Often input varies in length and + the length will be padded to the maximum length in the batch. Tags at + the padded positions will be assigned with a padding tag, i.e. `pad_tag` + Returns: + A PyTorch tensor of the best tag sequence for each batch of shape + (nbest, batch_size, seq_length) + """ + if nbest is None: + nbest = 1 + if mask is None: + mask = torch.ones(emissions.shape[:2], dtype=torch.uint8, + device=emissions.device) + if mask.dtype != torch.uint8: + mask = mask.byte() + self._validate(emissions, mask=mask) + + if self.batch_first: + emissions = emissions.transpose(0, 1) + mask = mask.transpose(0, 1) + + if nbest == 1: + return self._viterbi_decode(emissions, mask, pad_tag).unsqueeze(0) + return self._viterbi_decode_nbest(emissions, mask, nbest, pad_tag) + + def _validate(self, emissions: torch.Tensor, + tags: Optional[torch.LongTensor] = None, + mask: Optional[torch.ByteTensor] = None) -> None: + if emissions.dim() != 3: + raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') + if emissions.size(2) != self.num_tags: + raise ValueError( + f'expected last dimension of emissions is {self.num_tags}, ' + f'got {emissions.size(2)}') + + if tags is not None: + if emissions.shape[:2] != tags.shape: + raise ValueError( + 'the first two dimensions of emissions and tags must match, ' + f'got {tuple(emissions.shape[:2])} and {tuple(tags.shape)}') + + if mask is not None: + if emissions.shape[:2] != mask.shape: + raise ValueError( + 'the first two dimensions of emissions and mask must match, ' + f'got {tuple(emissions.shape[:2])} and {tuple(mask.shape)}') + no_empty_seq = not self.batch_first and mask[0].all() + no_empty_seq_bf = self.batch_first and mask[:, 0].all() + if not no_empty_seq and not no_empty_seq_bf: + raise ValueError('mask of the first timestep must all be on') + + def _compute_score(self, emissions: torch.Tensor, + tags: torch.LongTensor, + mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # tags: 
(seq_length, batch_size) + # mask: (seq_length, batch_size) + seq_length, batch_size = tags.shape + mask = mask.float() + + # Start transition score and first emission + # shape: (batch_size,) + score = self.start_transitions[tags[0]] + score += emissions[0, torch.arange(batch_size), tags[0]] + + for i in range(1, seq_length): + # Transition score to next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += self.transitions[tags[i - 1], tags[i]] * mask[i] + + # Emission score for next tag, only added if next timestep is valid (mask == 1) + # shape: (batch_size,) + score += emissions[i, torch.arange(batch_size), tags[i]] * mask[i] + + # End transition score + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + # shape: (batch_size,) + last_tags = tags[seq_ends, torch.arange(batch_size)] + # shape: (batch_size,) + score += self.end_transitions[last_tags] + + return score + + def _compute_normalizer(self, emissions: torch.Tensor, + mask: torch.ByteTensor) -> torch.Tensor: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + seq_length = emissions.size(0) + + # Start transition score and first emission; score has size of + # (batch_size, num_tags) where for each batch, the j-th column stores + # the score that the first timestep has tag j + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + + for i in range(1, seq_length): + # Broadcast score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emissions = emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the sum of scores of all + # possible tag sequences so far that end with transitioning from tag i to tag j + # and emitting + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emissions + + # Sum over all possible current tags, but we're in score space, so a sum + # becomes a log-sum-exp: for each sample, entry i stores the sum of scores of + # all possible tag sequences so far, that end in tag i + # shape: (batch_size, num_tags) + next_score = torch.logsumexp(next_score, dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(1), next_score, score) + + # End transition score + # shape: (batch_size, num_tags) + score += self.end_transitions + + # Sum (log-sum-exp) over all possible tags + # shape: (batch_size,) + return torch.logsumexp(score, dim=1) + + def _viterbi_decode(self, emissions: torch.FloatTensor, + mask: torch.ByteTensor, + pad_tag: Optional[int] = None) -> List[List[int]]: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + # return: (batch_size, seq_length) + if pad_tag is None: + pad_tag = 0 + + device = emissions.device + seq_length, batch_size = mask.shape + + # Start transition and first emission + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + history_idx = torch.zeros((seq_length, batch_size, self.num_tags), + dtype=torch.long, device=device) + oor_idx = torch.zeros((batch_size, self.num_tags), + dtype=torch.long, device=device) + oor_tag = torch.full((seq_length, batch_size), pad_tag, + dtype=torch.long, device=device) + + # 
- score is a tensor of size (batch_size, num_tags) where for every batch, + # value at column j stores the score of the best tag sequence so far that ends + # with tag j + # - history_idx saves where the best tags candidate transitioned from; this is used + # when we trace back the best tag sequence + # - oor_idx saves the best tags candidate transitioned from at the positions + # where mask is 0, i.e. out of range (oor) + + # Viterbi algorithm recursive case: we compute the score of the best tag sequence + # for every possible next tag + for i in range(1, seq_length): + # Broadcast viterbi score for every possible next tag + # shape: (batch_size, num_tags, 1) + broadcast_score = score.unsqueeze(2) + + # Broadcast emission score for every possible current tag + # shape: (batch_size, 1, num_tags) + broadcast_emission = emissions[i].unsqueeze(1) + + # Compute the score tensor of size (batch_size, num_tags, num_tags) where + # for each sample, entry at row i and column j stores the score of the best + # tag sequence so far that ends with transitioning from tag i to tag j and emitting + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emission + + # Find the maximum score over all possible current tag + # shape: (batch_size, num_tags) + next_score, indices = next_score.max(dim=1) + + # Set score to the next score if this timestep is valid (mask == 1) + # and save the index that produces the next score + # shape: (batch_size, num_tags) + score = torch.where(mask[i].unsqueeze(-1), next_score, score) + indices = torch.where(mask[i].unsqueeze(-1), indices, oor_idx) + history_idx[i - 1] = indices + + # End transition score + # shape: (batch_size, num_tags) + end_score = score + self.end_transitions + _, end_tag = end_score.max(dim=1) + + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + + # insert the best tag at each sequence end (last position with mask == 1) + history_idx = history_idx.transpose(1, 0).contiguous() + history_idx.scatter_(1, seq_ends.view(-1, 1, 1).expand(-1, 1, self.num_tags), + end_tag.view(-1, 1, 1).expand(-1, 1, self.num_tags)) + history_idx = history_idx.transpose(1, 0).contiguous() + + # The most probable path for each sequence + best_tags_arr = torch.zeros((seq_length, batch_size), + dtype=torch.long, device=device) + best_tags = torch.zeros(batch_size, 1, dtype=torch.long, device=device) + for idx in range(seq_length - 1, -1, -1): + best_tags = torch.gather(history_idx[idx], 1, best_tags) + best_tags_arr[idx] = best_tags.data.view(batch_size) + + return torch.where(mask, best_tags_arr, oor_tag).transpose(0, 1) + + def _viterbi_decode_nbest(self, emissions: torch.FloatTensor, + mask: torch.ByteTensor, + nbest: int, + pad_tag: Optional[int] = None) -> List[List[List[int]]]: + # emissions: (seq_length, batch_size, num_tags) + # mask: (seq_length, batch_size) + # return: (nbest, batch_size, seq_length) + if pad_tag is None: + pad_tag = 0 + + device = emissions.device + seq_length, batch_size = mask.shape + + # Start transition and first emission + # shape: (batch_size, num_tags) + score = self.start_transitions + emissions[0] + history_idx = torch.zeros((seq_length, batch_size, self.num_tags, nbest), + dtype=torch.long, device=device) + oor_idx = torch.zeros((batch_size, self.num_tags, nbest), + dtype=torch.long, device=device) + oor_tag = torch.full((seq_length, batch_size, nbest), pad_tag, + dtype=torch.long, device=device) + + # + score is a tensor of size (batch_size, num_tags) where for every batch, + # 
value at column j stores the score of the best tag sequence so far that ends + # with tag j + # + history_idx saves where the best tags candidate transitioned from; this is used + # when we trace back the best tag sequence + # - oor_idx saves the best tags candidate transitioned from at the positions + # where mask is 0, i.e. out of range (oor) + + # Viterbi algorithm recursive case: we compute the score of the best tag sequence + # for every possible next tag + for i in range(1, seq_length): + if i == 1: + broadcast_score = score.unsqueeze(-1) + broadcast_emission = emissions[i].unsqueeze(1) + # shape: (batch_size, num_tags, num_tags) + next_score = broadcast_score + self.transitions + broadcast_emission + else: + broadcast_score = score.unsqueeze(-1) + broadcast_emission = emissions[i].unsqueeze(1).unsqueeze(2) + # shape: (batch_size, num_tags, nbest, num_tags) + next_score = broadcast_score + self.transitions.unsqueeze(1) + broadcast_emission + + # Find the top `nbest` maximum score over all possible current tag + # shape: (batch_size, nbest, num_tags) + next_score, indices = next_score.view(batch_size, -1, self.num_tags).topk(nbest, dim=1) + + if i == 1: + score = score.unsqueeze(-1).expand(-1, -1, nbest) + indices = indices * nbest + + # convert to shape: (batch_size, num_tags, nbest) + next_score = next_score.transpose(2, 1) + indices = indices.transpose(2, 1) + + # Set score to the next score if this timestep is valid (mask == 1) + # and save the index that produces the next score + # shape: (batch_size, num_tags, nbest) + score = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1), next_score, score) + indices = torch.where(mask[i].unsqueeze(-1).unsqueeze(-1), indices, oor_idx) + history_idx[i - 1] = indices + + # End transition score shape: (batch_size, num_tags, nbest) + end_score = score + self.end_transitions.unsqueeze(-1) + _, end_tag = end_score.view(batch_size, -1).topk(nbest, dim=1) + + # shape: (batch_size,) + seq_ends = mask.long().sum(dim=0) - 1 + + # insert the best tag at each sequence end (last position with mask == 1) + history_idx = history_idx.transpose(1, 0).contiguous() + history_idx.scatter_(1, seq_ends.view(-1, 1, 1, 1).expand(-1, 1, self.num_tags, nbest), + end_tag.view(-1, 1, 1, nbest).expand(-1, 1, self.num_tags, nbest)) + history_idx = history_idx.transpose(1, 0).contiguous() + + # The most probable path for each sequence + best_tags_arr = torch.zeros((seq_length, batch_size, nbest), + dtype=torch.long, device=device) + best_tags = torch.arange(nbest, dtype=torch.long, device=device) \ + .view(1, -1).expand(batch_size, -1) + for idx in range(seq_length - 1, -1, -1): + best_tags = torch.gather(history_idx[idx].view(batch_size, -1), 1, best_tags) + best_tags_arr[idx] = best_tags.data.view(batch_size, -1) // nbest + + return torch.where(mask.unsqueeze(-1), best_tags_arr, oor_tag).permute(2, 1, 0) \ No newline at end of file diff --git a/pytorch_version/models/layers/linears.py b/pytorch_version/models/layers/linears.py new file mode 100755 index 00000000..9437ae3b --- /dev/null +++ b/pytorch_version/models/layers/linears.py @@ -0,0 +1,40 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class FeedForwardNetwork(nn.Module): + def __init__(self, input_size, hidden_size, output_size, dropout_rate=0): + super(FeedForwardNetwork, self).__init__() + self.dropout_rate = dropout_rate + self.linear1 = nn.Linear(input_size, hidden_size) + self.linear2 = nn.Linear(hidden_size, output_size) + + def forward(self, x): + x_proj = 
F.dropout(F.relu(self.linear1(x)), p=self.dropout_rate, training=self.training) + x_proj = self.linear2(x_proj) + return x_proj + + +class PoolerStartLogits(nn.Module): + def __init__(self, hidden_size, num_classes): + super(PoolerStartLogits, self).__init__() + self.dense = nn.Linear(hidden_size, num_classes) + + def forward(self, hidden_states, p_mask=None): + x = self.dense(hidden_states) + return x + +class PoolerEndLogits(nn.Module): + def __init__(self, hidden_size, num_classes): + super(PoolerEndLogits, self).__init__() + self.dense_0 = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + self.LayerNorm = nn.LayerNorm(hidden_size) + self.dense_1 = nn.Linear(hidden_size, num_classes) + + def forward(self, hidden_states, start_positions=None, p_mask=None): + x = self.dense_0(torch.cat([hidden_states, start_positions], dim=-1)) + x = self.activation(x) + x = self.LayerNorm(x) + x = self.dense_1(x) + return x diff --git a/pytorch_version/models/transformers/__init__.py b/pytorch_version/models/transformers/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/__main__.py b/pytorch_version/models/transformers/__main__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_albert.py b/pytorch_version/models/transformers/configuration_albert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_auto.py b/pytorch_version/models/transformers/configuration_auto.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_bert.py b/pytorch_version/models/transformers/configuration_bert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_ctrl.py b/pytorch_version/models/transformers/configuration_ctrl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_distilbert.py b/pytorch_version/models/transformers/configuration_distilbert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_gpt2.py b/pytorch_version/models/transformers/configuration_gpt2.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_openai.py b/pytorch_version/models/transformers/configuration_openai.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_roberta.py b/pytorch_version/models/transformers/configuration_roberta.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_transfo_xl.py b/pytorch_version/models/transformers/configuration_transfo_xl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_utils.py b/pytorch_version/models/transformers/configuration_utils.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_xlm.py b/pytorch_version/models/transformers/configuration_xlm.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/configuration_xlnet.py b/pytorch_version/models/transformers/configuration_xlnet.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/file_utils.py b/pytorch_version/models/transformers/file_utils.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_albert.py b/pytorch_version/models/transformers/modeling_albert.py old mode 100644 new mode 100755 diff --git 
a/pytorch_version/models/transformers/modeling_albert_bright.py b/pytorch_version/models/transformers/modeling_albert_bright.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_auto.py b/pytorch_version/models/transformers/modeling_auto.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_bert.py b/pytorch_version/models/transformers/modeling_bert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_ctrl.py b/pytorch_version/models/transformers/modeling_ctrl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_distilbert.py b/pytorch_version/models/transformers/modeling_distilbert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_gpt2.py b/pytorch_version/models/transformers/modeling_gpt2.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_openai.py b/pytorch_version/models/transformers/modeling_openai.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_roberta.py b/pytorch_version/models/transformers/modeling_roberta.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_transfo_xl.py b/pytorch_version/models/transformers/modeling_transfo_xl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_transfo_xl_utilities.py b/pytorch_version/models/transformers/modeling_transfo_xl_utilities.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_utils.py b/pytorch_version/models/transformers/modeling_utils.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_xlm.py b/pytorch_version/models/transformers/modeling_xlm.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/modeling_xlnet.py b/pytorch_version/models/transformers/modeling_xlnet.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_albert.py b/pytorch_version/models/transformers/tokenization_albert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_auto.py b/pytorch_version/models/transformers/tokenization_auto.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_bert.py b/pytorch_version/models/transformers/tokenization_bert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_ctrl.py b/pytorch_version/models/transformers/tokenization_ctrl.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_distilbert.py b/pytorch_version/models/transformers/tokenization_distilbert.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_gpt2.py b/pytorch_version/models/transformers/tokenization_gpt2.py old mode 100644 new mode 100755 diff --git a/pytorch_version/models/transformers/tokenization_utils.py b/pytorch_version/models/transformers/tokenization_utils.py old mode 100644 new mode 100755 index 3d91067b..5e5be872 --- a/pytorch_version/models/transformers/tokenization_utils.py +++ b/pytorch_version/models/transformers/tokenization_utils.py @@ -739,7 +739,6 @@ def encode(self, def encode_plus(self, text, text_pair=None, - text_third=None, add_special_tokens=False, max_length=None, stride=0, @@ -786,18 +785,16 @@ def 
get_input_ids(text): first_ids = get_input_ids(text) second_ids = get_input_ids(text_pair) if text_pair is not None else None - third_ids = get_input_ids(text_third) if text_third is not None else None return self.prepare_for_model(first_ids, pair_ids=second_ids, - third_ids=third_ids, max_length=max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, return_tensors=return_tensors) - def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, add_special_tokens=False, stride=0, + def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, truncation_strategy='longest_first', return_tensors=None): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. @@ -844,18 +841,17 @@ def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, pair = bool(pair_ids is not None) len_ids = len(ids) len_pair_ids = len(pair_ids) if pair else 0 - len_third_ids = len(third_ids) if pair else 0 encoded_inputs = {} total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) - if len_third_ids>0: - total_len+= 1 if max_length and total_len > max_length: - ids, pair_ids,third_ids = self.truncate_sequences(ids, pair_ids=pair_ids,third_ids=third_ids, - num_tokens_to_remove=total_len-max_length) + ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids, + num_tokens_to_remove=total_len-max_length, + truncation_strategy=truncation_strategy, + stride=stride) + encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length - if len(third_ids) >0: - pair_ids = pair_ids+[self.sep_token_id]+third_ids + if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) @@ -864,8 +860,14 @@ def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) - sequence = torch.tensor([sequence]) - token_type_ids = torch.tensor([token_type_ids]) + if return_tensors == 'tf' and is_tf_available(): + sequence = tf.constant([sequence]) + token_type_ids = tf.constant([token_type_ids]) + elif return_tensors == 'pt' and is_torch_available(): + sequence = torch.tensor([sequence]) + token_type_ids = torch.tensor([token_type_ids]) + elif return_tensors is not None: + logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) encoded_inputs["input_ids"] = sequence encoded_inputs["token_type_ids"] = token_type_ids @@ -877,23 +879,45 @@ def prepare_for_model(self, ids, pair_ids=None,third_ids=None, max_length=None, return encoded_inputs - def truncate_sequences(self, ids, pair_ids=None, third_ids=None,num_tokens_to_remove=0): + def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): """Truncates a sequence pair in place to the maximum length. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. 
+ - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) """ - if pair_ids is None: - pair_ids = [] - if third_ids is None: - third_ids = [] if num_tokens_to_remove <= 0: - return ids, pair_ids, third_ids - for _ in range(num_tokens_to_remove): - if len(ids) > len(pair_ids) and len(ids) > len(third_ids): - ids = ids[:-1] - elif len(pair_ids) > len(ids) and len(pair_ids) > len(third_ids): - pair_ids = pair_ids[:-1] - else: - third_ids = third_ids[:-1] - return (ids, pair_ids, third_ids) + return ids, pair_ids, [] + + if truncation_strategy == 'longest_first': + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == 'only_first': + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == 'only_second': + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == 'do_not_truncate': + raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") + else: + raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']") + return (ids, pair_ids, overflowing_tokens) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): logger.warning("This tokenizer does not make use of special tokens.") diff --git a/pytorch_version/outputs/cluener_output/__init__.py b/pytorch_version/outputs/cluener_output/__init__.py new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/pytorch_version/outputs/cluener_output/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/pytorch_version/prev_trained_model/bert-base/.gitignore b/pytorch_version/prev_trained_model/bert-base/.gitignore new file mode 100755 index 00000000..894a44cc --- /dev/null +++ b/pytorch_version/prev_trained_model/bert-base/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/pytorch_version/prev_trained_model/bert-base/__init__.py b/pytorch_version/prev_trained_model/bert-base/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/processors/__init__.py b/pytorch_version/processors/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/processors/ner_seq.py b/pytorch_version/processors/ner_seq.py old mode 100644 new mode 100755 index 6a5c3ebc..c4c45f1a --- a/pytorch_version/processors/ner_seq.py +++ b/pytorch_version/processors/ner_seq.py @@ -107,16 +107,16 @@ def convert_examples_to_features(examples,label_list,max_seq_length,tokenizer, # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. tokens += [sep_token] - label_ids += [label_map[sep_token]] + label_ids += [label_map['O']] segment_ids = [sequence_a_segment_id] * len(tokens) if cls_token_at_end: tokens += [cls_token] - label_ids += [label_map[cls_token]] + label_ids += [label_map['O']] segment_ids += [cls_token_segment_id] else: tokens = [cls_token] + tokens - label_ids = [label_map[cls_token]] + label_ids + label_ids = [label_map['O']] + label_ids segment_ids = [cls_token_segment_id] + segment_ids input_ids = tokenizer.convert_tokens_to_ids(tokens) @@ -154,6 +154,48 @@ def convert_examples_to_features(examples,label_list,max_seq_length,tokenizer, segment_ids=segment_ids, label_ids=label_ids)) return features + +class CnerProcessor(DataProcessor): + """Processor for the chinese ner data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_text(os.path.join(data_dir, "train.char.bmes")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_text(os.path.join(data_dir, "dev.char.bmes")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_text(os.path.join(data_dir, "test.char.bmes")), "test") + + def get_labels(self): + """See base class.""" + return ["X",'B-CONT','B-EDU','B-LOC','B-NAME','B-ORG','B-PRO','B-RACE','B-TITLE', + 'I-CONT','I-EDU','I-LOC','I-NAME','I-ORG','I-PRO','I-RACE','I-TITLE', + 'O','S-NAME','S-ORG','S-RACE',"[START]", "[END]"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a= line['words'] + # BIOS + labels = [] + for x in line['labels']: + if 'M-' in x: + labels.append(x.replace('M-','I-')) + elif 'E-' in x: + labels.append(x.replace('E-', 'I-')) + else: + labels.append(x) + 
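# (editor's note, not in the original patch) the loop above folds BMES 'M-'/'E-' prefixes into 'I-' so CNER labels match the BIO/S-style tag set returned by get_labels()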
examples.append(InputExample(guid=guid, text_a=text_a, labels=labels)) + return examples + class CluenerProcessor(DataProcessor): """Processor for the chinese ner data set.""" @@ -177,7 +219,7 @@ def get_labels(self): 'I-organization', 'I-position','I-scene', "S-address", "S-book", "S-company", 'S-game', 'S-government', 'S-movie', 'S-name', 'S-organization', 'S-position', - 'S-scene','O',"[CLS]", "[SEP]"] + 'S-scene','O',"[START]", "[END]"] def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" @@ -189,6 +231,8 @@ def _create_examples(self, lines, set_type): labels = line['labels'] examples.append(InputExample(guid=guid, text_a=text_a, labels=labels)) return examples + ner_processors = { + "cner": CnerProcessor, 'cluener':CluenerProcessor } diff --git a/pytorch_version/processors/utils_ner.py b/pytorch_version/processors/utils_ner.py old mode 100644 new mode 100755 diff --git a/pytorch_version/run_ner_crf.py b/pytorch_version/run_ner_crf.py old mode 100644 new mode 100755 index bdb3ba2a..7e5bdac3 --- a/pytorch_version/run_ner_crf.py +++ b/pytorch_version/run_ner_crf.py @@ -1,12 +1,11 @@ -import argparse import glob import logging import os import json +import time -import numpy as np import torch -from torch.nn import CrossEntropyLoss +import torch.nn as nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler from callback.optimizater.adamw import AdamW @@ -15,21 +14,20 @@ from tools.common import seed_everything,json_to_text from tools.common import init_logger, logger -from models.transformers import WEIGHTS_NAME,BertConfig,AlbertConfig +from models.transformers import WEIGHTS_NAME, BertConfig, AlbertConfig from models.bert_for_ner import BertCrfForNer from models.albert_for_ner import AlbertCrfForNer -from processors.utils_ner import CNerTokenizer,get_entities +from processors.utils_ner import CNerTokenizer, get_entities from processors.ner_seq import convert_examples_to_features from processors.ner_seq import ner_processors as processors from processors.ner_seq import collate_fn from metrics.ner_metrics import SeqEntityScore - -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig,)), ()) +from tools.finetuning_argparse import get_argparse MODEL_CLASSES = { - ## bert ernie bert_wwm bert_wwwm_ext,roberta + ## bert ernie bert_wwm bert_wwwm_ext 'bert': (BertConfig, BertCrfForNer, CNerTokenizer), - 'albert':(AlbertConfig,AlbertCrfForNer,CNerTokenizer) + 'albert': (AlbertConfig, AlbertCrfForNer, CNerTokenizer) } def train(args, train_dataset, model, tokenizer): @@ -43,21 +41,34 @@ def train(args, train_dataset, model, tokenizer): args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] + bert_param_optimizer = list(model.bert.named_parameters()) + crf_param_optimizer = list(model.crf.named_parameters()) + linear_param_optimizer = list(model.classifier.named_parameters()) optimizer_grouped_parameters = [ - {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay,}, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + {'params': [p for n, p in 
bert_param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay, 'lr': args.learning_rate}, + {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, + 'lr': args.learning_rate}, + + {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate}, + {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, + 'lr': args.crf_learning_rate}, + + {'params': [p for n, p in linear_param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': args.weight_decay, 'lr': args.crf_learning_rate}, + {'params': [p for n, p in linear_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, + 'lr': args.crf_learning_rate} ] + args.warmup_steps = int(t_total * args.warmup_proportion) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) - # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( - os.path.join(args.model_name_or_path, "scheduler.pt")): + os.path.join(args.model_name_or_path, "scheduler.pt")): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) @@ -81,10 +92,10 @@ def train(args, train_dataset, model, tokenizer): logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", - args.train_batch_size - * args.gradient_accumulation_steps - * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), - ) + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -96,7 +107,6 @@ def train(args, train_dataset, model, tokenizer): global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) - logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) @@ -114,7 +124,7 @@ def train(args, train_dataset, model, tokenizer): continue model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3],'input_lens':batch[4]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], 'input_lens': batch[4]} if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) @@ -161,17 +171,18 @@ def train(args, train_dataset, model, tokenizer): torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) - print(" ") + logger.info("\n") if 'cuda' in str(args.device): torch.cuda.empty_cache() return global_step, tr_loss / global_step + def evaluate(args, model, tokenizer, prefix=""): - metric = SeqEntityScore(args.id2label,markup=args.markup) + metric = SeqEntityScore(args.id2label, markup=args.markup) eval_output_dir = args.output_dir if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: os.makedirs(eval_output_dir) - eval_dataset = load_and_cache_examples(args, args.task_name,tokenizer, data_type='dev') + eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='dev') args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) @@ -184,36 +195,40 @@ def evaluate(args, model, tokenizer, prefix=""): eval_loss = 0.0 nb_eval_steps = 0 pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating") + if isinstance(model, nn.DataParallel): + model = model.module for step, batch in enumerate(eval_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3],'input_lens':batch[4]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], 'input_lens': batch[4]} if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] - tags,_ = model.crf._obtain_labels(logits, args.id2label, 
inputs['input_lens']) - if args.n_gpu > 1: - tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating - eval_loss += tmp_eval_loss.item() + tags = model.crf.decode(logits, inputs['attention_mask']) + if args.n_gpu > 1: + tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating + eval_loss += tmp_eval_loss.item() nb_eval_steps += 1 out_label_ids = inputs['labels'].cpu().numpy().tolist() + input_lens = inputs['input_lens'].cpu().numpy().tolist() + tags = tags.squeeze(0).cpu().numpy().tolist() for i, label in enumerate(out_label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue - elif out_label_ids[i][j] == args.label2id['[SEP]']: + elif j == input_lens[i] - 1: metric.update(pred_paths=[temp_2], label_paths=[temp_1]) break else: temp_1.append(args.id2label[out_label_ids[i][j]]) - temp_2.append(tags[i][j]) + temp_2.append(args.id2label[tags[i][j]]) pbar(step) - print(' ') + logger.info("\n") eval_loss = eval_loss / nb_eval_steps eval_info, entity_info = metric.result() results = {f'{key}': value for key, value in eval_info.items()} @@ -221,80 +236,85 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info("***** Eval results %s *****", prefix) info = "-".join([f' {key}: {value:.4f} ' for key, value in results.items()]) logger.info(info) - logger.info("** Entity results %s **", prefix) + logger.info("***** Entity results %s *****", prefix) for key in sorted(entity_info.keys()): - logger.info("******* %s results ********"%key) + logger.info("******* %s results ********" % key) info = "-".join([f' {key}: {value:.4f} ' for key, value in entity_info[key].items()]) logger.info(info) return results + def predict(args, model, tokenizer, prefix=""): pred_output_dir = args.output_dir if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]: os.makedirs(pred_output_dir) - test_dataset = load_and_cache_examples(args, args.task_name,tokenizer, data_type='test') + test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='test') # Note that DistributedSampler samples randomly test_sampler = SequentialSampler(test_dataset) if args.local_rank == -1 else DistributedSampler(test_dataset) - test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1,collate_fn=collate_fn) + test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn) # Eval! 
logger.info("***** Running prediction %s *****", prefix) logger.info(" Num examples = %d", len(test_dataset)) logger.info(" Batch size = %d", 1) - results = [] + output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.json") pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting") + + if isinstance(model, nn.DataParallel): + model = model.module for step, batch in enumerate(test_dataloader): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None,'input_lens':batch[4]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": None, 'input_lens': batch[4]} if args.model_type != "distilbert": # XLM and RoBERTa don"t use segment_ids inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None) outputs = model(**inputs) logits = outputs[0] - preds, _ = model.crf._obtain_labels(logits, args.id2label, inputs['input_lens']) - preds = preds[0][1:-1] # [CLS]XXXX[SEP] + tags = model.crf.decode(logits, inputs['attention_mask']) + tags = tags.squeeze(0).cpu().numpy().tolist() + preds = tags[0][1:-1] # [CLS]XXXX[SEP] label_entities = get_entities(preds, args.id2label, args.markup) json_d = {} json_d['id'] = step - json_d['tag_seq'] = " ".join(preds) + json_d['tag_seq'] = " ".join([args.id2label[x] for x in preds]) json_d['entities'] = label_entities results.append(json_d) pbar(step) - print(" ") - output_predic_file = os.path.join(pred_output_dir, prefix, "test_prediction.json") - output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json") - with open(output_predic_file, "w") as writer: + logger.info("\n") + with open(output_predict_file, "w") as writer: for record in results: writer.write(json.dumps(record) + '\n') - test_text = [] - with open(os.path.join(args.data_dir,"test.json"), 'r') as fr: - for line in fr: - test_text.append(json.loads(line)) - test_submit = [] - for x, y in zip(test_text, results): - json_d = {} - json_d['id'] = x['id'] - json_d['label'] = {} - entities = y['entities'] - words = list(x['text']) - if len(entities) != 0: - for subject in entities: - tag = subject[0] - start = subject[1] - end = subject[2] - word = "".join(words[start:end + 1]) - if tag in json_d['label']: - if word in json_d['label'][tag]: - json_d['label'][tag][word].append([start, end]) + if args.task_name == 'cluener': + output_submit_file = os.path.join(pred_output_dir, prefix, "test_submit.json") + test_text = [] + with open(os.path.join(args.data_dir,"test.json"), 'r') as fr: + for line in fr: + test_text.append(json.loads(line)) + test_submit = [] + for x, y in zip(test_text, results): + json_d = {} + json_d['id'] = x['id'] + json_d['label'] = {} + entities = y['entities'] + words = list(x['text']) + if len(entities) != 0: + for subject in entities: + tag = subject[0] + start = subject[1] + end = subject[2] + word = "".join(words[start:end + 1]) + if tag in json_d['label']: + if word in json_d['label'][tag]: + json_d['label'][tag][word].append([start, end]) + else: + json_d['label'][tag][word] = [[start, end]] else: + json_d['label'][tag] = {} json_d['label'][tag][word] = [[start, end]] - else: - json_d['label'][tag] = {} - json_d['label'][tag][word] = [[start, end]] - test_submit.append(json_d) - json_to_text(output_submit_file,test_submit) + test_submit.append(json_d) + json_to_text(output_submit_file,test_submit) def load_and_cache_examples(args, task, tokenizer, data_type='train'): if args.local_rank not in 
[-1, 0] and not evaluate: @@ -304,7 +324,7 @@ def load_and_cache_examples(args, task, tokenizer, data_type='train'): cached_features_file = os.path.join(args.data_dir, 'cached_crf-{}_{}_{}_{}'.format( data_type, list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.train_max_seq_length if data_type=='train' else args.eval_max_seq_length), + str(args.train_max_seq_length if data_type == 'train' else args.eval_max_seq_length), str(task))) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) @@ -321,11 +341,11 @@ def load_and_cache_examples(args, task, tokenizer, data_type='train'): features = convert_examples_to_features(examples=examples, tokenizer=tokenizer, label_list=label_list, - max_seq_length=args.train_max_seq_length if data_type=='train' \ - else args.eval_max_seq_length, + max_seq_length=args.train_max_seq_length if data_type == 'train' \ + else args.eval_max_seq_length, cls_token_at_end=bool(args.model_type in ["xlnet"]), pad_on_left=bool(args.model_type in ['xlnet']), - cls_token = tokenizer.cls_token, + cls_token=tokenizer.cls_token, cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, sep_token=tokenizer.sep_token, # pad on the left for xlnet @@ -343,89 +363,20 @@ def load_and_cache_examples(args, task, tokenizer, data_type='train'): all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long) all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lens,all_label_ids) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lens, all_label_ids) return dataset -def main(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--data_dir",default=None,type=str,required=True, - help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",) - parser.add_argument("--model_type",default=None,type=str,required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),) - parser.add_argument("--model_name_or_path",default=None,type=str,required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),) - parser.add_argument("--output_dir",default=None,type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.", ) - - # Other parameters - parser.add_argument('--markup',default='bios',type=str,choices=['bios','bio']) - parser.add_argument( "--labels",default="",type=str, - help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",) - parser.add_argument( "--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name",default="",type=str, - help="Pretrained tokenizer name or path if not the same as model_name",) - parser.add_argument("--cache_dir",default="",type=str, - help="Where do you want to store the pre-trained models downloaded from s3", ) - parser.add_argument("--train_max_seq_length", default=128,type=int, - help="The maximum total input sequence length after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded.",) - parser.add_argument("--eval_max_seq_length",default=512,type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", ) - parser.add_argument("--do_train", action="store_true", help="Whether to run training.") - parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") - parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") - parser.add_argument("--evaluate_during_training",action="store_true", - help="Whether to run evaluation during training at each logging step.", ) - parser.add_argument("--do_lower_case", action="store_true", - help="Set this flag if you are using an uncased model.") - - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--gradient_accumulation_steps",type=int,default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.",) - parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument( "--max_steps", default=-1,type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.",) +def main(): + args = get_argparse().parse_args() - parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints",action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",) - parser.add_argument("--predict_checkpoints", type=int, default=0, - help="predict checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") - parser.add_argument("--overwrite_output_dir", action="store_true", - help="Overwrite the content of the output directory") - parser.add_argument("--overwrite_cache", action="store_true", - help="Overwrite the cached training and evaluation sets") - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument("--fp16",action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",) - parser.add_argument("--fp16_opt_level",type=str,default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html",) - parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") - parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") - parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") - args = parser.parse_args() if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) args.output_dir = args.output_dir + '{}'.format(args.model_type) if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) - init_logger(log_file=args.output_dir + '/{}-{}.log'.format(args.model_type, args.task_name)) + time_ = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()) + init_logger(log_file=args.output_dir + f'/{args.model_type}-{args.task_name}-{time_}.log') if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError( @@ -440,8 +391,8 @@ def main(): ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda:1" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = 1#torch.cuda.device_count() + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) @@ -449,8 +400,8 @@ def main(): args.n_gpu = 1 args.device = device logger.warning( - "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank,device,args.n_gpu, bool(args.local_rank != -1),args.fp16,) + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed seed_everything(args.seed) # Prepare NER task @@ -469,13 +420,12 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels,cache_dir=args.cache_dir if args.cache_dir else None,) + num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None,) - model = model_class.from_pretrained(args.model_name_or_path,from_tf=bool(".ckpt" in args.model_name_or_path), - config=config,cache_dir=args.cache_dir if args.cache_dir else None, - label2id=args.label2id,device=args.device) + cache_dir=args.cache_dir if args.cache_dir else None, ) + model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -483,7 +433,7 @@ def main(): logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: - train_dataset = load_and_cache_examples(args, args.task_name,tokenizer, data_type='train') + train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, 
data_type='train') global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() @@ -515,9 +465,9 @@ def main(): for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - model = model_class.from_pretrained(checkpoint,config=config,label2id=args.label2id, device=args.device) + model = model_class.from_pretrained(checkpoint, config=config) model.to(args.device) - result= evaluate(args, model, tokenizer, prefix=prefix) + result = evaluate(args, model, tokenizer, prefix=prefix) if global_step: result = {"{}_{}".format(global_step, k): v for k, v in result.items()} results.update(result) @@ -537,9 +487,10 @@ def main(): logger.info("Predict the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - model = model_class.from_pretrained(checkpoint,config=config,label2id=args.label2id, device=args.device) + model = model_class.from_pretrained(checkpoint, config=config) model.to(args.device) predict(args, model, tokenizer, prefix=prefix) + if __name__ == "__main__": main() diff --git a/pytorch_version/scripts/run_ner_crf.sh b/pytorch_version/scripts/run_ner_crf.sh new file mode 100755 index 00000000..1f9d6e88 --- /dev/null +++ b/pytorch_version/scripts/run_ner_crf.sh @@ -0,0 +1,26 @@ +CURRENT_DIR=`pwd` +export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base +export CLUE_DIR=$CURRENT_DIR/datasets +export OUTPUR_DIR=$CURRENT_DIR/outputs +TASK_NAME="cluener" +# +python run_ner_crf.py \ + --model_type=bert \ + --model_name_or_path=$BERT_BASE_DIR \ + --task_name=$TASK_NAME \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir=$CLUE_DIR/${TASK_NAME}/ \ + --train_max_seq_length=128 \ + --eval_max_seq_length=512 \ + --per_gpu_train_batch_size=24 \ + --per_gpu_eval_batch_size=24 \ + --learning_rate=3e-5 \ + --crf_learning_rate=1e-3 \ + --num_train_epochs=4.0 \ + --logging_steps=448 \ + --save_steps=448 \ + --output_dir=$OUTPUR_DIR/${TASK_NAME}_output/ \ + --overwrite_output_dir \ + --seed=42 diff --git a/pytorch_version/tools/__init__.py b/pytorch_version/tools/__init__.py old mode 100644 new mode 100755 diff --git a/pytorch_version/tools/common.py b/pytorch_version/tools/common.py old mode 100644 new mode 100755 diff --git a/pytorch_version/tools/download_clue_data.py b/pytorch_version/tools/download_clue_data.py old mode 100644 new mode 100755 index 7fccf789..cedc3088 --- a/pytorch_version/tools/download_clue_data.py +++ b/pytorch_version/tools/download_clue_data.py @@ -74,4 +74,8 @@ def main(arguments): download_and_extract(task, args.data_dir) if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) \ No newline at end of file + sys.exit(main(sys.argv[1:])) + +''' +python tools/download_clue_data.py --data_dir=./CLUEdatasets --tasks=cluener +''' \ No newline at end of file diff --git a/pytorch_version/tools/finetuning_argparse.py b/pytorch_version/tools/finetuning_argparse.py new file mode 100755 index 00000000..92d48c28 --- /dev/null +++ b/pytorch_version/tools/finetuning_argparse.py @@ -0,0 +1,96 @@ +import argparse + +def get_argparse(): + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument("--task_name", default=None, type=str, required=True,
+ help="The name of the task to train selected in the list: ") + parser.add_argument("--data_dir", default=None, type=str, required=True, + help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) + parser.add_argument("--model_type", default=None, type=str, required=True, + help="Model type selected in the list: ") + parser.add_argument("--model_name_or_path", default=None, type=str, required=True, + help="Path to pre-trained model or shortcut name selected in the list: " ) + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model predictions and checkpoints will be written.", ) + + # Other parameters + parser.add_argument('--markup', default='bios', type=str, + choices=['bios', 'bio']) + parser.add_argument('--loss_type', default='ce', type=str, + choices=['lsr', 'focal', 'ce']) + parser.add_argument("--config_name", default="", type=str, + help="Pretrained config name or path if not the same as model_name") + parser.add_argument("--tokenizer_name", default="", type=str, + help="Pretrained tokenizer name or path if not the same as model_name", ) + parser.add_argument("--cache_dir", default="", type=str, + help="Where do you want to store the pre-trained models downloaded from s3", ) + parser.add_argument("--train_max_seq_length", default=128, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + parser.add_argument("--eval_max_seq_length", default=512, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + parser.add_argument("--do_train", action="store_true", + help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", + help="Whether to run eval on the dev set.") + parser.add_argument("--do_predict", action="store_true", + help="Whether to run predictions on the test set.") + parser.add_argument("--evaluate_during_training", action="store_true", + help="Whether to run evaluation during training at each logging step.", ) + parser.add_argument("--do_lower_case", action="store_true", + help="Set this flag if you are using an uncased model.") + # adversarial training + parser.add_argument("--do_adv", action="store_true", + help="Whether to adversarial training.") + parser.add_argument('--adv_epsilon', default=1.0, type=float, + help="Epsilon for adversarial.") + parser.add_argument('--adv_name', default='word_embeddings', type=str, + help="name for adversarial layer.") + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for training.") + parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for evaluation.") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", ) + parser.add_argument("--learning_rate", default=5e-5, type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--crf_learning_rate", default=5e-5, type=float, + help="The initial learning rate for crf and linear layer.") + parser.add_argument("--weight_decay", default=0.01, type=float, + help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") + 
parser.add_argument("--max_grad_norm", default=1.0, type=float, + help="Max gradient norm.") + parser.add_argument("--num_train_epochs", default=3.0, type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--max_steps", default=-1, type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", ) + + parser.add_argument("--warmup_proportion", default=0.1, type=float, + help="Proportion of training to perform linear learning rate warmup for,E.g., 0.1 = 10% of training.") + parser.add_argument("--logging_steps", type=int, default=50, + help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument("--eval_all_checkpoints", action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) + parser.add_argument("--predict_checkpoints",type=int, default=0, + help="predict checkpoints starting with the same prefix as model_name ending and ending with step number") + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument("--overwrite_output_dir", action="store_true", + help="Overwrite the content of the output directory") + parser.add_argument("--overwrite_cache", action="store_true", + help="Overwrite the cached training and evaluation sets") + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + parser.add_argument("--fp16", action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) + parser.add_argument("--fp16_opt_level", type=str, default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") + return parser \ No newline at end of file diff --git a/pytorch_version/tools/plot.py b/pytorch_version/tools/plot.py new file mode 100755 index 00000000..08e34e04 --- /dev/null +++ b/pytorch_version/tools/plot.py @@ -0,0 +1,68 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.metrics import confusion_matrix +plt.switch_backend('agg') + +def plot_confusion_matrix(y_true, y_pred, classes, + save_path,normalize=False,title=None, + cmap=plt.cm.Blues): + """ + This function prints and plots the confusion matrix. + Normalization can be applied by setting `normalize=True`. 
+ """ + if not title: + if normalize: + title = 'Normalized confusion matrix' + else: + title = 'Confusion matrix, without normalization' + # Compute confusion matrix + cm = confusion_matrix(y_true=y_true, y_pred=y_pred) + # Only use the labels that appear in the data + if normalize: + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + print("Normalized confusion matrix") + else: + print('Confusion matrix, without normalization') + # --- plot--- # + plt.rcParams['savefig.dpi'] = 200 + plt.rcParams['figure.dpi'] = 200 + plt.rcParams['figure.figsize'] = [20, 20] # plot + plt.rcParams.update({'font.size': 10}) + fig, ax = plt.subplots() + im = ax.imshow(cm, interpolation='nearest', cmap=cmap) + # --- bar --- # + from mpl_toolkits.axes_grid1 import make_axes_locatable + divider = make_axes_locatable(ax) + cax = divider.append_axes("right", size="5%", pad=0.05) + plt.colorbar(im, cax=cax) + # --- bar --- # + # ax.figure.colorbar(im, ax=ax) + # We want to show all ticks... + ax.set(xticks=np.arange(cm.shape[1]), + yticks=np.arange(cm.shape[0]), + # ... and label them with the respective list entries + xticklabels=classes, yticklabels=classes, + title=title, + ylabel='True label', + xlabel='Predicted label') + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=45, ha="right", + rotation_mode="anchor") + # Loop over data dimensions and create text annotations. + fmt = '.2f' if normalize else 'd' + thresh = cm.max() / 2. + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(cm[i, j], fmt), + ha="center", va="center", + color="white" if cm[i, j] > thresh else "black") + fig.tight_layout() + plt.savefig(save_path) + +if __name__ == "__main__": + y_true = ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O'] + y_pred = ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O','B-PER', 'I-PER', 'O'] + classes = ['O','B-MISC', 'I-MISC','B-PER', 'I-PER'] + save_path = './ner_confusion_matrix.png' + plot_confusion_matrix(y_true,y_pred,classes,save_path) \ No newline at end of file