support ChineseGLUE (#217)
* machine reading comprehension
kinghuin authored and nepeplwu committed Nov 13, 2019
1 parent 8419f9d commit 271883b
Showing 29 changed files with 1,932 additions and 745 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -113,3 +113,8 @@ dmypy.json

# Pyre type checker
.pyre/

# pycharm
.DS_Store
.idea/
FETCH_HEAD
442 changes: 23 additions & 419 deletions demo/reading-comprehension/predict.py

Large diffs are not rendered by default.

43 changes: 30 additions & 13 deletions demo/reading-comprehension/reading_comprehension.py
@@ -31,28 +31,42 @@
parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=384, help="Number of words of the longest seqence.")
parser.add_argument("--null_score_diff_threshold", type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument("--n_best_size", type=int, default=20,help="The total number of n-best predictions to generate in the ""nbest_predictions.json output file.")
parser.add_argument("--max_answer_length", type=int, default=30,help="The maximum length of an answer that can be generated. This is needed ""because the start and end predictions are not conditioned on one another.")
parser.add_argument("--batch_size", type=int, default=8, help="Total examples' number in batch for training.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=True, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=True, help="Whether use data parallel.")
parser.add_argument("--version_2_with_negative", type=ast.literal_eval, default=False, help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.")
parser.add_argument("--use_pyreader", type=ast.literal_eval, default=False, help="Whether use pyreader to feed data.")
parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.")
parser.add_argument("--dataset", type=str, default="squad", help="Support squad, squad2.0, drcd and cmrc2018")
args = parser.parse_args()
# yapf: enable.

if __name__ == '__main__':
# Load Paddlehub bert_uncased_L-12_H-768_A-12 pretrained model
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
# Download dataset and use ReadingComprehensionReader to read dataset
if args.dataset == "squad":
dataset = hub.dataset.SQUAD(version_2_with_negative=False)
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
elif args.dataset == "squad2.0" or args.dataset == "squad2":
args.dataset = "squad2.0"
dataset = hub.dataset.SQUAD(version_2_with_negative=True)
module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
elif args.dataset == "drcd":
dataset = hub.dataset.DRCD()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
elif args.dataset == "cmrc2018":
dataset = hub.dataset.CMRC2018()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
else:
raise Exception(
"Only support datasets: squad, squad2.0, drcd and cmrc2018")

inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)

# Download dataset and use ReadingComprehensionReader to read dataset
dataset = hub.dataset.SQUAD(
version_2_with_negative=args.version_2_with_negative)

reader = hub.reader.ReadingComprehensionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_length=args.max_seq_len,
max_seq_len=args.max_seq_len,
doc_stride=128,
max_query_length=64)

@@ -76,9 +90,10 @@
# Setup running config for PaddleHub Finetune API
config = hub.RunConfig(
log_interval=10,
eval_interval=300,
save_ckpt_interval=10000,
use_pyreader=args.use_pyreader,
use_data_parallel=args.use_data_parallel,
save_ckpt_interval=1000,
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
batch_size=args.batch_size,
@@ -91,7 +106,9 @@
data_reader=reader,
feature=seq_output,
feed_list=feed_list,
config=config)
config=config,
sub_task=args.dataset,
)

# Finetune by PaddleHub's API
reading_comprehension_task.finetune()
reading_comprehension_task.finetune_and_eval()
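
Note on the change above: the new `--dataset` flag replaces the old `--version_2_with_negative` switch, and each dataset name now selects both a PaddleHub dataset and a matching pretrained module (English BERT for SQuAD, Chinese RoBERTa-wwm-ext for DRCD and CMRC2018). A minimal, table-driven sketch of that dispatch, assuming only the dataset classes added in this commit (the helper name `load_rc_task` is illustrative, not part of the commit):

```python
import paddlehub as hub

# Mapping taken from the if/elif chain in reading_comprehension.py above.
_RC_CONFIGS = {
    "squad": (lambda: hub.dataset.SQUAD(version_2_with_negative=False),
              "bert_uncased_L-12_H-768_A-12"),
    "squad2.0": (lambda: hub.dataset.SQUAD(version_2_with_negative=True),
                 "bert_uncased_L-12_H-768_A-12"),
    "drcd": (hub.dataset.DRCD, "roberta_wwm_ext_chinese_L-24_H-1024_A-16"),
    "cmrc2018": (hub.dataset.CMRC2018, "roberta_wwm_ext_chinese_L-24_H-1024_A-16"),
}

def load_rc_task(name):
    """Return (dataset, module) for a supported reading-comprehension dataset."""
    try:
        make_dataset, module_name = _RC_CONFIGS[name]
    except KeyError:
        raise ValueError("Only squad, squad2.0, drcd and cmrc2018 are supported")
    return make_dataset(), hub.Module(name=module_name)
```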
20 changes: 12 additions & 8 deletions demo/reading-comprehension/run_finetune.sh
@@ -1,15 +1,19 @@
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0,1

# Recommended hyperparameters for different tasks
# squad: batch_size=8, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5
# squad2.0: batch_size=8, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5
# cmrc2018: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=2.5e-5
# drcd: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=2.5e-5

dataset=cmrc2018
python -u reading_comprehension.py \
--batch_size=12 \
--batch_size=8 \
--use_gpu=True \
--checkpoint_dir="./ckpt_rc" \
--learning_rate=3e-5 \
--checkpoint_dir=./ckpt_${dataset} \
--learning_rate=2.5e-5 \
--weight_decay=0.01 \
--warmup_proportion=0.1 \
--num_epoch=2 \
--max_seq_len=384 \
--use_pyreader=True \
--use_data_parallel=True \
--version_2_with_negative=False
--max_seq_len=512 \
--dataset=${dataset}
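
To run more than one of the recommended settings above without editing the shell variable by hand, a small driver sketch (the `RECOMMENDED` table and the loop are illustrative, with values copied from the recommendation comments in this script):

```python
import subprocess

# Values copied from the recommendation comments in run_finetune.sh above.
RECOMMENDED = {
    "cmrc2018": {"batch_size": 8, "learning_rate": 2.5e-5, "weight_decay": 0,
                 "num_epoch": 2, "max_seq_len": 512},
    "drcd": {"batch_size": 8, "learning_rate": 2.5e-5, "weight_decay": 0,
             "num_epoch": 2, "max_seq_len": 512},
}

for dataset, hp in RECOMMENDED.items():
    cmd = ["python", "-u", "reading_comprehension.py",
           "--use_gpu=True",
           "--warmup_proportion=0.1",
           "--checkpoint_dir=./ckpt_%s" % dataset,
           "--dataset=%s" % dataset]
    cmd += ["--%s=%s" % (key, value) for key, value in hp.items()]
    subprocess.run(cmd, check=True)
```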
18 changes: 7 additions & 11 deletions demo/reading-comprehension/run_predict.sh
@@ -1,21 +1,17 @@
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0

CKPT_DIR="./ckpt_rc"
RES_DIR="./result"

mkdir $RES_DIR
CKPT_DIR="./ckpt_cmrc2018"
dataset=cmrc2018

python -u predict.py \
--batch_size=12 \
--batch_size=8 \
--use_gpu=True \
--dataset=${dataset} \
--checkpoint_dir=${CKPT_DIR} \
--learning_rate=3e-5 \
--learning_rate=2.5e-5 \
--weight_decay=0.01 \
--warmup_proportion=0.1 \
--num_epoch=1 \
--max_seq_len=384 \
--max_seq_len=512 \
--use_pyreader=False \
--use_data_parallel=False \
--version_2_with_negative=False \
--result_dir=${RES_DIR}
--use_data_parallel=False
1 change: 0 additions & 1 deletion demo/sequence-labeling/run_sequence_label.sh
@@ -1,5 +1,4 @@
export FLAGS_eager_delete_tensor_gb=0.0
export CUDA_VISIBLE_DEVICES=0

CKPT_DIR="./ckpt_sequence_label"
python -u sequence_label.py \
5 changes: 4 additions & 1 deletion demo/sequence-labeling/sequence_label.py
@@ -37,7 +37,7 @@

if __name__ == '__main__':
# Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)

@@ -69,6 +69,9 @@

# Setup running config for PaddleHub Finetune API
config = hub.RunConfig(
log_interval=10,
eval_interval=300,
save_ckpt_interval=10000,
use_data_parallel=args.use_data_parallel,
use_pyreader=args.use_pyreader,
use_cuda=args.use_gpu,
13 changes: 9 additions & 4 deletions demo/text-classification/README.md
@@ -4,19 +4,24 @@
Classification tasks fall into two main categories:

* **Single-sentence classification**
- ChnSentiCorp
- ChineseGLUE-IFLYTEK
- ChineseGLUE-THUCNEWS
- GLUE-Cola
- GLUE-SST2

- ChnSentiCorp
* **Sentence-pair classification**
- LCQMC
- NLPCC-DBQA
- ChineseGLUE-LCQMC
- ChineseGLUE-INEWS
- ChineseGLUE-TNEWS
  - ChineseGLUE-BQ
- ChineseGLUE-XNLI_zh
- GLUE-MNLI
- GLUE-QQP
- GLUE-QNLI
- GLUE-STS-B
- GLUE-MRPC
- GLUE-RTE
- NLPCC-DBQA
- XNLI

## How to Start Finetuning
42 changes: 27 additions & 15 deletions demo/text-classification/run_classifier.sh
@@ -5,11 +5,36 @@ export CUDA_VISIBLE_DEVICES=0
DATASET="chnsenticorp"
CKPT_DIR="./ckpt_${DATASET}"

python -u text_classifier.py \
--batch_size=24 \
--use_gpu=True \
--dataset=${DATASET} \
--checkpoint_dir=${CKPT_DIR} \
--learning_rate=5e-5 \
--weight_decay=0.01 \
--max_seq_len=128 \
--num_epoch=3 \
--use_pyreader=True \
--use_data_parallel=True \
--use_taskid=False

# Recommended hyperparameters for different tasks
# for ChineseGLUE:
# TNews: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# XNLI_zh: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5
# INEWS: batch_size=4, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5
# DRCD: see demo: reading-comprehension
# CMRC2018: see demo: reading-comprehension
# BQ: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=100, lr=1e-5
# MSRANER: see demo: sequence-labeling
# THUCNEWS: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=5e-5
# IFLYTEKDATA: batch_size=16, weight_decay=0, num_epoch=5, max_seq_len=256, lr=1e-5

# for other tasks:
# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5
# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5
# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5
# TNews: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# QQP: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# QNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# SST-2: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
@@ -22,23 +47,10 @@ CKPT_DIR="./ckpt_${DATASET}"
# mnli_mm: dev and test in mismatched dataset.
# The difference can be seen in https://www.nyu.edu/projects/bowman/multinli/paper.pdf.
# If you are not sure which one to pick, just use mnli or mnli_m.
# XNLI: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5
# XNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5
# Specify the language with an underscore like xnli_zh.
# ar- Arabic bg- Bulgarian de- German
# el- Greek en- English es- Spanish
# fr- French hi- Hindi ru- Russian
# sw- Swahili th- Thai tr- Turkish
# ur- Urdu vi- Vietnamese zh- Chinese (Simplified)

python -u text_classifier.py \
--batch_size=24 \
--use_gpu=True \
--dataset=${DATASET} \
--checkpoint_dir=${CKPT_DIR} \
--learning_rate=5e-5 \
--weight_decay=0.01 \
--max_seq_len=128 \
--num_epoch=3 \
--use_pyreader=True \
--use_data_parallel=True \
--use_taskid=False \
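
As the comments above note, XNLI is picked with a language suffix (e.g. `xnli_zh`); a minimal sketch of how that suffix reaches the dataset constructor, mirroring the dispatch in the text_classifier.py diff below (the `load_xnli` helper is illustrative):

```python
import paddlehub as hub

def load_xnli(dataset_name):
    # "xnli_zh" -> language code "zh"; same slicing as text_classifier.py.
    assert dataset_name.lower().startswith("xnli")
    language = dataset_name.lower()[-2:]
    return hub.dataset.XNLI(language=language)

dataset = load_xnli("xnli_zh")  # Simplified Chinese portion of XNLI
```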
28 changes: 22 additions & 6 deletions demo/text-classification/text_classifier.py
@@ -43,20 +43,36 @@
# Download dataset and use ClassifyReader to read dataset
if args.dataset.lower() == "chnsenticorp":
dataset = hub.dataset.ChnSentiCorp()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
elif args.dataset.lower() == "tnews":
dataset = hub.dataset.TNews()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == "nlpcc_dbqa":
dataset = hub.dataset.NLPCC_DBQA()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
elif args.dataset.lower() == "lcqmc":
dataset = hub.dataset.LCQMC()
module = hub.Module(name="ernie")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
elif args.dataset.lower() == 'inews':
dataset = hub.dataset.INews()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == 'bq':
dataset = hub.dataset.BQ()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == 'thucnews':
dataset = hub.dataset.THUCNEWS()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == 'iflytek':
dataset = hub.dataset.IFLYTEK()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc", "f1"]
elif args.dataset.lower() == "mrpc":
dataset = hub.dataset.GLUE("MRPC")
if args.use_taskid:
@@ -116,7 +132,7 @@
metrics_choices = ["acc"]
elif args.dataset.lower().startswith("xnli"):
dataset = hub.dataset.XNLI(language=args.dataset.lower()[-2:])
module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
metrics_choices = ["acc"]
else:
raise ValueError("%s dataset is not defined" % args.dataset)
@@ -140,7 +156,7 @@
pooled_output = outputs["pooled_output"]

# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
# Must feed all the tensors the module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
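
The new ChineseGLUE branches above all follow the same pattern: a dataset class, the `roberta_wwm_ext_chinese_L-24_H-1024_A-16` module, and a metrics list. A table-driven sketch of that mapping, with entries taken from this diff (the registry itself is only an illustration, not part of the commit):

```python
import paddlehub as hub

CHINESE_MODULE = "roberta_wwm_ext_chinese_L-24_H-1024_A-16"

# dataset name -> (dataset factory, metrics_choices), as in text_classifier.py
CLASSIFY_TASKS = {
    "chnsenticorp": (hub.dataset.ChnSentiCorp, ["acc"]),
    "lcqmc": (hub.dataset.LCQMC, ["acc"]),
    "tnews": (hub.dataset.TNews, ["acc", "f1"]),
    "inews": (hub.dataset.INews, ["acc", "f1"]),
    "bq": (hub.dataset.BQ, ["acc", "f1"]),
    "thucnews": (hub.dataset.THUCNEWS, ["acc", "f1"]),
    "iflytek": (hub.dataset.IFLYTEK, ["acc", "f1"]),
}

def load_classify_task(name):
    factory, metrics_choices = CLASSIFY_TASKS[name.lower()]
    return factory(), hub.Module(name=CHINESE_MODULE), metrics_choices
```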
6 changes: 6 additions & 0 deletions paddlehub/dataset/__init__.py
@@ -24,6 +24,12 @@
from .xnli import XNLI
from .glue import GLUE
from .tnews import TNews
from .inews import INews
from .drcd import DRCD
from .cmrc2018 import CMRC2018
from .bq import BQ
from .iflytek import IFLYTEK
from .thucnews import THUCNEWS

# CV Dataset
from .dogcat import DogCatDataset as DogCat
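
With these imports in place, the new ChineseGLUE datasets are reachable directly under `hub.dataset`. A quick example (that instantiation downloads the data on first use is assumed from the demo scripts in this commit):

```python
import paddlehub as hub

cmrc = hub.dataset.CMRC2018()    # Chinese machine reading comprehension
bq = hub.dataset.BQ()            # sentence-pair classification
iflytek = hub.dataset.IFLYTEK()  # single-sentence classification
```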