# finetuning.py
"""Script for fine-tuning Pegasus
Example usage:
# use arxiv dataset as example, with first 1000 docs as training data
from datasets import load_dataset
dataset = load_dataset("xsum")
train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
# use Pegasus Large model as base for fine-tuning
model_name = 'google/pegasus-large'
train_dataset, _, _ = prepare_data(model_name, train_texts, train_labels)
trainer = prepare_fine_tuning(model_name, train_dataset)
trainer.train()
Reference:
https://huggingface.co/transformers/master/custom_datasets.html
"""
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch


class PegasusDataset(torch.utils.data.Dataset):
    """Torch dataset wrapping tokenized source documents (encodings) and target summaries (labels)."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])  # target token ids for this example
        return item

    def __len__(self):
        # count examples, not keys: len() on the labels BatchEncoding itself would
        # return the number of fields (input_ids, attention_mask), not the dataset size
        return len(self.labels['input_ids'])
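
# A minimal illustrative sketch (not part of the original script) of what PegasusDataset
# yields, assuming the standard PegasusTokenizer output fields 'input_ids' and
# 'attention_mask'; the example strings are made up:
#
#   tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
#   enc = tokenizer(["some document text"], truncation=True, padding=True)
#   dec = tokenizer(["its reference summary"], truncation=True, padding=True)
#   ds = PegasusDataset(enc, dec)
#   ds[0]  # -> {'input_ids': tensor(...), 'attention_mask': tensor(...), 'labels': tensor(...)}
#   len(ds)  # -> 1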


def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 test_texts=None, test_labels=None):
    """
    Prepare input data for model fine-tuning
    """
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    prepare_val = val_texts is not None and val_labels is not None
    prepare_test = test_texts is not None and test_labels is not None

    def tokenize_data(texts, labels):
        encodings = tokenizer(texts, truncation=True, padding=True)
        decodings = tokenizer(labels, truncation=True, padding=True)
        dataset_tokenized = PegasusDataset(encodings, decodings)
        return dataset_tokenized

    train_dataset = tokenize_data(train_texts, train_labels)
    val_dataset = tokenize_data(val_texts, val_labels) if prepare_val else None
    test_dataset = tokenize_data(test_texts, test_labels) if prepare_test else None

    return train_dataset, val_dataset, test_dataset
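
# A hedged usage sketch (not from the original script): preparing a validation split as
# well, assuming val_texts and val_labels are lists of strings shaped like the training data.
#
#   train_dataset, val_dataset, _ = prepare_data(
#       model_name,
#       train_texts, train_labels,
#       val_texts=val_texts, val_labels=val_labels,
#   )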


def prepare_fine_tuning(model_name, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """
    Prepare configurations and base model for fine-tuning
    """
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

    if freeze_encoder:
        for param in model.model.encoder.parameters():
            param.requires_grad = False

    if val_dataset is not None:
        training_args = TrainingArguments(
            output_dir=output_dir,           # output directory
            num_train_epochs=10,             # total number of training epochs
            per_device_train_batch_size=1,   # batch size per device during training; can be increased if memory allows
            per_device_eval_batch_size=1,    # batch size for evaluation; can be increased if memory allows
            save_steps=500,                  # number of update steps between checkpoint saves
            save_total_limit=5,              # limit the total number of checkpoints; older checkpoints are deleted
            evaluation_strategy='steps',     # evaluation strategy to adopt during training
            eval_steps=100,                  # number of update steps between evaluations
            warmup_steps=500,                # number of warmup steps for the learning rate scheduler
            weight_decay=0.01,               # strength of weight decay
            logging_dir='./logs',            # directory for storing logs
            logging_steps=10,
        )

        trainer = Trainer(
            model=model,                     # the instantiated 🤗 Transformers model to be trained
            args=training_args,              # training arguments, defined above
            train_dataset=train_dataset,     # training dataset
            eval_dataset=val_dataset,        # evaluation dataset
        )

    else:
        training_args = TrainingArguments(
            output_dir=output_dir,           # output directory
            num_train_epochs=10,             # total number of training epochs
            per_device_train_batch_size=1,   # batch size per device during training; can be increased if memory allows
            save_steps=500,                  # number of update steps between checkpoint saves
            save_total_limit=5,              # limit the total number of checkpoints; older checkpoints are deleted
            warmup_steps=500,                # number of warmup steps for the learning rate scheduler
            weight_decay=0.01,               # strength of weight decay
            logging_dir='./logs',            # directory for storing logs
            logging_steps=10,
        )

        trainer = Trainer(
            model=model,                     # the instantiated 🤗 Transformers model to be trained
            args=training_args,              # training arguments, defined above
            train_dataset=train_dataset,     # training dataset
        )

    return trainer
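
# A minimal sketch (not part of the original script), assuming a val_dataset prepared as
# above: fine-tune with the encoder frozen and save the result. Trainer.save_model() and
# the tokenizer's save_pretrained() are standard 🤗 Transformers methods; the path
# './pegasus-finetuned' is purely illustrative.
#
#   trainer = prepare_fine_tuning(model_name, train_dataset,
#                                 val_dataset=val_dataset, freeze_encoder=True)
#   trainer.train()
#   trainer.save_model('./pegasus-finetuned')
#   PegasusTokenizer.from_pretrained(model_name).save_pretrained('./pegasus-finetuned')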


if __name__ == '__main__':
    # use the arxiv dataset as an example, with the first 100 docs as training data
    from datasets import load_dataset
    dataset = load_dataset("arxiv_dataset", data_dir='training_data_v3.json')
    train_texts, train_labels = dataset['train']['document'][:100], dataset['train']['summary'][:100]

    # use the Pegasus Large model as the base for fine-tuning
    model_name = 'google/pegasus-large'
    train_dataset, _, _ = prepare_data(model_name, train_texts, train_labels)
    trainer = prepare_fine_tuning(model_name, train_dataset)
    trainer.train()
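
    # A hedged inference sketch (an assumption, not part of the original script): generate
    # a summary with the fine-tuned model via the standard generate()/batch_decode() API.
    #
    #   tokenizer = PegasusTokenizer.from_pretrained(model_name)
    #   batch = tokenizer(train_texts[:1], truncation=True, padding=True,
    #                     return_tensors='pt').to(trainer.model.device)
    #   summary_ids = trainer.model.generate(**batch)
    #   print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))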