Commit cae32a2 (1 parent: 7d63119)
Showing 6 changed files with 1,135 additions and 0 deletions.
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

# Task models (A, B and C)
python tweeter_2007_taskA.py
python tweeter_2017_taskB.py
python tweeter_2017_taskC.py

# Baseline models (A, B and C)
python baseline_taskA.py
python baseline_taskB.py
python baseline_taskC.py
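
The task A script shown later in this commit reads its splits from data/; a small pre-flight check (a sketch, not part of this commit; file names taken from that script) could confirm those files exist before launching the runs:

from pathlib import Path

# Hypothetical sanity check: these are the files read by the task A script below.
expected = [
    "data/twitter-2016train-A.txt",
    "data/twitter-2016devtest-A.txt",
    "data/twitter-2016test-A.txt",
]
missing = [p for p in expected if not Path(p).exists()]
if missing:
    raise SystemExit(f"Missing data files: {missing}")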
@@ -0,0 +1,18 @@
#!/bin/bash

echo "****************** Installing packages ******************"

pip3 install random2
pip3 install numpy
pip3 install pandas
pip3 install tqdm
# Install the CUDA 11.1 builds of PyTorch directly; a separate unpinned
# `pip3 install torch` is unnecessary because this pinned install replaces it.
pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
pip3 install transformers
pip3 install python-time
pip3 install matplotlib
pip3 install seaborn
pip3 install scikit-learn
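
After the installs finish, a quick sanity check (not part of this script) can confirm that the CUDA build of PyTorch is the one actually installed:

import torch

print(torch.__version__)          # expected to report 1.8.1+cu111
print(torch.cuda.is_available())  # True when the CUDA 11.1 build matches the local driver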
@@ -0,0 +1,367 @@
# Imports
import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer

# Device selection: use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load the dataset (tab-separated files with id, label and tweet columns).
train_data_df = pd.read_csv("data/twitter-2016train-A.txt", delimiter='\t', header=None, names=['id', 'label', 'tweet'])
val_data_df = pd.read_csv("data/twitter-2016devtest-A.txt", delimiter='\t', header=None, names=['id', 'label', 'tweet'])

train_tweet = train_data_df.tweet.values
y_train = train_data_df.label.values

val_tweet = val_data_df.tweet.values
y_val = val_data_df.label.values

# Convert string classes into numeric classes.
train_labels = []
val_labels = []
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}

for label in y_train:
    train_labels.append(label_dict[label])

for label in y_val:
    val_labels.append(label_dict[label])

# Print the number of training and validation examples.
print(len(train_labels))
print(len(val_labels))

# Data processing: tokenize each tweet and build input IDs and attention masks.
def processdata(tweets, labels):
    input_ids = []
    attention_masks = []
    for tweet in tweets:
        encoded_dict = tokenizer.encode_plus(
            tweet,                       # Sentence to encode.
            add_special_tokens=True,     # Add '[CLS]' and '[SEP]'.
            max_length=64,               # Pad and truncate all sentences to 64 tokens.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,  # Construct attention masks.
            return_tensors='pt',         # Return PyTorch tensors.
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

# Process train and validation data.
train_input_ids, train_attention_masks, train_labels = processdata(train_tweet, train_labels)
val_input_ids, val_attention_masks, val_labels = processdata(val_tweet, val_labels)

# Wrap the tensors in TensorDatasets.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Create the DataLoaders for the training and validation sets
# (DataLoader, RandomSampler and SequentialSampler are already imported above).
batch_size = 32

train_dataloader = DataLoader(
    train_dataset,                         # The training samples.
    sampler=RandomSampler(train_dataset),  # Select batches randomly.
    batch_size=batch_size                  # Train with this batch size.
)

# For validation the order doesn't matter, so the samples are read sequentially.
validation_dataloader = DataLoader(
    val_dataset,                            # The validation samples.
    sampler=SequentialSampler(val_dataset), # Pull out batches sequentially.
    batch_size=batch_size                   # Evaluate with this batch size.
)

# Load the BERT model.
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification: the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,                # Three output labels: negative, neutral, positive.
    output_attentions=False,     # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden states.
)

# Move the model to the selected device (GPU when available, otherwise CPU).
model.to(device)

# Define the optimizer.
optimizer = AdamW(model.parameters(),
                  lr=2e-5,  # Default learning rate is 5e-5; 2e-5 is used here.
                  eps=1e-8  # Default adam_epsilon.
                  )

# Epochs and learning-rate scheduler.
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4.
EPOCHS = 4

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples.)
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py.
                                            num_training_steps=total_steps)

# Accuracy function: fraction of predictions that match the gold labels.
def accuracy(y_pred, y_test):
    acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
    return acc

# Timing helpers.
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Train the model.
training_stats = []

def train(model, train_loader, val_loader, optimizer, scheduler):
    total_step = len(train_loader)

    for epoch in range(EPOCHS):
        # Measure how long the training epoch takes.
        train_start = time.time()
        model.train()

        # Reset the total loss and accuracy for this epoch.
        total_train_loss = 0
        total_train_acc = 0
        for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(train_loader):

            # Unpack this training batch from the dataloader and move it to the device.
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            labels = y.to(device)

            # Clear any previously calculated gradients before performing a backward pass.
            optimizer.zero_grad()

            # Get the loss and logits. The model returns a dict-like output whose
            # first two values are the loss and the logits, unpacked here via .values().
            loss, prediction = model(pair_token_ids,
                                     token_type_ids=None,
                                     attention_mask=mask_ids,
                                     labels=labels).values()

            acc = accuracy(prediction, labels)

            # Accumulate the training loss and accuracy over all of the batches so that
            # the averages can be computed at the end of the epoch.
            total_train_loss += loss.item()
            total_train_acc += acc.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0
            # to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average accuracy and loss over all of the batches.
        train_acc = total_train_acc / len(train_loader)
        train_loss = total_train_loss / len(train_loader)
        train_end = time.time()

        # Put the model in evaluation mode.
        model.eval()

        total_val_acc = 0
        total_val_loss = 0
        val_start = time.time()
        with torch.no_grad():
            for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(val_loader):

                # Unpack this validation batch from the dataloader and move it to the device.
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                labels = y.to(device)

                # Get the loss and logits (no backward pass is performed during validation).
                loss, prediction = model(pair_token_ids,
                                         token_type_ids=None,
                                         attention_mask=mask_ids,
                                         labels=labels).values()

                # Calculate the accuracy for this batch.
                acc = accuracy(prediction, labels)

                # Accumulate the validation loss and accuracy.
                total_val_loss += loss.item()
                total_val_acc += acc.item()

        # Calculate the average accuracy and loss over all of the batches.
        val_acc = total_val_acc / len(val_loader)
        val_loss = total_val_loss / len(val_loader)

        val_end = time.time()
        hours, rem = divmod(val_end - train_start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': train_loss,
                'Valid. Loss': val_loss,
                'Valid. Accur.': val_acc,
                'Training Time': train_end - train_start,
                'Validation Time': val_end - val_start
            }
        )

train(model, train_dataloader, validation_dataloader, optimizer, scheduler)

# Training and validation loss visualization.
import matplotlib.pyplot as plt
import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Collect the per-epoch statistics into a DataFrame indexed by epoch.
df_stats = pd.DataFrame(training_stats).set_index('epoch')

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()

# Load the test dataset into a pandas DataFrame.
df = pd.read_csv("data/twitter-2016test-A.txt", delimiter='\t', header=None, names=['label', 'tweet', 'id'])

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists.
test_tweet = df.tweet.values
y_test = df.label.values

input_ids = []
attention_masks = []

labels = []

for label in y_test:
    labels.append(label_dict[label])

# Process the test data.
input_ids, attention_masks, labels = processdata(test_tweet, labels)

# Set the batch size.
batch_size = 32

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Test accuracy, F1, precision and recall.
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def test(model, prediction_dataloader):

    total_test_acc = 0
    total_F1_Score = 0
    total_precision = 0
    total_recall = 0

    # Put the model in evaluation mode and disable gradient tracking for inference.
    model.eval()
    with torch.no_grad():
        for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(prediction_dataloader):
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            labels = y.to(device)

            loss, prediction = model(pair_token_ids,
                                     token_type_ids=None,
                                     attention_mask=mask_ids,
                                     labels=labels).values()

            acc = accuracy(prediction, labels)

            # Macro-averaged F1, precision and recall over the three classes for this batch.
            f1 = metrics.f1_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')
            precision = precision_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')
            recall = recall_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')

            total_test_acc += acc.item()
            total_F1_Score += f1
            total_precision += precision
            total_recall += recall

    # Average the metrics over all batches.
    test_acc = total_test_acc / len(prediction_dataloader)
    test_f1 = total_F1_Score / len(prediction_dataloader)
    test_precision = total_precision / len(prediction_dataloader)
    test_recall = total_recall / len(prediction_dataloader)

    print(f'test_acc: {test_acc:.4f}')
    print(f'f1 Score: {test_f1:.4f}')
    print(f'precision: {test_precision:.4f}')
    print(f'recall: {test_recall:.4f}')

test(model, prediction_dataloader)
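
The script reports test metrics but does not persist the fine-tuned weights; a minimal sketch (not part of this commit; the output directory name is arbitrary) for saving the model and tokenizer afterwards:

output_dir = "./taskA_bert_model/"     # hypothetical output directory
model.save_pretrained(output_dir)      # writes the model weights and config
tokenizer.save_pretrained(output_dir)  # writes the vocabulary and tokenizer config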