Commit cae32a2

Add files via upload

rupakdas18 authored May 7, 2021
1 parent 7d63119 commit cae32a2
Showing 6 changed files with 1,135 additions and 0 deletions.
11 changes: 11 additions & 0 deletions EXPERIMENT.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash


python twitter_2017_taskA.py
python twitter_2017_taskB.py
python twitter_2017_taskC.py


python baseline_taskA.py
python baseline_taskB.py
python baseline_taskC.py
Binary file added Experimental Result.PNG
18 changes: 18 additions & 0 deletions INSTALL.sh
@@ -0,0 +1,18 @@
#!/bin/bash


echo "****************** Installing packages ******************"


pip3 install matplotlib
pip3 install random2
pip3 install numpy
pip3 install pandas
pip3 install tqdm
pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
pip3 install transformers
pip3 install python-time
pip3 install seaborn
pip3 install scikit-learn
367 changes: 367 additions & 0 deletions twitter_2017_taskA.py
@@ -0,0 +1,367 @@
# Import packages
import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer

#Device Selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


# Load Dataset
train_data_df = pd.read_csv("data/twitter-2016train-A.txt", delimiter='\t', header=None, names=['id', 'label', 'tweet'])
val_data_df = pd.read_csv("data/twitter-2016devtest-A.txt", delimiter='\t', header=None, names=['id', 'label', 'tweet'])


train_tweet = train_data_df.tweet.values
y_train = train_data_df.label.values

val_tweet = val_data_df.tweet.values
y_val = val_data_df.label.values


#Convert string classes into numeric classes
train_labels=[]
val_labels=[]
label_dict = {'negative':0, 'neutral':1, 'positive':2}

for label in y_train:
    train_labels.append(label_dict[label])

for label in y_val:
    val_labels.append(label_dict[label])


# Print the number of training and validation labels
print(len(train_labels))
print(len(val_labels))


# Data processing: tokenize tweets and build input ids / attention masks
def processdata(tweets, labels):
    input_ids = []
    attention_masks = []
    for tweet in tweets:
        encoded_dict = tokenizer.encode_plus(
            tweet,                        # Sentence to encode.
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=64,                # Pad & truncate all sentences.
            pad_to_max_length=True,
            return_attention_mask=True,   # Construct attention masks.
            return_tensors='pt',          # Return PyTorch tensors.
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

#Process train and validation data
train_input_ids,train_attention_masks,train_labels = processdata(train_tweet,train_labels)
val_input_ids,val_attention_masks,val_labels = processdata(val_tweet,val_labels)

# Convert into Tensordata
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Create dataloader

batch_size = 32

# Create the DataLoaders for our training and validation sets.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler=RandomSampler(train_dataset),    # Select batches randomly.
    batch_size=batch_size                    # Train with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                             # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size                    # Evaluate with this batch size.
)
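
# Optional sanity check (a minimal sketch, not required by the rest of the script):
# confirm that one training batch has the expected tensor shapes before fine-tuning starts.
sample_ids, sample_masks, sample_labels = next(iter(train_dataloader))
print(sample_ids.shape, sample_masks.shape, sample_labels.shape)  # expected: [32, 64], [32, 64], [32]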


# Load BERT Model
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,                # The number of output labels = 3.
    output_attentions=False,     # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Move the model to the selected device (GPU if available, otherwise CPU).
model.to(device)


# Define the optimizer
optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5.
                  eps=1e-8   # args.adam_epsilon - default is 1e-8.
                  )

# Epochs and scheduler
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4.
EPOCHS = 4

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

# Accuracy function
def accuracy(y_pred, y_test):
    acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
    return acc


# Time
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Train the Model
training_stats = []
def train(model, train_loader, val_loader, optimizer, scheduler):
    total_step = len(train_loader)

    for epoch in range(EPOCHS):
        # Measure how long the training epoch takes.
        train_start = time.time()
        model.train()

        # Reset the total loss and accuracy for this epoch.
        total_train_loss = 0
        total_train_acc = 0
        for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(train_loader):

            # Unpack this training batch from our dataloader.
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            labels = y.to(device)

            # Clear any previously calculated gradients before performing a backward pass.
            optimizer.zero_grad()

            # Get the loss and prediction.
            loss, prediction = model(pair_token_ids,
                                     token_type_ids=None,
                                     attention_mask=mask_ids,
                                     labels=labels).values()

            acc = accuracy(prediction, labels)

            # Accumulate the training loss and accuracy over all of the batches so that we can
            # calculate the average loss at the end.
            total_train_loss += loss.item()
            total_train_acc += acc.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average accuracy and loss over all of the batches.
        train_acc = total_train_acc / len(train_loader)
        train_loss = total_train_loss / len(train_loader)
        train_end = time.time()

        # Put the model in evaluation mode.
        model.eval()

        total_val_acc = 0
        total_val_loss = 0
        val_start = time.time()
        with torch.no_grad():
            for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(val_loader):

                # Unpack this validation batch from our dataloader.
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                labels = y.to(device)

                # Get the loss and prediction (no gradients are computed here).
                loss, prediction = model(pair_token_ids,
                                         token_type_ids=None,
                                         attention_mask=mask_ids,
                                         labels=labels).values()

                # Calculate the accuracy for this batch.
                acc = accuracy(prediction, labels)

                # Accumulate the validation loss and accuracy.
                total_val_loss += loss.item()
                total_val_acc += acc.item()

        # Calculate the average accuracy and loss over all of the batches.
        val_acc = total_val_acc / len(val_loader)
        val_loss = total_val_loss / len(val_loader)

        val_end = time.time()
        hours, rem = divmod(val_end - train_start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': train_loss,
                'Valid. Loss': val_loss,
                'Valid. Accur.': val_acc,
                'Training Time': train_end - train_start,
                'Validation Time': val_end - val_start
            }
        )

train(model, train_dataloader, validation_dataloader, optimizer,scheduler)
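
# Optional (a hedged sketch, not invoked by EXPERIMENT.sh): persist the fine-tuned model and
# tokenizer with the standard transformers save_pretrained API so the test step can be rerun
# without retraining. The output directory name below is only an illustrative placeholder.
import os
output_dir = './bert_taskA_model'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)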


# Training and validation loss visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Collect the per-epoch statistics and plot the learning curve.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()



# Load the Test dataset into a pandas dataframe.
df = pd.read_csv("data/twitter-2016test-A.txt", delimiter='\t', header=None, names=['label', 'tweet','id'])

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists

test_tweet = df.tweet.values
y_test = df.label.values

input_ids = []
attention_masks = []

labels=[]

for label in y_test:
    labels.append(label_dict[label])

# Process test data
input_ids,attention_masks,labels = processdata(test_tweet,labels)

# Set the batch size.
batch_size = 32

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


# Test the accuracy
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def test(model, prediction_dataloader):

    total_test_acc = 0
    total_F1_Score = 0
    total_precision = 0
    total_recall = 0

    # Put the model in evaluation mode and disable gradient tracking for testing.
    model.eval()
    with torch.no_grad():
        for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(prediction_dataloader):
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            labels = y.to(device)

            loss, prediction = model(pair_token_ids,
                                     token_type_ids=None,
                                     attention_mask=mask_ids,
                                     labels=labels).values()

            acc = accuracy(prediction, labels)

            # Per-batch macro-averaged metrics.
            f1 = metrics.f1_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')
            precision = precision_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')
            recall = recall_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')

            total_test_acc += acc.item()
            total_F1_Score += f1
            total_precision += precision
            total_recall += recall

    # Average the per-batch metrics over all test batches.
    test_acc = total_test_acc / len(prediction_dataloader)
    test_f1 = total_F1_Score / len(prediction_dataloader)
    test_precision = total_precision / len(prediction_dataloader)
    test_recall = total_recall / len(prediction_dataloader)

    print(f'test_acc: {test_acc:.4f}')
    print(f'f1 Score: {test_f1:.4f}')
    print(f'precision: {test_precision:.4f}')
    print(f'recall: {test_recall:.4f}')

test(model, prediction_dataloader)
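
# A follow-up sketch (an assumption, not the metrics reported above): collecting all predictions
# first and scoring once over the whole test set avoids averaging macro-F1 across batches.
# The helper name `test_full` is illustrative.
from sklearn.metrics import classification_report

def test_full(model, prediction_dataloader):
    all_preds, all_labels = [], []
    model.eval()
    with torch.no_grad():
        for pair_token_ids, mask_ids, y in prediction_dataloader:
            logits = model(pair_token_ids.to(device),
                           attention_mask=mask_ids.to(device)).logits
            all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            all_labels.extend(y.tolist())
    # Report per-class and macro-averaged precision/recall/F1 over the full test set.
    print(classification_report(all_labels, all_preds, labels=[0, 1, 2],
                                target_names=['negative', 'neutral', 'positive'], digits=4))

test_full(model, prediction_dataloader)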