Commit cae32a2

Add files via upload

rupakdas18 authored May 7, 2021
1 parent 7d63119 commit cae32a2
Showing 6 changed files with 1,135 additions and 0 deletions.
11 changes: 11 additions & 0 deletions EXPERIMENT.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash


python twitter_2017_taskA.py
python twitter_2017_taskB.py
python twitter_2017_taskC.py


python baseline_taskA.py
python baseline_taskB.py
python baseline_taskC.py
Binary file added Experimental Result.PNG
18 changes: 18 additions & 0 deletions INSTALL.sh
@@ -0,0 +1,18 @@
#!/bin/bash


echo "****************** Installing packages ******************"


pip3 install matplotlib
pip3 install random2
pip3 install numpy
pip3 install pandas
pip3 install tqdm
pip3 install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
pip3 install transformers
pip3 install python-time
pip3 install seaborn
pip3 install scikit-learn
367 changes: 367 additions & 0 deletions twitter_2017_taskA.py
@@ -0,0 +1,367 @@
# Import packages
import random
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer

#Device Selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


# Load Dataset
train_data_df = pd.read_csv("data/twitter-2016train-A.txt", delimiter='\t', header=None, names=['id', 'label', 'tweet'])
val_data_df = pd.read_csv("data/twitter-2016devtest-A.txt", delimiter='\t', header=None, names=['id', 'label', 'tweet'])


train_tweet = train_data_df.tweet.values
y_train = train_data_df.label.values

val_tweet = val_data_df.tweet.values
y_val = val_data_df.label.values


#Convert string classes into numeric classes
train_labels=[]
val_labels=[]
label_dict = {'negative':0, 'neutral':1, 'positive':2}

for label in y_train:
    train_labels.append(label_dict[label])

for label in y_val:
    val_labels.append(label_dict[label])


# Print the number of training and validation labels
print(len(train_labels))
print(len(val_labels))


# Data processing: tokenize tweets and build input ids / attention masks
def processdata(tweets, labels):
    input_ids = []
    attention_masks = []
    for tweet in tweets:
        encoded_dict = tokenizer.encode_plus(
            tweet,                        # Sentence to encode.
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=64,                # Pad & truncate all sentences.
            pad_to_max_length=True,
            return_attention_mask=True,   # Construct attention masks.
            return_tensors='pt',          # Return PyTorch tensors.
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

#Process train and validation data
train_input_ids,train_attention_masks,train_labels = processdata(train_tweet,train_labels)
val_input_ids,val_attention_masks,val_labels = processdata(val_tweet,val_labels)

# Convert into Tensordata
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Create dataloader

batch_size = 32

# Create the DataLoaders for our training and validation sets.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler=RandomSampler(train_dataset),    # Select batches randomly.
    batch_size=batch_size                    # Train with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                             # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size                    # Evaluate with this batch size.
)
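
# Optional sanity check (a minimal sketch, not required by the rest of the script):
# confirm that one training batch has the expected tensor shapes before fine-tuning starts.
sample_ids, sample_masks, sample_labels = next(iter(train_dataloader))
print(sample_ids.shape, sample_masks.shape, sample_labels.shape)  # expected: [32, 64], [32, 64], [32]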


# Load BERT Model
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,                # The number of output labels = 3.
    output_attentions=False,     # Whether the model returns attention weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Move the model to the selected device (GPU if available, otherwise CPU).
model.to(device)


# Define the optimizer
optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5.
                  eps=1e-8   # args.adam_epsilon - default is 1e-8.
                  )

# Epochs and scheduler
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4.
EPOCHS = 4

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

# Accuracy function
def accuracy(y_pred, y_test):
    acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
    return acc


# Time
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Train the Model
training_stats = []
def train(model, train_loader, val_loader, optimizer, scheduler):
    total_step = len(train_loader)

    for epoch in range(EPOCHS):
        # Measure how long the training epoch takes.
        train_start = time.time()
        model.train()

        # Reset the total loss and accuracy for this epoch.
        total_train_loss = 0
        total_train_acc = 0
        for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(train_loader):

            # Unpack this training batch from our dataloader.
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            labels = y.to(device)

            # Clear any previously calculated gradients before performing a backward pass.
            optimizer.zero_grad()

            # Get the loss and prediction.
            loss, prediction = model(pair_token_ids,
                                     token_type_ids=None,
                                     attention_mask=mask_ids,
                                     labels=labels).values()

            acc = accuracy(prediction, labels)

            # Accumulate the training loss and accuracy over all of the batches so that we can
            # calculate the average loss at the end.
            total_train_loss += loss.item()
            total_train_acc += acc.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average accuracy and loss over all of the batches.
        train_acc = total_train_acc / len(train_loader)
        train_loss = total_train_loss / len(train_loader)
        train_end = time.time()

        # Put the model in evaluation mode.
        model.eval()

        total_val_acc = 0
        total_val_loss = 0
        val_start = time.time()
        with torch.no_grad():
            for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(val_loader):

                # Unpack this validation batch from our dataloader.
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                labels = y.to(device)

                # Get the loss and prediction (no gradients are computed here).
                loss, prediction = model(pair_token_ids,
                                         token_type_ids=None,
                                         attention_mask=mask_ids,
                                         labels=labels).values()

                # Calculate the accuracy for this batch.
                acc = accuracy(prediction, labels)

                # Accumulate the validation loss and accuracy.
                total_val_loss += loss.item()
                total_val_acc += acc.item()

        # Calculate the average accuracy and loss over all of the batches.
        val_acc = total_val_acc / len(val_loader)
        val_loss = total_val_loss / len(val_loader)

        val_end = time.time()
        hours, rem = divmod(val_end - train_start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': train_loss,
                'Valid. Loss': val_loss,
                'Valid. Accur.': val_acc,
                'Training Time': train_end - train_start,
                'Validation Time': val_end - val_start
            }
        )

train(model, train_dataloader, validation_dataloader, optimizer,scheduler)
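
# Optional (a hedged sketch, not invoked by EXPERIMENT.sh): persist the fine-tuned model and
# tokenizer with the standard transformers save_pretrained API so the test step can be rerun
# without retraining. The output directory name below is only an illustrative placeholder.
import os
output_dir = './bert_taskA_model'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)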


# Training and validation loss visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Collect the per-epoch statistics and plot the learning curve.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()



# Load the Test dataset into a pandas dataframe.
df = pd.read_csv("data/twitter-2016test-A.txt", delimiter='\t', header=None, names=['label', 'tweet','id'])

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists

test_tweet = df.tweet.values
y_test = df.label.values

input_ids = []
attention_masks = []

labels=[]

for label in y_test:
    labels.append(label_dict[label])

# Process test data
input_ids,attention_masks,labels = processdata(test_tweet,labels)

# Set the batch size.
batch_size = 32

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


# Test the accuracy
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def test(model, prediction_dataloader):

    total_test_acc = 0
    total_F1_Score = 0
    total_precision = 0
    total_recall = 0

    # Put the model in evaluation mode and disable gradient tracking for testing.
    model.eval()
    with torch.no_grad():
        for batch_idx, (pair_token_ids, mask_ids, y) in enumerate(prediction_dataloader):
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            labels = y.to(device)

            loss, prediction = model(pair_token_ids,
                                     token_type_ids=None,
                                     attention_mask=mask_ids,
                                     labels=labels).values()

            acc = accuracy(prediction, labels)

            # Per-batch macro-averaged metrics.
            f1 = metrics.f1_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')
            precision = precision_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')
            recall = recall_score(labels.cpu(), torch.argmax(prediction, -1).cpu(), labels=[0, 1, 2], average='macro')

            total_test_acc += acc.item()
            total_F1_Score += f1
            total_precision += precision
            total_recall += recall

    # Average the per-batch metrics over all test batches.
    test_acc = total_test_acc / len(prediction_dataloader)
    test_f1 = total_F1_Score / len(prediction_dataloader)
    test_precision = total_precision / len(prediction_dataloader)
    test_recall = total_recall / len(prediction_dataloader)

    print(f'test_acc: {test_acc:.4f}')
    print(f'f1 Score: {test_f1:.4f}')
    print(f'precision: {test_precision:.4f}')
    print(f'recall: {test_recall:.4f}')

test(model, prediction_dataloader)
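
# A follow-up sketch (an assumption, not the metrics reported above): collecting all predictions
# first and scoring once over the whole test set avoids averaging macro-F1 across batches.
# The helper name `test_full` is illustrative.
from sklearn.metrics import classification_report

def test_full(model, prediction_dataloader):
    all_preds, all_labels = [], []
    model.eval()
    with torch.no_grad():
        for pair_token_ids, mask_ids, y in prediction_dataloader:
            logits = model(pair_token_ids.to(device),
                           attention_mask=mask_ids.to(device)).logits
            all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            all_labels.extend(y.tolist())
    # Report per-class and macro-averaged precision/recall/F1 over the full test set.
    print(classification_report(all_labels, all_preds, labels=[0, 1, 2],
                                target_names=['negative', 'neutral', 'positive'], digits=4))

test_full(model, prediction_dataloader)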