From f10317656890bedb87103bbe10d10ae58a87ddeb Mon Sep 17 00:00:00 2001 From: as3eem Date: Wed, 10 Nov 2021 11:53:13 +0530 Subject: [PATCH] code and dataset updated --- .DS_Store | Bin 0 -> 6148 bytes HOPE_data/.DS_Store | Bin 0 -> 6148 bytes HOPE_data/HOPE_data_source/URL_sources.txt | 348 ++++++++++++++++++ .../HOPE_therapy_session_transcripts/163.csv | 116 ++++++ .../HOPE_therapy_session_transcripts/204.csv | 49 +++ .../HOPE_therapy_session_transcripts/206.csv | 73 ++++ .../HOPE_therapy_session_transcripts/27.csv | 63 ++++ .../HOPE_therapy_session_transcripts/48.csv | 49 +++ .../HOPE_therapy_session_transcripts/67.csv | 55 +++ .../HOPE_therapy_session_transcripts/75.csv | 55 +++ .../HOPE_therapy_session_transcripts/97.csv | 27 ++ SPARTA_model/.DS_Store | Bin 0 -> 6148 bytes SPARTA_model/Trainer.py | 148 ++++++++ SPARTA_model/config.py | 63 ++++ SPARTA_model/dataset/dataset.py | 108 ++++++ SPARTA_model/main.py | 51 +++ SPARTA_model/models/Attention.py | 114 ++++++ SPARTA_model/models/Classifier.py | 46 +++ SPARTA_model/models/DAC.py | 268 ++++++++++++++ SPARTA_model/models/GRU.py | 81 ++++ SPARTA_model/models/MHA.py | 73 ++++ SPARTA_model/models/Relevance.py | 99 +++++ SPARTA_model/models/RoBERTa.py | 39 ++ SPARTA_model/models/SpeakerClassifier.py | 46 +++ 24 files changed, 1971 insertions(+) create mode 100644 .DS_Store create mode 100644 HOPE_data/.DS_Store create mode 100644 HOPE_data/HOPE_data_source/URL_sources.txt create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/163.csv create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/204.csv create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/206.csv create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/27.csv create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/48.csv create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/67.csv create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/75.csv create mode 100644 HOPE_data/HOPE_therapy_session_transcripts/97.csv create mode 100644 SPARTA_model/.DS_Store create mode 100644 SPARTA_model/Trainer.py create mode 100644 SPARTA_model/config.py create mode 100644 SPARTA_model/dataset/dataset.py create mode 100644 SPARTA_model/main.py create mode 100644 SPARTA_model/models/Attention.py create mode 100644 SPARTA_model/models/Classifier.py create mode 100644 SPARTA_model/models/DAC.py create mode 100644 SPARTA_model/models/GRU.py create mode 100644 SPARTA_model/models/MHA.py create mode 100644 SPARTA_model/models/Relevance.py create mode 100644 SPARTA_model/models/RoBERTa.py create mode 100644 SPARTA_model/models/SpeakerClassifier.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f603810db08e88f64f53bef83a1f2fbfce0d1e68 GIT binary patch literal 6148 zcmeHKy-veG4F0ZkXjF!d3=G*|L81~5kXEV01c3T<_AWuUg{_N z0#1>!x&a>XL_5Y3C4XnnD2hcb7V9gXy*IXA>)luPQfvJs{-@$Y;+u6}vlJ6!z!)$F zj+O!TY>`ekpjl(U7%&Dl49NE(rV1vCRY3pgpz$LBu}!xZu4R^xoLDhYtO9a};zBAh zq&gijTu7%s*0@Bm3K-Jibog+3veOC0`O|s-*oVU<0nHi%#z2#SlW^IS{y*A3|2LED z$`~*P{uKkRoo4BfkEFeI>*1u=#?%Q_MB=J|n-ng5D`u>;;xnoj`eRuT6U8bZJrw&R M;At>p4E!kr-(0>_b^rhX literal 0 HcmV?d00001 diff --git a/HOPE_data/.DS_Store b/HOPE_data/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..1a5f072c318adf80ed2b284d76b7d7669096b75b GIT binary patch literal 6148 zcmeHKJxc>Y5PhpDf;K5FuS;t{`!ifX1X0Kji21?z~uiRPv z<>~q2dwGK&<;_RsJ)X)|qtYI8j4D++M`K3i$kxyO z`nHQT%e-8lFM0X(UZ?4bw_UEg#;c2kSCjo!M<3qY*U?S*6MB_t;b7zZpAIYV)pO4L*rS`4S@^hY5s4p=B^I$UTzTzIn!9g6dB 
z=lYQkhgB@)!K^W`$-tpHYEu7OpZEWpY4&Uk7z2C6fU6|!q{#z?+PZZ#skIUHjw&K? mg`!WP3FX)iq!gb}y>Kp)0Wl6(DAGf*KLVZxGseK5GVl()I6%2_|DP{K;@CW=05<6eQ2Qc#P&M6IP zB_>o=chdPe=R5oIY~^Aia`We6Ml>a&49YmTK(j}9oV6!|WZ^+4b2PM~P5C%mJk&jJ zGaMrWymuoi=$YBozYRNone: + + self.text = data[fields['text']] + self.act = data[fields['act']] + self.label = data[fields['label']] + self.speaker = data[fields['speaker']] + self.ids = data[fields['id']] + + self.tokenizer = tokenizer + self.max_len = max_len + + # build label and speaker dict dictionary + for i in range(len(self.act)): + if self.act[i] not in DADataset.__label_dict.keys(): + DADataset.__label_dict[self.act[i]]=self.label[i] + + + def __len__(self)->int: + return len(self.text) + + def label_dict(self)->dict: + """ + label_dict method get the label vocab + Returns: + label dict, a dictionary containing class name as keys and indices as values + """ + + return DADataset.__label_dict + + + def speaker_dict(self)->dict: + """ + label_dict method get the label vocab + Returns: + label dict, a dictionary containing class name as keys and indices as values + """ + + return DADataset.__speaker_dict + + + + def __getitem__(self, index:int)->dict: + """ + __getitem__() method to get a sample from dataset + + Parameters: + index: index of datasets to fetch the utterance + + Returns: + item as dictionary + """ + + text = self.text[index] + + act = self.act[index] + label = self.label[index] + speaker = DADataset.__speaker_dict[self.speaker[index]] + session_id = self.ids[index].split("_")[0] + + + # encode the text + input_encoding = self.tokenizer.encode_plus( + text=text, + truncation=True, + max_length=self.max_len, + return_tensors="pt", + return_attention_mask=True, + padding="max_length", + return_length=True + + ) + + return { + "text":text, + "input_ids":input_encoding['input_ids'].squeeze(), + "attention_mask":input_encoding['attention_mask'].squeeze(), + "act":act, + "label":torch.tensor([label], dtype=torch.long).squeeze(), + "speaker":torch.tensor([speaker], dtype=torch.long).squeeze(), + "session_id":session_id + + } \ No newline at end of file diff --git a/SPARTA_model/main.py b/SPARTA_model/main.py new file mode 100644 index 0000000..9fb17ea --- /dev/null +++ b/SPARTA_model/main.py @@ -0,0 +1,51 @@ +from config import config +from pytorch_lightning.loggers import WandbLogger +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from transformers import AutoTokenizer +import pytorch_lightning as pl +from Trainer import LightningModel +from .models.DAC import DACModel + +if __name__ == '__main__': + + + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=config['model_name']) + logger = WandbLogger( + name="analyze"+"mha-final", + save_dir=config["save_dir"], + project=config["project"], + log_model=True, + ) + early_stopping = EarlyStopping( + monitor=config["monitor"], + min_delta=config["min_delta"], + patience=config['patience'], + ) + checkpoints = ModelCheckpoint( + filepath=config["filepath"], + monitor=config["monitor"], + save_top_k=1 + ) + ## Trainer + trainer = pl.Trainer( + logger=logger, + gpus=[0], + checkpoint_callback=checkpoints, + callbacks=[early_stopping], + default_root_dir="./", + max_epochs=config["epochs"], + precision=config["precision"], + enable_pl_optimizer=False, + automatic_optimization=True, + ) + + model = DACModel(config=config) + lm = LightningModel( + model=model, + tokenizer=tokenizer, + config=config + ) + trainer.fit(lm) + trainer.test(lm) + + diff --git 
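Because config.py and the opening of DADataset.__init__ are truncated in this excerpt, the following self-contained sketch shows the per-utterance preprocessing the dataset performs: read one HOPE transcript and encode an utterance with the same encode_plus arguments used in __getitem__. The column name "utterance", the act index 3 and the "roberta-base" checkpoint are illustrative assumptions; the real values come from the fields mapping and config['model_name'].

# Sketch of the per-utterance preprocessing done by DADataset.__getitem__.
# "utterance", the act index and "roberta-base" are assumptions for illustration;
# the real column mapping and checkpoint come from `fields` and config.py.
import pandas as pd
import torch
from transformers import AutoTokenizer

session = pd.read_csv("HOPE_data/HOPE_therapy_session_transcripts/163.csv")
text = str(session.iloc[0]["utterance"])          # hypothetical column name

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
enc = tokenizer.encode_plus(
    text=text,
    truncation=True,
    max_length=64,               # illustrative; the dataset uses a config-driven max_len
    return_tensors="pt",
    return_attention_mask=True,
    padding="max_length",
    return_length=True,
)

item = {
    "input_ids": enc["input_ids"].squeeze(),           # [max_len]
    "attention_mask": enc["attention_mask"].squeeze(),  # [max_len]
    "label": torch.tensor(3, dtype=torch.long),         # hypothetical dialogue-act index
}
print(item["input_ids"].shape, int(item["attention_mask"].sum()))

When such items are batched for DACModel, the batches should stay in session order (no shuffling), because the model carries a rolling utterance memory and a GRU global context from one utterance to the next.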
a/SPARTA_model/models/Attention.py b/SPARTA_model/models/Attention.py new file mode 100644 index 0000000..913f9bf --- /dev/null +++ b/SPARTA_model/models/Attention.py @@ -0,0 +1,114 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn + + +class AttentionModel(nn.Module): + """ + Attention block to pay attention to dialouge history, this is multihead attention mechanism if num_heads = 1, it will be + context-context dot product attention. + + Parameters: + hidden_size: What is size of input vector + num_heads: Number of heads we want, embedding size should be divisible by num_heads + need_weights: Whether we want to project the key, query and value vectors or not + + """ + def __init__(self, hidden_size:int=768, num_heads:int=1, need_weights:bool=True)->None: + super(AttentionModel, self).__init__() + + assert hidden_size%num_heads==0, "hidden size should be divisible by nums of heads" + + self.num_heads = num_heads + self.hidden_size = hidden_size + self.head_size = hidden_size//num_heads + self.need_weights = need_weights + + # projection weights + self.k = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + self.q = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + self.v = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + self.o = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + + # no scalong as of now + + + + def forward(self, memory:torch.Tensor, current_utterance:torch.Tensor, attention_mask:torch.Tensor=None)->tuple: + """ + forward function of MHA which takes following argumennts + Arguments: + memory: memory in which previous utterances representations are stored, memory.shape = [window_size, embedding_size] + current_utterance: current utterance repsentation , current_utterance.shape = [1, embedding_size] + attention_mask: attention vector for keys to avoid paying attention to pad tokens + + Returns: attention maps and contextualized embedding as tuple + + Note: Since this will be applied to to memory and single utterance at a time + + Convert the memory and current utterance as key query vector + key: keys to compute attention weights, key.shape = [bs, k_len, embedding_size] + query: query for which we need context vector, query.shape = [bs, q_len, embedding_size] + value: values to get contextualized query, value.shape = [bs, v_len, embedding_size] + + """ + + # treat memory and current_utterance as key, value and query + key = memory.unsqueeze(0) # [1, memory_size, hidden_size] + query = current_utterance.unsqueeze(0) # [1, 1, hidden_size] + value = memory.unsqueeze(0) # [1, memory_size, dden_size] + + batch_size = key.shape[0] + + + + if self.need_weights: + key = self.k(key) + query = self.q(query) + value = self.v(value) + + # divide the key, qeury and value into heads + # key.shape, query.shape, value.shape = [bs, _len, embedding_size] + K = key.view(batch_size, -1, self.num_heads, self.head_size).permute(0, 2, 1, 3) + Q = query.view(batch_size, -1, self.num_heads, self.head_size).permute(0, 2, 1, 3) + V = value.view(batch_size, -1, self.num_heads, self.head_size).permute(0, 2, 1, 3) + # K.shape, Q.shape, V.shape = [bs, num_heads, _len, head_size] + + + # compute the attention weights + weights = torch.matmul(Q, K.permute(0, 1, 3, 2)) + # weights.shape = [bs, num_heads, q_len, k_len] + + + ##[4, 12, 1, 512] to [4, 1, 1, 512] + if attention_mask is not None: + # fill the masked position with small values (tends to zero) in order to pay zero attention + 
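            # (assumption: attention_mask covers the memory slots, e.g. shape [batch, memory_len];
            # the two unsqueeze calls below broadcast it to [batch, 1, 1, memory_len] so the same
            # mask is applied across every head and every query position before the softmax)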
attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) + weights = weights.masked_fill(attention_mask == 0, -1e10) + + # apply the softmax over attention weights to normalize it + attention = torch.softmax(weights, dim=-1) + # attention.shape = [bs, num_heads, q_len, k_len] + + + # weighted sum of values where attention should be used as weights, k and query should be of same length + x = torch.matmul(attention, V) + # x.shape = [bs, num_heads, q_len, head_size] + + + x = x.permute(0, 2, 1, 3).contiguous() + # x.shape = [bs, q_len, num_heads, head_size] + + + x = x.view(batch_size, -1, self.hidden_size) + # x.shape = [bs, q_len, embedding_size] (original size of query) + + + # fed it to linear layer if need_weights is True + if self.need_weights: + x = self.o(x) + + ##print(f'x.shape after linear = {x.shape}') + + return x.squeeze(0), attention + \ No newline at end of file diff --git a/SPARTA_model/models/Classifier.py b/SPARTA_model/models/Classifier.py new file mode 100644 index 0000000..dbf7244 --- /dev/null +++ b/SPARTA_model/models/Classifier.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn + + + +class Classifier(nn.Module): + """ + A generic classifier module to classify the data (as feaures) + """ + + def __init__(self, input_size:int=768, dropout:float=0.10, hidden:list=[768, 512, 256, 128, 64], num_classes=2): + super(Classifier, self).__init__() + + self.input_size = input_size + self.dropout = dropout + self.hidden = hidden + self.num_classes = num_classes + + layers = [] + + + for output_size in hidden: + layers += [ + nn.Dropout(p=self.dropout), + nn.Linear(in_features=input_size, out_features=output_size), + nn.LeakyReLU() + ] + input_size = output_size + + layers.append(nn.Linear(in_features=hidden[-1], out_features=num_classes)) + + + self.classifier = nn.Sequential(*layers) + + def forward(self, x): + """ + Forward fuction of generic clasifier, which will take a feacture vector and return a tensor of size num_classes + Args: + x: feature vector to classify, x.shape = [batch_size, feature_size] + Returns: + outptus of size [batch_size, num_classes] + """ + outputs = self.classifier(x) + + return outputs \ No newline at end of file diff --git a/SPARTA_model/models/DAC.py b/SPARTA_model/models/DAC.py new file mode 100644 index 0000000..d25c3f9 --- /dev/null +++ b/SPARTA_model/models/DAC.py @@ -0,0 +1,268 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn +from .Attention import AttentionModel +from .RoBERTa import RepresentationModel +from .SpeakerClassifier import SpeakerClassifierModel +from .MHA import MHAModel +from .Relevance import RelevanceModel +from .GRU import GRUModel +from .Classifier import Classifier + + + +class DACModel(nn.Module): + + """ + Complete architecture of the model, all the modules are assmebled in this. 
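Since DACModel below simply wires these blocks together, a quick standalone shape-check of AttentionModel and Classifier may help; the window size, the single head and the 12-class output are illustrative assumptions, and the imports assume SPARTA_model/ is the working directory.

# Standalone shape-check for the dot-product attention block above; with
# num_heads=1 it reduces to plain context-context attention over the memory.
# Sizes and the 12-class head are illustrative, not taken from config.py.
import torch
from models.Attention import AttentionModel
from models.Classifier import Classifier

window_size, hidden_size = 6, 768
attn = AttentionModel(hidden_size=hidden_size, num_heads=1, need_weights=True)

memory = torch.randn(window_size, hidden_size)    # pooler outputs of previous utterances
current = torch.randn(1, hidden_size)             # pooler output of the current utterance

context, attention = attn(memory=memory, current_utterance=current)
print(context.shape)      # torch.Size([1, 768])     contextualised current utterance
print(attention.shape)    # torch.Size([1, 1, 1, 6]) one weight per memory slot

# The generic MLP head then maps such feature vectors to logits.
clf = Classifier(input_size=hidden_size, num_classes=12)   # 12 acts is an assumption
print(clf(context).shape)  # torch.Size([1, 12])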
+ + """ + + def __init__(self, config:dict)->None: + + super(DACModel, self).__init__() + + + self.config = config + self.model_config = config['model_config'][config['select_model_config']] + print(self.model_config) + + # speaker invariant representation model, attetnion/relevance model and gru block + + # speaker invariant representation model + self.speaker_invariant = RepresentationModel( + model_name=config['model_name'], + hidden_size=config['hidden_size'] + ) + + # speaker-aware model + self.speaker_aware = SpeakerClassifierModel( + config=self.config + ) + # # load the model from checkpoints and then freeze the weights + print("Loading Speaker Classifier from Checkpoint ...") + self.speaker_aware.load_state_dict(torch.load(self.config['speaker_classifier_ckpt_path'])['state_dict']) + + # # freeze the weights of the model if you want + # if self.speaker_aware is not None: + # for param in self.speaker_aware.parameters(): + # param.requires_grad = False + # print("Speaker Classifier Loaded and Freezed !") + + + + # # attention block for speaker invaraint pooler + # if self.model_config['attention_type']=="mha": + self.si_attention = MHAModel( + hidden_size=config['hidden_size'], + num_heads=config['num_heads'], + need_weights=config['need_weights'], + dropout=config['dropout'], + device=self.config['device'], + ) + # # speaker aware attention block + self.sa_attention = MHAModel( + hidden_size=config['hidden_size'], + num_heads=config['num_heads'], + need_weights=config['need_weights'], + dropout=config['dropout'], + device=self.config['device'], + ) + + # elif self.model_config['attention_type']=="rel": + # relevance for speaker invariant pooler + self.si_attention = RelevanceModel( + hidden_size=config['hidden_size'], + need_weights=config['need_weights'], + device=self.config['device'], + ) + # # speaker aware relevance block + self.sa_attention = RelevanceModel( + hidden_size=config['hidden_size'], + need_weights=config['need_weights'], + device=self.config['device'], + ) + + + # gru for speaker invaraint hs + self.si_gru = GRUModel( + input_size=config['hidden_size'], + hidden_size=config['hidden_size'], + num_layers=config['num_layers'], + device=self.config['device'], + + ) + + # speaker aware gru + self.sa_gru = GRUModel( + input_size=config['hidden_size'], + hidden_size=config['hidden_size'], + num_layers=config['num_layers'], + device=self.config['device'], + + ) + + + self.classifier = Classifier( + input_size=self.model_config['dac_inputs']*config['hidden_size'], + dropout=self.config['dropout'], + hidden=self.config['hidden'], + num_classes=self.config['num_dialogue_acts'] + ) + + + def forward(self, + batch, + + # previous_sid:str, + + si_memory:torch.Tensor=None, + sa_memory:torch.Tensor=None, + + sa_global_context:torch.Tensor=None, + si_global_context:torch.Tensor=None, + ): + """ + forward function of dialogue act classifier + Args: + input_ids:input tokens input_ids.shape = [bs, seq_len] + attention_mask: masks for tokens, for pad tokens it will be zero for others it will be 1 + attention.shape = [bs, seq_len] + + si_memory:speaker invariant fixed window size memory that stores the previous utterances (pooler outputs) + si_memory.shape = [window_size, hidden_size] + sa_memory: speaker aware fixed window size memory that stores the prebious utterances (pooler outputs) + sa_memory.shape = [window_size, hidden_size] + + sa_pooler: speaker aware pooler output of current utterance + sa_pooler.shape = [bs, hidden_size] + sa_hidden_states: speaker aware hidden states of 
current utterance + sa_hidden_states.shape = [bs, seq_len, hidden_size] + + mem_attention_mask:attention mask for memory if we have history less than the window size some of the them + will be masked. + mem_attenion_mask.shape = [windows_size] + sa_global_context: global context (initial hidden state) for speaker aware GRU + sa_global_context.shape = [bs, hidden_size] + si_global_context: global context (initial hidden state) for speaker invariant GRU + sa_global_context.shape = [bs, hidden_size] + Returns: + + """ + + # adjusted during experiment + previous_sid="none" + + batch_size = batch['input_ids'].shape[0] + + + """Speaker Invariant Model""" + + # get the speaker aware representations + si_hidden_states, si_pooler = self.speaker_invariant( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + ) + + + # feature and attention vector for si attention block + si_attn_x = torch.empty((0, self.config['hidden_size']), device=self.config['device']) + si_attn = torch.empty((0, self.config['window_size']), device=self.config['device']) + + + for i, x in enumerate(zip(si_pooler.detach(), batch['session_id'])): + + si_utterance, current_sid = x[0].unsqueeze(0), x[1] + + xi_attn, attn_i = self.si_attention( + memory=si_memory, + current_utterance=si_utterance, + ) + si_attn_x = torch.vstack((si_attn_x, xi_attn)) + si_attn = torch.vstack((si_attn, attn_i.squeeze(0))) + + # update the memory + si_memory = torch.vstack((si_utterance, si_memory[:-1])) + + si_gru_x, si_global_context = self.si_gru( + embedding=si_hidden_states, + global_context=si_global_context, + attention_mask=batch["attention_mask"], + session_ids=batch['session_id'], + previous_sid=previous_sid + ) + + # """Speaker Aware Model""" + + + # # feature and attention vector for sa attention block + sa_attn_x = torch.empty((0, self.config['hidden_size']), device=self.config['device']) + sa_attn = torch.empty((0, self.config['window_size']), device=self.config['device']) + + + # # for speaker aware we are going to take all speaker aware components including local context, global context and speaker aware invariant representation + + # get speaker invariant representation + sa_hidden_states, sa_pooler, _ = self.speaker_aware( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + ) + + for i, x in enumerate(zip(sa_pooler.detach(), batch['session_id'])): + + sa_utterance, current_sid = x[0].unsqueeze(0), x[1] + + xa_attn, attn_a = self.sa_attention( + memory=sa_memory, + current_utterance=sa_utterance, + ) + + sa_attn_x = torch.vstack((sa_attn_x, xa_attn)) + sa_attn = torch.vstack((sa_attn, attn_a.squeeze(0))) + + # update the speaker aware memory + sa_memory = torch.vstack((sa_utterance.to(self.config['device']), sa_memory[:-1].to(self.config['device']))) + + sa_gru_x, sa_global_context = self.sa_gru( + embedding=sa_hidden_states, + global_context=sa_global_context, + attention_mask=batch["attention_mask"], + session_ids=batch['session_id'], + previous_sid=previous_sid + ) + + + + # concat all the features vector + features = torch.empty((batch_size, 0), device=self.config['device']) + + + + for i, x in enumerate([si_pooler, si_attn_x, si_gru_x, sa_pooler, sa_attn_x, sa_gru_x]): + if x.shape[0]==batch_size: + features = torch.hstack((features, x)) + + + logits = self.classifier(features) + + + + # return updated memories, attentions and logits + + return { + # memory + "si_memory":si_memory, + "sa_memory":sa_memory, + + "si_global_context":si_global_context, + 
"sa_global_context":sa_global_context, + + "si_attn":si_attn, + "sa_attn":sa_attn, + + # logits to compute the loss + "logits":logits, + + # "previous_sid":current_sid + } + \ No newline at end of file diff --git a/SPARTA_model/models/GRU.py b/SPARTA_model/models/GRU.py new file mode 100644 index 0000000..dc97d3d --- /dev/null +++ b/SPARTA_model/models/GRU.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn + + + + +class GRUModel(nn.Module): + + """ + GRU model that takes the global context along with current utterance representation and gives a context vector + + Parameters: + input_size: size of the input to GRU, which will be contextualized embedding means hidden size + hidden_size: size of the GRU's hidden vectors + num_layer: how many layers should be there in GRU + bidirectional: whether bidirectional or not if True both forwar and backward hidden states will be concatented + + """ + + def __init__(self, input_size:int=768, hidden_size:int=768, num_layers:int=1, bidirectional:bool=False, max_len:int=512, device=torch.device("cpu"))->None: + super(GRUModel, self).__init__() + + self.max_len = max_len + + self.device = device + + self.gru = nn.GRU( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + batch_first=True, + bidirectional=bidirectional + ) + + + def forward(self, embedding:torch.Tensor, attention_mask:torch.Tensor, session_ids:list, previous_sid:str, global_context:torch.Tensor=None)->tuple: + """ + forward function of GRUModel which will take contextualized embeddings and global context as input + and will produce next hidden states. + + Arguements: + embedding: contextextualized embeddings (hidden states) coming from representation model [bs, seq_len, hidden_size] + attention_mask: To ignore the pad tokens + global_context: last hidden state from previous utterance [bs, num_direction, hidden_size] + + Returns: + gloabl_context: last hidden state of current utterance + + """ + bs, seq_len, hidden_size = embedding.shape[0], embedding.shape[1], embedding.shape[2] + + outputs = torch.empty((0, hidden_size), device=self.device) + + if global_context is None: + global_context = torch.randn((1, 1, hidden_size), device=self.device) + + + for i, x in enumerate(zip(embedding, attention_mask, session_ids)): + + + + input_, current_sid = x[0], x[2] + + input_len = self.max_len - torch.unique(x[1], return_counts=True)[1][0].item() + if input_len < 1: + input_len = 2 + + # padded tokenz remove mannual with fixed len + _, global_context = self.gru(input=input_[:input_len].unsqueeze(0), hx=global_context.detach()) + + + outputs = torch.vstack((outputs, global_context.squeeze())) + + + return (outputs, global_context.detach()) + + + + + \ No newline at end of file diff --git a/SPARTA_model/models/MHA.py b/SPARTA_model/models/MHA.py new file mode 100644 index 0000000..83e560b --- /dev/null +++ b/SPARTA_model/models/MHA.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn + + +class MHAModel(nn.Module): + """ + This is pytorch's implementation of Multihead Attention + Parameters: + embedding_size: What is size of input embedding + num_heads: Number of heads we want, embedding size should be divisible by num_heads + need_weights: Whether we want to project the key, query and value vectors or not + """ + + def __init__(self, hidden_size:int=768, num_heads:int=12, need_weights:bool=True, dropout:float=0.10, device=torch.device("cpu")): + """ + This is pytorch's MHA model + Parameters: + 
embedding_size: What is size of input embedding + num_heads: Number of heads we want, embedding size should be divisible by num_heads + need_weights: Whether we want to project the key, query and value vectors or not + """ + + super(MHAModel, self).__init__() + + self.device = device + + self.hidden_size = hidden_size + self.num_heads = num_heads + self.need_weights = need_weights + + # multihead attention model + self.mha = nn.MultiheadAttention( + embed_dim=self.hidden_size, + num_heads=self.num_heads, + dropout=dropout, + ) + + def forward(self, memory:torch.Tensor, current_utterance:torch.Tensor, attention_mask:torch.Tensor=None)->tuple: + """ + forward function of MHA which takes following argumennts + Arguments: + memory: memory in which previous utterances representations are stored, memory.shape = [window_size, embedding_size] + current_utterance: current utterance repsentation , current_utterance.shape = [1, embedding_size] + attention_mask: attention vector for keys to avoid paying attention to pad tokens + + Returns: attention maps and contextualized embedding as tuple + + Note: Since this will be applied to to memory and single utterance at a time + + Convert the memory and current utterance as key query vector + key: keys to compute attention weights, key.shape = [bs, k_len, embedding_size] + query: query for which we need context vector, query.shape = [bs, q_len, embedding_size] + value: values to get contextualized query, value.shape = [bs, v_len, embedding_size] + + """ + + # treat memory and current_utterance as key, value and query + key = memory.unsqueeze(0).to(self.device) # [1, memory_size, hidden_size] + query = current_utterance.unsqueeze(0).to(self.device) # [1, 1, hidden_size] + value = memory.unsqueeze(0).to(self.device) # [1, memory_size, dden_size] + + x, attention = self.mha( + key=key, + query=query, + value=value, + need_weights=self.need_weights + ) + + return x.squeeze(0), attention + + + \ No newline at end of file diff --git a/SPARTA_model/models/Relevance.py b/SPARTA_model/models/Relevance.py new file mode 100644 index 0000000..0342f45 --- /dev/null +++ b/SPARTA_model/models/Relevance.py @@ -0,0 +1,99 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn + + +class RelevanceModel(nn.Module): + """ + Relvance model is somehwhat similar to attention but it a scaled by a distance measure + Params: + hidden_size: size of the key, query, values hidden vectors + need_weights: should we apply linear projection or not + """ + + def __init__(self, hidden_size:int=768, need_weights:bool=True, device=torch.device("cpu"))->None: + + super(RelevanceModel, self).__init__() + + self.device = device + + self.hidden_size = hidden_size + self.need_weights = need_weights + + + + # linear layers to linearly project key, query and values and weighted sum of values where attention as weight + self.k = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + self.q = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + self.v = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + self.o = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False) + + def distance_fn(self, d:int, T:float=0.20)->torch.Tensor: + """ + Function to compute the distance weights + Args: + d: distance (a single element tensor)from current utterance to that utterance + T: a hyperparameter, to cool down the distance function, T basically controls the distance weight + as well similarity weights. 
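For comparison with the hand-rolled AttentionModel, the wrapper above delegates to torch.nn.MultiheadAttention. A standalone sketch of that module over the same memory/query setup follows; note that with the default batch_first=False the expected layout is (seq_len, batch, embed), so the memory is arranged here as a length-window_size sequence for a batch of one — a choice made for this sketch, not a statement about MHAModel's exact tensor layout.

# Standalone sketch of torch.nn.MultiheadAttention over an utterance memory.
# With the default batch_first=False, inputs are (seq_len, batch, embed_dim),
# so the memory is laid out as window_size time steps for a single batch item.
import torch
import torch.nn as nn

window_size, hidden_size, num_heads = 6, 768, 12
mha = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads, dropout=0.1)

memory = torch.randn(window_size, 1, hidden_size)   # (S, N, E): keys and values
query = torch.randn(1, 1, hidden_size)              # (L, N, E): current utterance

context, weights = mha(query=query, key=memory, value=memory, need_weights=True)
print(context.shape)   # torch.Size([1, 1, 768])  attended current utterance
print(weights.shape)   # torch.Size([1, 1, 6])    head-averaged weights over the memory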
+ if T is low max value of distance will be high as well as max-min will be low. I have tested the T + for different range. + T seems good between range 0.10 - 0.30 anything below 0.50 is tolerable + + Returns: distance weight as tensor + + """ + return 1/(torch.exp(torch.tensor([d]))**T) # different functions have been tested but simple inverse was used finally + + + def forward(self, memory:torch.Tensor, current_utterance:torch.Tensor, attention_mask:torch.Tensor=None)->tuple: + """ + forward function of Relvance which takes keys queries and values and returns the weighted sum + Arguments: + memory: memory in which previous utterances representations are stored, memory.shape = [window_size, embedding_size] + current_utterance: current utterance repsentation , current_utterance.shape = [1, embedding_size] + attention_mask: attention over memory, initially it all be set to 0 but as we will fill the memory it will + be unmasked. + + Returns: attention maps and contextualized embedding as tuple + """ + window_size = memory.shape[0] + + # treat memory and current_utterance as key, value and query + key = memory.to(self.device) # [1, memory_size, hidden_size] + query = current_utterance.to(self.device) # [1, 1, hidden_size] + value = memory.to(self.device) # [1, memory_size, dden_size] + + if self.need_weights: + key = self.k(key) + value = self.v(value) + query = self.q(query) + + + + # computing similarity function between memory + similarity = torch.cosine_similarity(key, query) + # similarity.shape = [window_size] + + # get the distance weights with the help of distance function defined + distance = torch.linspace(start=1, end=window_size, steps=window_size, dtype=torch.float).apply_(self.distance_fn).to(self.device) + # distance.shape = [window_size] + + # compute the energy with the help of similarity and distance which is product of these two + energy = similarity*distance + #energy.shape [window_size] + + # mask the not use full context + if attention_mask is not None: + energy = energy.masked_fill(attention_mask == 0, -1e10) + + + # compute the relevance score by applying softmax over energy + relevance = energy.softmax(dim=-1) + # relevance.shape [window_size] + + # get the weighted average of values where relevance is weight + x = torch.matmul(relevance.unsqueeze(0), value) + + return x, relevance + + \ No newline at end of file diff --git a/SPARTA_model/models/RoBERTa.py b/SPARTA_model/models/RoBERTa.py new file mode 100644 index 0000000..d77d430 --- /dev/null +++ b/SPARTA_model/models/RoBERTa.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn +from transformers import AutoConfig, AutoModel +class RepresentationModel(nn.Module): + """ + Represention Model to get the contextualized vectors (hidden states) and sentence representation (pooler output) + + Parameters: + model_name: name of pretrained model as feature extractor or represenation model + + + """ + + def __init__(self, model_name:str, hidden_size:int=768)->None: + super(RepresentationModel, self).__init__() + + # feature extractor with default config + self.config = AutoConfig.from_pretrained(pretrained_model_name_or_path=model_name, hidden_size=hidden_size) + self.base = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name, config=self.config) + + def forward(self, input_ids:torch.Tensor, attention_mask:torch.Tensor=None)->tuple: + """ + forward function of representation model which takes input_ids and attention_mask(optional)and gives contextualized + embeddings. 
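Before the RoBERTa wrapper, here is a self-contained numerical sketch of what RelevanceModel computes when the linear projections are skipped: cosine similarity between the memory and the current utterance, scaled by the exponential distance decay 1/exp(d)**T, then normalised with a softmax. The tensor sizes are illustrative; T=0.20 follows the default above.

# Self-contained sketch of the relevance score: similarity times distance decay,
# softmax-normalised, then used as weights over the memory. Sizes are illustrative.
import torch

def distance_weight(d: torch.Tensor, T: float = 0.20) -> torch.Tensor:
    # nearby utterances (small d) keep weights close to 1, older ones decay
    return 1.0 / (torch.exp(d) ** T)

window_size, hidden_size = 6, 768
memory = torch.randn(window_size, hidden_size)    # pooler outputs of previous utterances
query = torch.randn(1, hidden_size)               # pooler output of the current utterance

similarity = torch.cosine_similarity(memory, query)                   # [window_size]
distance = distance_weight(torch.arange(1, window_size + 1).float())  # [window_size]
relevance = torch.softmax(similarity * distance, dim=-1)              # [window_size]

context = relevance.unsqueeze(0) @ memory          # [1, hidden_size] weighted memory
print(relevance)        # weights favour recent *and* similar utterances
print(context.shape)    # torch.Size([1, 768])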
+ + Arguments: + input_ids:input ids tensor + attention_mask: attention maks, 0 for pad tokens 1 for sequence tokens + + Returns: + hidden states and pooler output in that order + """ + + # feed the input_ids and attention to the representation model + hidden_states, pooler = self.base(input_ids=input_ids, attention_mask=attention_mask) + return hidden_states, pooler + + \ No newline at end of file diff --git a/SPARTA_model/models/SpeakerClassifier.py b/SPARTA_model/models/SpeakerClassifier.py new file mode 100644 index 0000000..479dacb --- /dev/null +++ b/SPARTA_model/models/SpeakerClassifier.py @@ -0,0 +1,46 @@ +import torch +import torch.nn as nn +from typing import Any, NoReturn +from .RoBERTa import RepresentationModel +from .Classifier import Classifier + +class SpeakerClassifierModel(nn.Module): + """Speaker Classifier model, isolated from complete architecture, easy ablation study""" + + def __init__(self, config:dict): + super(SpeakerClassifierModel, self).__init__() + + self.config = config + + self.base = RepresentationModel( + model_name=config['model_name'], + hidden_size=config['hidden_size'] + ) + + self.classifier = Classifier( + input_size=config['hidden_size'], + dropout=config['dropout'], + num_classes=config['num_speakers'] + ) + + def forward(self, input_ids:torch.Tensor, attention_mask:torch.Tensor=None)->tuple: + """ + forward function of Speaker Classifier model + Args: + input_ids:input tokens input_ids.shape = [batch, seq_len] + attention_mask: masks for tokens, for pad tokens it will be zero for others it will be 1 + Returns: a tuple containing hidden states, pooler output and logits + """ + + # get the repsentation + hidden_states, pooler = self.base( + input_ids=input_ids, + attention_mask=attention_mask, + ) + + # classify the speakers + logits = self.classifier(pooler) + + return hidden_states, pooler, logits + + \ No newline at end of file
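One portability note on RepresentationModel: depending on the installed transformers version, AutoModel's forward may return a ModelOutput object rather than a plain tuple, in which case the tuple unpacking used above needs return_dict=False (or attribute access via .last_hidden_state / .pooler_output). A minimal version-robust sketch, assuming a RoBERTa-style checkpoint with a pooler:

# Version-robust variant of the forward pass in RepresentationModel: request a
# plain tuple explicitly so the (hidden_states, pooler) unpacking keeps working
# on newer transformers releases. "roberta-base" is an assumed checkpoint.
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

enc = tokenizer("I feel much better after our last session.", return_tensors="pt")
with torch.no_grad():
    hidden_states, pooler = model(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"],
        return_dict=False,        # keep the (last_hidden_state, pooler_output) tuple
    )[:2]

print(hidden_states.shape)   # [1, seq_len, 768]
print(pooler.shape)          # [1, 768]

The same consideration applies inside SpeakerClassifierModel, since it obtains its hidden states and pooler through the same RepresentationModel.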