Cannot retrain model #44

hoanghapham · 2024-11-18T21:54:07Z

Hi, I'm retraining this model, but somehow the backward step cannot update the encoders' parameters.

Here's the code for the loss function:

import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F

def contrastive_loss(E_a: Tensor, E_t: Tensor, temperature: float = 0.5, device="cpu") -> Tensor:
    sum_term = 0
    batch_size = len(E_a)
    N = range(batch_size)

    for i in N:
        pos = torch.exp(F.cosine_similarity(E_a[i], E_t[i], dim=-1) / temperature)
        a_t_neg = 0
        t_a_neg = 0

        for j in N:
            a_t_neg = a_t_neg + torch.exp(F.cosine_similarity(E_a[i], E_t[j], dim=-1) / temperature)
            t_a_neg = t_a_neg + torch.exp(F.cosine_similarity(E_t[i], E_a[j], dim=-1) / temperature)

        a_t = torch.log(pos / a_t_neg)
        t_a = torch.log(pos / t_a_neg)
        sum_term = sum_term - (a_t + t_a)
    
    loss = 1 / (2*batch_size) * sum_term
    loss.to(device)
    return loss

class ContrastiveLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input: Tensor, target: Tensor, temperature: float) -> Tensor:
        return contrastive_loss(input, target, temperature)

Training loop:

audio_encoder = AudioEncoder(
    audioenc_name="HTSAT",
    d_in=768,
    d_out=1024,
    sample_rate=16000,
    window_size=1024,
    hop_size=320,
    mel_bins=64,
    fmin=50,
    fmax=8000,
    classes_num=527
)

audio_encoder.requires_grad_(True)

text_encoder = TextEncoder(
    text_model="gpt2",
    d_out=1024,
    transformer_embed_dim=768
)
text_encoder.requires_grad_(True)

print("=================")

audio_optimizer = torch.optim.Adam(audio_encoder.parameters(), lr=0.001)
text_optimizer = torch.optim.Adam(text_encoder.parameters(), lr=0.001)

loss_function = ContrastiveLoss()
# loss_function = nn.CrossEntropyLoss()

use_device = "cpu"
epochs = 1
batch_size = 5
limit = 5

audio_encoder.to(device=use_device)
text_encoder.to(device=use_device)

epoch_avg_losses = []

text_encoder.train()
audio_encoder.train()
data_loader = DataLoader(dataset, batch_size=5)

for epoch in range(epoches):

    current_losses = []
    indices = tqdm(range(0, limit, batch_size), desc=f"Epoch: {epoch}")

    for audio_tensor, text_dict_raw in data_loader:
        # subsets = dataset[i: i+batch_size]
        text_input = {
            "input_ids": text_dict_raw["input_ids"].reshape(batch_size, -1), 
            "attention_mask": text_dict_raw["attention_mask"].reshape(batch_size, -1)}

        audio_optimizer.zero_grad()
        text_optimizer.zero_grad()

        audio_embeded, _ = audio_encoder(audio_tensor.reshape(batch_size, -1))
        text_embedded = text_encoder(text_input)

        loss_val = loss_function(audio_embeded, text_embedded)
        current_losses.append(loss_val.item())
        
        loss_val.backward(retain_graph=True)
        audio_optimizer.step()
        text_optimizer.step()        
        indices.set_postfix({"loss_val": loss_val.item()})
                         
    epoch_avg_losses.append(sum(current_losses) / len(current_losses))

I suspect that I did something wrong in my loss function so I tested this with the default CrossEntropyLoss, but the two encoder's parameters were not updated either. Can someone help?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Cannot retrain model #44

Cannot retrain model #44

hoanghapham commented Nov 18, 2024

Cannot retrain model #44

Cannot retrain model #44

Comments

hoanghapham commented Nov 18, 2024