import os
import torch
import logging
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    RobertaModel,
    BertModel,
    BertPreTrainedModel,
    RobertaConfig,
    RobertaForSequenceClassification,
    XLMRobertaForSequenceClassification,
)


class NegEntropy(object):
    """Negative entropy of the softmax distribution.

    Used as a confidence penalty: the value is close to 0 for peaked
    (confident) predictions and reaches -log(num_classes) for uniform ones,
    so adding it to the loss discourages over-confident outputs.
    """

    def __call__(self, outputs):
        probs = torch.softmax(outputs, dim=1)
        return torch.mean(torch.sum(probs.log() * probs, dim=1))
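
# A minimal usage sketch for NegEntropy (the tensors are illustrative
# assumptions, not values from the original code):
#
#   penalty = NegEntropy()
#   penalty(torch.tensor([[10.0, -10.0]]))  # confident -> ~ 0.0
#   penalty(torch.tensor([[0.0, 0.0]]))     # uniform   -> -log(2) ~ -0.6931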


class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class BertClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take the [CLS] token
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class SequenceClassificationFp16(BertPreTrainedModel):
    r"""
    https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py#L1122
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        if config.model_type == 'bert':
            self.bert = BertModel(config)
            self.classifier = BertClassificationHead(config)
            if config.secondary_num_labels is not None:
                # Temporarily swap num_labels so the secondary head is built
                # with its own output size, then restore the original value.
                temp_num_labels = config.num_labels
                config.num_labels = config.secondary_num_labels
                self.sec_classifier = BertClassificationHead(config)
                config.num_labels = temp_num_labels
                self.secondary_num_labels = config.secondary_num_labels
        elif config.model_type == 'xlm-roberta':
            self.roberta = RobertaModel(config)
            self.classifier = RobertaClassificationHead(config)
            if config.secondary_num_labels is not None:
                temp_num_labels = config.num_labels
                config.num_labels = config.secondary_num_labels
                self.sec_classifier = RobertaClassificationHead(config)
                config.num_labels = temp_num_labels
                self.secondary_num_labels = config.secondary_num_labels
        else:
            raise NotImplementedError()
        self.config = config
        self.init_weights()

    @torch.cuda.amp.autocast()
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        conf_penalty=None,
        marginal_entropy=None,
        sec_classifier=None,
        conf_coef=1,
    ):
        if self.config.model_type == "bert":
            outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        elif self.config.model_type == "xlm-roberta":
            outputs = self.roberta(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        sequence_output = outputs[0]
        num_of_label = self.num_labels
        if sec_classifier is None:
            logits = self.classifier(sequence_output)
        else:
            num_of_label = self.secondary_num_labels
            logits = self.sec_classifier(sequence_output)
        loss = None
        per_sample_loss = None
        if labels is not None:
            if num_of_label == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                per_sample_loss_fct = CrossEntropyLoss(reduction='none')
                loss = loss_fct(logits.view(-1, num_of_label), labels.view(-1))
                per_sample_loss = per_sample_loss_fct(logits.view(-1, num_of_label), labels.view(-1))
        if conf_penalty is not None:
            if loss is None:
                loss = 0
            neg_entropy_loss_func = NegEntropy()
            loss = loss + conf_coef * neg_entropy_loss_func(logits)
            if per_sample_loss is not None:
                # The batch-mean penalty (a scalar) is added uniformly to
                # every element of per_sample_loss.
                per_sample_loss = per_sample_loss + neg_entropy_loss_func(logits)
        if marginal_entropy is not None:
            if loss is None:
                loss = 0
            neg_entropy_loss_func = NegEntropy()
            # Entropy term computed on the softmax of the batch-averaged logits.
            loss = loss - neg_entropy_loss_func(torch.mean(logits, dim=0, keepdim=True))
        return loss, (per_sample_loss, logits, sequence_output)
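
# Informally, the objective assembled in the forward pass above is:
#
#   loss = CrossEntropy(logits, labels)
#          + conf_coef * NegEntropy(logits)              # if conf_penalty
#          - NegEntropy(mean(logits, dim=0))             # if marginal_entropy
#
# NegEntropy returns sum(p * log p), the negated entropy, so the first extra
# term penalizes over-confident per-example predictions, while the second
# adds the entropy of the batch-averaged prediction to the loss.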


class SequenceClassification(BertPreTrainedModel):
    r"""
    https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py#L1122
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        if config.model_type == 'bert':
            self.bert = BertModel(config)
            self.classifier = BertClassificationHead(config)
            if config.secondary_num_labels is not None:
                temp_num_labels = config.num_labels
                config.num_labels = config.secondary_num_labels
                self.sec_classifier = BertClassificationHead(config)
                config.num_labels = temp_num_labels
                self.secondary_num_labels = config.secondary_num_labels
        elif config.model_type == 'xlm-roberta':
            self.roberta = RobertaModel(config)
            self.classifier = RobertaClassificationHead(config)
            if config.secondary_num_labels is not None:
                temp_num_labels = config.num_labels
                config.num_labels = config.secondary_num_labels
                self.sec_classifier = RobertaClassificationHead(config)
                config.num_labels = temp_num_labels
                self.secondary_num_labels = config.secondary_num_labels
        else:
            raise NotImplementedError()
        self.config = config
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        conf_penalty=None,
        marginal_entropy=None,
        sec_classifier=None,
        conf_coef=1,
    ):
        if self.config.model_type == "bert":
            outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        elif self.config.model_type == "xlm-roberta":
            outputs = self.roberta(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        sequence_output = outputs[0]
        num_of_label = self.num_labels
        if sec_classifier is None:
            logits = self.classifier(sequence_output)
        else:
            num_of_label = self.secondary_num_labels
            logits = self.sec_classifier(sequence_output)
        loss = None
        per_sample_loss = None
        if labels is not None:
            if num_of_label == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                per_sample_loss_fct = CrossEntropyLoss(reduction='none')
                loss = loss_fct(logits.view(-1, num_of_label), labels.view(-1))
                per_sample_loss = per_sample_loss_fct(logits.view(-1, num_of_label), labels.view(-1))
        if conf_penalty is not None:
            if loss is None:
                loss = 0
            neg_entropy_loss_func = NegEntropy()
            # NegEntropy applies softmax internally, so it is fed the raw
            # logits; softmaxing first would apply softmax twice.
            loss = loss + conf_coef * neg_entropy_loss_func(logits)
        if marginal_entropy is not None:
            if loss is None:
                loss = 0
            # Entropy of the batch-averaged predicted distribution, computed
            # directly (routing mean probabilities through NegEntropy would
            # softmax them a second time).
            marginal_probs = torch.mean(logits.softmax(-1), dim=0, keepdim=True)
            loss = loss - torch.sum(marginal_probs * marginal_probs.log())
        return loss, (per_sample_loss, logits, sequence_output)
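
# Secondary-head sketch: passing any non-None value as sec_classifier routes
# sequence_output through self.sec_classifier and computes the loss over
# secondary_num_labels classes, so a single encoder can serve two labeling
# schemes.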


class TokenClassificationFp16(BertPreTrainedModel):
    r"""
    https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py#L1122
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        if config.model_type == 'bert':
            self.bert = BertModel(config)
        elif config.model_type == 'xlm-roberta':
            self.roberta = RobertaModel(config)
        else:
            raise NotImplementedError()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.config = config
        self.init_weights()

    @torch.cuda.amp.autocast()
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        conf_penalty=None,
    ):
        if self.config.model_type == "bert":
            outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        elif self.config.model_type == "xlm-roberta":
            outputs = self.roberta(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            neg_entropy_loss_func = NegEntropy()
            loss_fct_token = CrossEntropyLoss(reduction='none')
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
                if conf_penalty is not None:
                    loss = loss + neg_entropy_loss_func(active_logits)
                per_token_loss = loss_fct_token(logits.view(-1, self.num_labels), labels.view(-1))
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
                if conf_penalty is not None:
                    # Penalty added, consistent with the masked branch above
                    # and with TokenClassification below.
                    loss = loss + neg_entropy_loss_func(logits.view(-1, self.num_labels))
                per_token_loss = loss_fct_token(logits.view(-1, self.num_labels), labels.view(-1))
        else:
            return None, ([None], logits, sequence_output)
        return loss, (per_token_loss, logits, sequence_output)
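
# Masking sketch (illustrative shapes, not from the original code): with
# attention_mask [[1, 1, 0]], active_loss is [True, True, False], so only the
# two real token positions enter the mean cross-entropy; the padded position
# is dropped. per_token_loss, by contrast, is computed over all positions,
# padding included.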


class TokenClassification(BertPreTrainedModel):
    r"""
    https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py#L1122
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        if config.model_type == 'bert':
            self.bert = BertModel(config)
        elif config.model_type == 'xlm-roberta':
            self.roberta = RobertaModel(config)
        else:
            raise NotImplementedError()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.config = config
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        conf_penalty=None,
    ):
        if self.config.model_type == "bert":
            outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        elif self.config.model_type == "xlm-roberta":
            outputs = self.roberta(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            neg_entropy_loss_func = NegEntropy()
            loss_fct_token = CrossEntropyLoss(reduction='none')
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
                if conf_penalty is not None:
                    loss = loss + neg_entropy_loss_func(active_logits)
                per_token_loss = loss_fct_token(logits.view(-1, self.num_labels), labels.view(-1))
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
                if conf_penalty is not None:
                    loss = loss + neg_entropy_loss_func(logits.view(-1, self.num_labels))
                per_token_loss = loss_fct_token(logits.view(-1, self.num_labels), labels.view(-1))
        else:
            return None, ([None], logits, sequence_output)
        return loss, (per_token_loss, logits, sequence_output)


def load_model(
    task_name,
    config_name,
    tokenizer_name,
    model_name_or_path,
    num_labels,
    model_type,
    logger=None,
    do_lower_case=False,
    cache_dir=None,
    is_fp16=False,
    verbose=True,
    secondary_num_labels=None,
):
    if logger is None:
        logger = logging.getLogger(__name__)
    if verbose:
        logger.info("Loading config ...")
    config = AutoConfig.from_pretrained(
        config_name if config_name else model_name_or_path,
        num_labels=num_labels,
        finetuning_task=task_name,
        cache_dir=cache_dir,
    )
    config.secondary_num_labels = secondary_num_labels
    # The requested model_type must match the loaded config, allowing a
    # hyphenated variant to match its base type.
    try:
        assert model_type == config.model_type
    except AssertionError:
        assert model_type.split("-")[0] == config.model_type
    if verbose:
        logger.info("Loading tokenizer ...")
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name if tokenizer_name else model_name_or_path,
        do_lower_case=do_lower_case,
        cache_dir=cache_dir,
    )
    if verbose:
        logger.info("Loading model ...")
    model = get_model_class(task_name, is_fp16).from_pretrained(
        model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=cache_dir,
    )
    # Check whether saved optimizer/scheduler states exist. The optimizer and
    # scheduler objects are constructed by the caller, so only the saved
    # state dicts are loaded here for the caller to restore.
    optimizer = None
    scheduler = None
    if os.path.isfile(os.path.join(model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(model_name_or_path, "scheduler.pt")
    ):
        optimizer = torch.load(os.path.join(model_name_or_path, "optimizer.pt"))
        scheduler = torch.load(os.path.join(model_name_or_path, "scheduler.pt"))
    return config, tokenizer, model, optimizer, scheduler
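
# A minimal load_model usage sketch (the checkpoint name below is an
# illustrative assumption, not from the original code):
#
#   config, tokenizer, model, optimizer_state, scheduler_state = load_model(
#       task_name="xnli",
#       config_name=None,
#       tokenizer_name=None,
#       model_name_or_path="xlm-roberta-base",  # hypothetical checkpoint
#       num_labels=3,
#       model_type="xlm-roberta",
#   )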


def save_model(exp_dir, args, global_step, model, tokenizer, optimizer=None, scheduler=None, logger=None, prefix=""):
    if logger is None:
        logger = logging.getLogger(__name__)
    checkpoints = os.path.join(exp_dir, "checkpoints")
    output_dir = os.path.join(checkpoints, "{}_checkpoint-{}".format(prefix, global_step))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    model_to_save = (
        model.module if hasattr(model, "module") else model
    )  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.save(args, os.path.join(output_dir, "training_args.bin"))
    logger.info("Saving model checkpoint to %s", output_dir)
    if optimizer is not None:
        logger.info("Saving optimizer states to %s", output_dir)
        # Persist the state so load_model can find optimizer.pt in this
        # checkpoint directory.
        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
    if scheduler is not None:
        logger.info("Saving scheduler states to %s", output_dir)
        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
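
# To resume from a saved run, pass the checkpoint directory written by
# save_model (exp_dir/checkpoints/{prefix}_checkpoint-{step}) as
# model_name_or_path to load_model; that is where optimizer.pt and
# scheduler.pt are looked up.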


def get_model_class(task_name, is_fp16):
    """Map a task name to its model class (raises KeyError for unknown tasks).

    Note: the token-level models above (TokenClassification,
    TokenClassificationFp16) are not registered here and would need entries
    before token tasks can be loaded through load_model.
    """
    key = task_name + "_fp16" if is_fp16 else task_name
    model_dict = {
        "xnli": SequenceClassification,
        "pawsx": SequenceClassification,
        "xnli_fp16": SequenceClassificationFp16,
        "pawsx_fp16": SequenceClassificationFp16,
    }
    return model_dict[key]
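

# A minimal smoke test, runnable as `python models.py`. It is a sketch and
# not part of the original training pipeline: the tiny BertConfig sizes and
# label counts below are illustrative assumptions chosen so the model runs
# without downloading pretrained weights.
if __name__ == "__main__":
    from transformers import BertConfig

    config = BertConfig(
        vocab_size=100,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=64,
        num_labels=3,
    )
    config.secondary_num_labels = None  # expected by the classes above

    model = SequenceClassification(config)
    input_ids = torch.randint(0, 100, (4, 16))
    labels = torch.randint(0, 3, (4,))
    loss, (per_sample_loss, logits, _) = model(input_ids=input_ids, labels=labels)
    print("loss:", loss.item(), "logits shape:", tuple(logits.shape))

    # With the confidence penalty enabled, the loss also includes the
    # negative-entropy term scaled by conf_coef.
    loss_cp, _ = model(input_ids=input_ids, labels=labels, conf_penalty=True, conf_coef=0.5)
    print("loss with confidence penalty:", loss_cp.item())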