# processors.py

import os
import csv
import json
import torch
import logging
import dataclasses
import numpy as np
from dataclasses import dataclass
from typing import List, Optional, Union
from torch.utils.data import TensorDataset

@dataclass
class InputExample:
    """
    One raw (untokenized) example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: Untokenized text of the first sequence. It is the only text
            required for single-sequence tasks.
        text_b: (Optional) Untokenized text of the second sequence. Only
            needed for sequence-pair tasks.
        label: (Optional) Label of the example. Expected for train and dev
            examples, but not for test examples.
    """

    guid: str
    text_a: str
    text_b: Optional[str] = None
    label: Optional[str] = None

    def to_json_string(self):
        """Return the fields as a 2-space-indented JSON object followed by a newline."""
        as_dict = dataclasses.asdict(self)
        serialized = json.dumps(as_dict, indent=2)
        return f"{serialized}\n"


@dataclass(frozen=True)
class InputFeatures:
    """
    Immutable set of model-ready features for one example.

    Field names match the names of the corresponding model inputs.

    Args:
        input_ids: Indices of the input sequence tokens in the vocabulary.
        attention_mask: Mask that keeps attention away from padding positions.
            Values are in ``[0, 1]``: usually ``1`` for tokens that are NOT MASKED
            and ``0`` for MASKED (padded) tokens.
        token_type_ids: (Optional) Segment indices that tell the first and second
            portions of the input apart. Only some models use them.
        label: (Optional) Label for the input. Int for classification problems,
            float for regression problems.
    """

    input_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label: Optional[Union[int, float]] = None

    def to_json_string(self):
        """Return the fields as a single-line JSON object followed by a newline."""
        as_dict = dataclasses.asdict(self)
        serialized = json.dumps(as_dict)
        return f"{serialized}\n"


class DataProcessor:
    """Base class for data converters for sequence classification data sets.

    Subclasses override the ``get_*`` methods; every one of them raises
    ``NotImplementedError`` here.
    """

    def get_example_from_tensor_dict(self, tensor_dict):
        """Gets an example from a dict with tensorflow tensors.
        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        """
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        """Gets a collection of :class:`InputExample` for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of :class:`InputExample` for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of :class:`InputExample` for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    def tfds_map(self, example):
        """Replace an integer-like ``example.label`` by the matching label string.

        Some tensorflow_datasets datasets are not formatted the same way the GLUE
        datasets are. When the processor declares more than one label,
        ``example.label`` is converted with ``int(...)`` and used as an index into
        :meth:`get_labels`. The example is modified in place and returned.
        """
        labels = self.get_labels()
        if len(labels) > 1:
            example.label = labels[int(example.label)]
        return example

    @classmethod
    def _read_tsv(cls, input_file, encoding="utf-8-sig", quotechar=None):
        """Read a tab-separated file and return its rows as a list of lists of strings.

        Args:
            input_file: Path of the file to read.
            encoding: Text encoding used to open the file. The default
                ``utf-8-sig`` drops a leading byte-order mark if present.
            quotechar: Passed through to ``csv.reader``.
        """
        # newline="" hands raw line endings to the csv module, as its documentation
        # requires; otherwise CR/LF inside quoted fields is silently rewritten.
        with open(input_file, "r", encoding=encoding, newline="") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))


class XnliProcessor(DataProcessor):
    """Processor for the XNLI dataset.

    Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207

    Every ``data_file_info`` argument is a ``";"``-separated string whose first
    three fields are ``<file path>;<file encoding>;<language code>``.
    """

    def __init__(self, src_lang, dev_lang, tgt_lang, seed, percentage=100.0, n_shot=None, pad_token_label_id=-100):
        """Store the sampling and language configuration.

        Args:
            src_lang: ``";"``-separated language codes accepted by
                :meth:`get_train_examples`.
            dev_lang: Stored as-is; not read by this class.
            tgt_lang: Stored as-is; not read by this class.
            seed: Seed given to ``np.random.seed`` before the training rows are shuffled.
            percentage: Percentage of the training rows to keep.
            n_shot: Maximum number of training examples kept per label; ``None``
                disables the cap.
            pad_token_label_id: Stored as-is; not read by this class.
        """
        self.seed = seed
        self.n_class = len(self.get_labels())
        self.percentage = percentage
        self.n_shot = n_shot
        self.src_lang = src_lang
        self.dev_lang = dev_lang
        self.tgt_lang = tgt_lang
        self.output_mode = "classification"
        self.pad_token_label_id = pad_token_label_id

    @staticmethod
    def _split_file_info(data_file_info):
        """Return ``(address, encoding, lang)`` parsed from a ``";"``-separated string."""
        fields = data_file_info.split(";")
        return fields[0], fields[1], fields[2]

    @staticmethod
    def _resolve_max_length(tokenizer, max_length):
        """Return ``max_length``, falling back to the tokenizer's own limit when it is ``None``."""
        if max_length is not None:
            return max_length
        # `max_len` was renamed to `model_max_length` in newer transformers releases;
        # prefer the new attribute and keep the old one as a fallback.
        resolved = getattr(tokenizer, "model_max_length", None)
        if resolved is None:
            resolved = tokenizer.max_len
        return resolved

    def get_train_examples(self, data_file_info):
        """See base class.

        Reads the TSV named in ``data_file_info``, skips row 0, shuffles the
        remaining rows with ``self.seed`` and keeps examples until
        ``percentage`` percent of the rows have been collected. Columns 0, 1
        and 2 of a row are used as ``text_a``, ``text_b`` and label. Rows with
        blank text are skipped, and at most ``n_shot`` examples are kept per
        label when ``n_shot`` is set.

        Note:
            ``np.random.seed`` is called, so the global NumPy random state is reset.
        """
        address, encoding, lang = self._split_file_info(data_file_info)
        assert lang in self.src_lang.split(";")
        lines = self._read_tsv(input_file=address, encoding=encoding)
        # Select x% data from the sample
        np.random.seed(self.seed)
        data_idx = list(range(1, len(lines)))  # row 0 is never sampled
        np.random.shuffle(data_idx)
        data_split_size = int((float(len(lines) - 1) * self.percentage) // 100)
        # Select 1/5/n shot samples
        class_dict = {}
        examples = []
        for _i in data_idx:
            line = lines[_i]
            guid = "%s-%s" % ("train", _i)
            text_a = line[0]
            text_b = line[1]
            if str(text_a).strip() == "" or str(text_b).strip() == "":
                continue
            # Some rows spell the label "contradictory"; normalise it.
            label = "contradiction" if line[2] == "contradictory" else line[2]
            if class_dict.get(label, 0) == self.n_shot:
                continue
            class_dict[label] = class_dict.get(label, 0) + 1
            assert isinstance(text_a, str), f"Training input {text_a} is not a string"
            assert isinstance(text_b, str), f"Training input {text_b} is not a string"
            assert isinstance(label, str), f"Training label {label} is not a string"
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
            # NOTE(review): when data_split_size is 0 this check never fires and
            # every eligible row is kept - confirm whether that is intended.
            if len(examples) == data_split_size:
                break
        return examples

    def get_dev_examples(self, data_file_info, mode):
        """See base class.

        Reads the TSV named in ``data_file_info``, skips row 0 and keeps the rows
        whose column 0 equals the language code in ``data_file_info``. Columns 6
        and 7 are used as ``text_a`` and ``text_b``, column 1 as the label.

        Args:
            data_file_info: ``<file path>;<file encoding>;<language code>``.
            mode: Prefix used for the example guids (``"<mode>-<row index>"``).
        """
        address, encoding, lang = self._split_file_info(data_file_info)
        lines = self._read_tsv(input_file=address, encoding=encoding)
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            language = line[0]
            if language != lang:
                continue
            guid = "%s-%s" % (mode, i)
            text_a = line[6]
            text_b = line[7]
            label = line[1]
            assert isinstance(text_a, str), f"Training input {text_a} is not a string"
            assert isinstance(text_b, str), f"Training input {text_b} is not a string"
            assert isinstance(label, str), f"Training label {label} is not a string"
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def get_test_examples(self, data_file_info, mode):
        """See base class. Test files are read exactly like dev files."""
        return self.get_dev_examples(data_file_info, mode)

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]

    def _cached_features_path(self, args, mode, data_file_info):
        """Return the cache file path for this combination of settings under ``args.output_dir``."""
        address, encoding, lang = self._split_file_info(data_file_info)
        return os.path.join(
            args.output_dir,
            "cached_{}_{}_{}_{}_{}_{}_{}_{}".format(
                args.tokenizer_name,
                mode if mode != "train" else mode + '-' + str(self.percentage),
                list(filter(None, args.model_name_or_path.split("/"))).pop(),
                str(args.max_seq_length),
                os.path.split(address)[-1],
                encoding,
                lang,
                str(self.n_shot)
            ),
        )

    def _examples_for_mode(self, args, mode, data_file_info):
        """Read the examples for ``mode`` (``train``/``dev``/``test``/``support``).

        For ``support`` the split named by ``args.support_split`` is read.

        Raises:
            NotImplementedError: for an unknown mode or support split.
        """
        if mode == "train":
            return self.get_train_examples(data_file_info)
        if mode == "dev":
            return self.get_dev_examples(data_file_info, mode)
        if mode == "test":
            return self.get_test_examples(data_file_info, mode)
        if mode == "support":
            if args.support_split == "train":
                return self.get_train_examples(data_file_info)
            if args.support_split == "dev" or args.support_split == "test":
                return self.get_dev_examples(data_file_info, mode)
            raise NotImplementedError
        raise NotImplementedError("Mode ({}) is invalid.".format(mode))

    def _features_to_dataset(self, features, max_seq_length):
        """Stack the features into tensors and wrap them in a ``TensorDataset``.

        Raises:
            KeyError: if ``self.output_mode`` is neither ``classification`` nor ``regression``.
        """
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)

        if features[0].token_type_ids is None:
            # For RoBERTa (a potential bug!)
            all_token_type_ids = torch.tensor([[0] * max_seq_length for _ in features], dtype=torch.long)
        else:
            all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

        if self.output_mode == "classification":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        elif self.output_mode == "regression":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
        else:
            # Previously this fell through and failed later with an unbound name.
            raise KeyError(self.output_mode)

        return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

    def load_and_cache_examples(self, args, tokenizer, mode, data_file_info, logger=None):
        """Build (or load from cache) the features for ``mode`` and return them as a ``TensorDataset``.

        Args:
            args: Namespace read for ``local_rank``, ``output_dir``, ``tokenizer_name``,
                ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
                ``task_name``, ``model_type`` and, for ``mode == "support"``,
                ``support_split``.
            tokenizer: Tokenizer handed to :meth:`convert_examples_to_features`.
            mode: One of ``train``, ``dev``, ``test`` or ``support``.
            data_file_info: ``<file path>;<file encoding>;<language code>``.
            logger: Logger to report progress on; defaults to this module's logger.

        Returns:
            ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)``.

        Raises:
            NotImplementedError: for an unknown mode or support split.
        """
        if logger is None:
            logger = logging.getLogger(__name__)
        if args.local_rank not in [-1, 0] and mode == "train":
            torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

        # Load data features from cache or dataset file
        cached_features_file = self._cached_features_path(args, mode, data_file_info)
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: torch.load unpickles the file; only use cache directories you trust.
            features = torch.load(cached_features_file)
        else:
            logger.info("Creating features from %s file at %s", data_file_info, args.output_dir)
            label_list = self.get_labels()
            if args.task_name in ["mnli", "mnli-mm"] and args.model_type in ["xlm-roberta", "roberta"]:
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]

            examples = self._examples_for_mode(args, mode, data_file_info)

            features = self.convert_examples_to_features(
                examples, tokenizer,
                label_list=label_list,
                max_length=args.max_seq_length,
                output_mode=self.output_mode,
                pad_on_left=False,
                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                pad_token_segment_id=0,
                logger=logger
            )
            if args.local_rank in [-1, 0]:
                logger.info("Saving features into cached file %s", cached_features_file)
                torch.save(features, cached_features_file)

        if args.local_rank == 0 and mode == "train":
            torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

        # Convert to Tensors and build dataset
        return self._features_to_dataset(features, args.max_seq_length)

    def convert_examples_to_features(
        self,
        examples,
        tokenizer,
        max_length,
        label_list,
        output_mode,
        pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        mask_padding_with_zero=True,
        logger=None
    ):
        """Tokenize ``examples`` one by one and pad each to exactly ``max_length`` tokens.

        Args:
            examples: Sequence of :class:`InputExample`.
            tokenizer: Object providing ``encode_plus``.
            max_length: Target length; ``None`` uses the tokenizer's own limit.
            label_list: Labels; a label's position in the list becomes its class id.
            output_mode: ``"classification"`` or ``"regression"``.
            pad_on_left: Pad at the start of the sequence instead of the end.
            pad_token: Id used to pad ``input_ids``.
            pad_token_segment_id: Id used to pad ``token_type_ids``.
            mask_padding_with_zero: When true, real tokens get mask ``1`` and
                padding ``0``; when false the two values are swapped.
            logger: Logger for progress messages; defaults to this module's logger.

        Returns:
            A list of :class:`InputFeatures`, one per example.

        Raises:
            KeyError: for an unknown ``output_mode`` or a label missing from ``label_list``.
        """
        if logger is None:
            logger = logging.getLogger(__name__)
        max_length = self._resolve_max_length(tokenizer, max_length)

        label_map = {label: i for i, label in enumerate(label_list)}
        real_mask = 1 if mask_padding_with_zero else 0
        pad_mask = 0 if mask_padding_with_zero else 1
        total = len(examples)

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d/%d", ex_index, total)

            inputs = tokenizer.encode_plus(
                example.text_a, example.text_b,
                add_special_tokens=True, max_length=max_length,
                return_token_type_ids=True, truncation='longest_first'
            )
            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [real_mask] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([pad_mask] * padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + ([pad_mask] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
            assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
                len(attention_mask), max_length
            )
            assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
                len(token_type_ids), max_length
            )

            if output_mode == "classification":
                label = label_map[example.label]
            elif output_mode == "regression":
                label = float(example.label)
            else:
                raise KeyError(output_mode)

            features.append(
                InputFeatures(
                    input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label
                )
            )

        return features

    def black_box_convert_examples_to_features_fast(
        self,
        examples,
        tokenizer,
        max_length,
        label_list,
        output_mode,
        logger=None
    ):
        """Tokenize all ``examples`` in a single tokenizer call, padded to ``max_length``.

        Unlike :meth:`convert_examples_to_features`, an example whose label is
        ``None`` is accepted and yields a feature with ``label=None``.

        Args:
            examples: Sequence of :class:`InputExample`.
            tokenizer: Callable tokenizer accepting a list of ``(text_a, text_b)`` pairs.
            max_length: Target length; ``None`` uses the tokenizer's own limit.
            label_list: Labels; a label's position in the list becomes its class id.
            output_mode: ``"classification"`` or ``"regression"``.
            logger: Logger for progress messages; defaults to this module's logger.

        Returns:
            A list of :class:`InputFeatures`, one per example.

        Raises:
            KeyError: for an unknown ``output_mode`` or a label missing from ``label_list``.
        """
        if logger is None:
            logger = logging.getLogger(__name__)
        max_length = self._resolve_max_length(tokenizer, max_length)

        label_map = {label: i for i, label in enumerate(label_list)}

        def label_from_example(example):
            if example.label is None:
                return None
            if output_mode == "classification":
                return label_map[example.label]
            elif output_mode == "regression":
                return float(example.label)
            raise KeyError(output_mode)

        labels = [label_from_example(example) for example in examples]

        batch_encoding = tokenizer(
            [(example.text_a, example.text_b) for example in examples],
            max_length=max_length,
            padding="max_length",
            truncation='longest_first'
        )
        # Each key of batch_encoding holds one entry per example, in example order;
        # every key is forwarded to InputFeatures as a keyword argument.
        total = len(examples)
        features = []
        for i, label in enumerate(labels):
            if i % 10000 == 0:
                logger.info("Writing example : %s/%s", i, total)
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}
            features.append(InputFeatures(**inputs, label=label))

        return features
    

class PawsXProcessor(DataProcessor):
    """Processor for the PAWS-X dataset.

    Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207

    Every ``data_file_info`` argument is a ``";"``-separated string whose first
    three fields are ``<file path>;<file encoding>;<language code>``.
    """

    def __init__(self, src_lang, dev_lang, tgt_lang, seed, percentage=100.0, n_shot=None, pad_token_label_id=-100):
        """Store the sampling and language configuration.

        Args:
            src_lang: Stored as-is; not read by this class.
            dev_lang: Stored as-is; not read by this class.
            tgt_lang: Stored as-is; not read by this class.
            seed: Seed given to ``np.random.seed`` before the training rows are shuffled.
            percentage: Percentage of the training rows to keep.
            n_shot: Maximum number of training examples kept per label; ``None``
                disables the cap.
            pad_token_label_id: Stored as-is; not read by this class.
        """
        self.seed = seed
        self.n_class = len(self.get_labels())
        self.percentage = percentage
        self.n_shot = n_shot
        self.src_lang = src_lang
        self.dev_lang = dev_lang
        self.tgt_lang = tgt_lang
        self.output_mode = "classification"
        self.pad_token_label_id = pad_token_label_id

    @staticmethod
    def _split_file_info(data_file_info):
        """Return ``(address, encoding, lang)`` parsed from a ``";"``-separated string."""
        fields = data_file_info.split(";")
        return fields[0], fields[1], fields[2]

    @staticmethod
    def _resolve_max_length(tokenizer, max_length):
        """Return ``max_length``, falling back to the tokenizer's own limit when it is ``None``."""
        if max_length is not None:
            return max_length
        # `max_len` was renamed to `model_max_length` in newer transformers releases;
        # prefer the new attribute and keep the old one as a fallback.
        resolved = getattr(tokenizer, "model_max_length", None)
        if resolved is None:
            resolved = tokenizer.max_len
        return resolved

    def get_train_examples(self, data_file_info):
        """See base class.

        Reads the TSV named in ``data_file_info``, skips row 0, shuffles the
        remaining rows with ``self.seed`` and keeps examples until
        ``percentage`` percent of the rows have been collected. Column 0 of a
        row is used in the guid, columns 1 and 2 as ``text_a`` and ``text_b``,
        column 3 as the label (``"0"`` or ``"1"``). At most ``n_shot`` examples
        are kept per label when ``n_shot`` is set.

        Raises:
            AssertionError: if a label is not ``"0"``/``"1"``, or if the number of
                collected examples differs from the requested split size.

        Note:
            ``np.random.seed`` is called, so the global NumPy random state is reset.
        """
        address, encoding, _lang = self._split_file_info(data_file_info)
        lines = self._read_tsv(input_file=address, encoding=encoding)
        # Select x% data from the sample
        np.random.seed(self.seed)
        data_idx = list(range(1, len(lines)))  # row 0 is never sampled
        np.random.shuffle(data_idx)
        data_split_size = int((float(len(lines) - 1) * self.percentage) // 100)
        # Select 1/5/n shot samples
        class_dict = {}
        examples = []
        for _i in data_idx:
            line = lines[_i]
            guid = "%s-%s" % ("train", line[0])
            text_a = line[1]
            text_b = line[2]
            label = line[3]
            assert label == "0" or label == "1"
            if class_dict.get(label, 0) == self.n_shot:
                continue
            class_dict[label] = class_dict.get(label, 0) + 1
            assert isinstance(text_a, str), f"Training input {text_a} is not a string"
            assert isinstance(text_b, str), f"Training input {text_b} is not a string"
            assert isinstance(label, str), f"Training label {label} is not a string"
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
            if len(examples) == data_split_size:
                break
        # Explicit check (instead of `assert` inside a bare try/except) so that it
        # still runs under `python -O`; the exception type and message are unchanged.
        if len(examples) != data_split_size:
            raise AssertionError("Couldn't find {} of samples, sample read : {}, n_shot {}, {}% data : {}".format(
                            data_split_size, len(examples), self.n_shot, self.percentage, data_split_size
                        )
                    )
        return examples

    def get_dev_examples(self, data_file_info, mode):
        """See base class.

        Reads the TSV named in ``data_file_info`` and turns every row after row 0
        into an example. Column 0 is used in the guid, columns 1 and 2 as
        ``text_a`` and ``text_b``, column 3 as the label (``"0"`` or ``"1"``).

        Args:
            data_file_info: ``<file path>;<file encoding>;<language code>``.
            mode: Prefix used for the example guids (``"<mode>-<column 0>"``).
        """
        address, encoding, _lang = self._split_file_info(data_file_info)
        lines = self._read_tsv(input_file=address, encoding=encoding)
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            # The guid prefix follows the requested mode (it used to be hard-coded
            # to "train"), matching XnliProcessor.get_dev_examples.
            guid = "%s-%s" % (mode, line[0])
            text_a = line[1]
            text_b = line[2]
            label = line[3]
            assert label == "0" or label == "1"
            assert isinstance(text_a, str), f"Training input {text_a} is not a string"
            assert isinstance(text_b, str), f"Training input {text_b} is not a string"
            assert isinstance(label, str), f"Training label {label} is not a string"
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def get_test_examples(self, data_file_info, mode):
        """See base class. Test files are read exactly like dev files."""
        return self.get_dev_examples(data_file_info, mode)

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _cached_features_path(self, args, mode, data_file_info):
        """Return the cache file path for this combination of settings under ``args.output_dir``."""
        address, encoding, lang = self._split_file_info(data_file_info)
        return os.path.join(
            args.output_dir,
            "cached_{}_{}_{}_{}_{}_{}_{}_{}".format(
                args.tokenizer_name,
                mode if mode != "train" else mode + '-' + str(self.percentage),
                list(filter(None, args.model_name_or_path.split("/"))).pop(),
                str(args.max_seq_length),
                os.path.split(address)[-1],
                encoding,
                lang,
                str(self.n_shot)
            ),
        )

    def _examples_for_mode(self, args, mode, data_file_info):
        """Read the examples for ``mode`` (``train``/``dev``/``test``/``support``).

        For ``support`` the split named by ``args.support_split`` is read.

        Raises:
            NotImplementedError: for an unknown mode or support split.
        """
        if mode == "train":
            return self.get_train_examples(data_file_info)
        if mode == "dev":
            return self.get_dev_examples(data_file_info, mode)
        if mode == "test":
            return self.get_test_examples(data_file_info, mode)
        if mode == "support":
            if args.support_split == "train":
                return self.get_train_examples(data_file_info)
            if args.support_split == "dev":
                return self.get_dev_examples(data_file_info, mode)
            if args.support_split == "test":
                return self.get_test_examples(data_file_info, mode)
            raise NotImplementedError("Mode ({}) is invalid.".format(mode))
        # An unknown mode used to fall through and fail later with an unbound
        # `examples`; reject it explicitly, as XnliProcessor does.
        raise NotImplementedError("Mode ({}) is invalid.".format(mode))

    def _features_to_dataset(self, features, max_seq_length):
        """Stack the features into tensors and wrap them in a ``TensorDataset``.

        Raises:
            KeyError: if ``self.output_mode`` is neither ``classification`` nor ``regression``.
        """
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)

        if features[0].token_type_ids is None:
            # For RoBERTa (a potential bug!)
            all_token_type_ids = torch.tensor([[0] * max_seq_length for _ in features], dtype=torch.long)
        else:
            all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

        if self.output_mode == "classification":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        elif self.output_mode == "regression":
            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
        else:
            # Previously this fell through and failed later with an unbound name.
            raise KeyError(self.output_mode)

        return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

    def load_and_cache_examples(self, args, tokenizer, mode, data_file_info, logger=None):
        """Build (or load from cache) the features for ``mode`` and return them as a ``TensorDataset``.

        Args:
            args: Namespace read for ``local_rank``, ``output_dir``, ``tokenizer_name``,
                ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
                ``task_name``, ``model_type`` and, for ``mode == "support"``,
                ``support_split``.
            tokenizer: Tokenizer handed to :meth:`convert_examples_to_features`.
            mode: One of ``train``, ``dev``, ``test`` or ``support``.
            data_file_info: ``<file path>;<file encoding>;<language code>``.
            logger: Logger to report progress on; defaults to this module's logger.

        Returns:
            ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)``.

        Raises:
            NotImplementedError: for an unknown mode or support split.
        """
        if logger is None:
            logger = logging.getLogger(__name__)
        if args.local_rank not in [-1, 0] and mode == "train":
            torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

        # Load data features from cache or dataset file
        cached_features_file = self._cached_features_path(args, mode, data_file_info)
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: torch.load unpickles the file; only use cache directories you trust.
            features = torch.load(cached_features_file)
        else:
            logger.info("Creating features from %s file at %s", data_file_info, args.output_dir)
            label_list = self.get_labels()
            if (
                args.task_name in ["mnli", "mnli-mm"]
                and args.model_type in ["roberta"]
                and len(label_list) > 2  # this processor has two labels; never index past them
            ):
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]

            examples = self._examples_for_mode(args, mode, data_file_info)

            features = self.convert_examples_to_features(
                examples, tokenizer,
                label_list=label_list,
                max_length=args.max_seq_length,
                output_mode=self.output_mode,
                pad_on_left=False,
                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                pad_token_segment_id=0,
                logger=logger
            )
            if args.local_rank in [-1, 0]:
                logger.info("Saving features into cached file %s", cached_features_file)
                torch.save(features, cached_features_file)

        if args.local_rank == 0 and mode == "train":
            torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

        # Convert to Tensors and build dataset
        return self._features_to_dataset(features, args.max_seq_length)

    def convert_examples_to_features(
        self,
        examples,
        tokenizer,
        max_length,
        label_list,
        output_mode,
        pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        mask_padding_with_zero=True,
        logger=None
    ):
        """Tokenize ``examples`` one by one and pad each to exactly ``max_length`` tokens.

        Args:
            examples: Sequence of :class:`InputExample`.
            tokenizer: Object providing ``encode_plus``.
            max_length: Target length; ``None`` uses the tokenizer's own limit.
            label_list: Labels; a label's position in the list becomes its class id.
            output_mode: ``"classification"`` or ``"regression"``.
            pad_on_left: Pad at the start of the sequence instead of the end.
            pad_token: Id used to pad ``input_ids``.
            pad_token_segment_id: Id used to pad ``token_type_ids``.
            mask_padding_with_zero: When true, real tokens get mask ``1`` and
                padding ``0``; when false the two values are swapped.
            logger: Logger for progress messages; defaults to this module's logger.

        Returns:
            A list of :class:`InputFeatures`, one per example.

        Raises:
            KeyError: for an unknown ``output_mode`` or a label missing from ``label_list``.
        """
        if logger is None:
            logger = logging.getLogger(__name__)
        max_length = self._resolve_max_length(tokenizer, max_length)

        label_map = {label: i for i, label in enumerate(label_list)}
        real_mask = 1 if mask_padding_with_zero else 0
        pad_mask = 0 if mask_padding_with_zero else 1
        total = len(examples)

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d/%d", ex_index, total)

            inputs = tokenizer.encode_plus(
                example.text_a, example.text_b,
                add_special_tokens=True, max_length=max_length,
                return_token_type_ids=True, truncation='longest_first'
            )
            input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [real_mask] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([pad_mask] * padding_length) + attention_mask
                token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + ([pad_mask] * padding_length)
                token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

            assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
            assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
                len(attention_mask), max_length
            )
            assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
                len(token_type_ids), max_length
            )

            if output_mode == "classification":
                label = label_map[example.label]
            elif output_mode == "regression":
                label = float(example.label)
            else:
                raise KeyError(output_mode)

            features.append(
                InputFeatures(
                    input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label
                )
            )

        return features


# Maps a task name to the processor class that handles it.
processor_dict = dict(xnli=XnliProcessor, pawsx=PawsXProcessor)

def simple_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    Both arguments are converted with ``np.asarray``, so lists and tuples are
    accepted as well as NumPy arrays.
    """
    return (np.asarray(preds) == np.asarray(labels)).mean()


def compute_metrics(task_name, preds, labels):
    """Return ``{"acc": accuracy}`` for the ``xnli`` and ``pawsx`` tasks.

    Everything from the first ``"_fp16"`` onwards is dropped from ``task_name``
    before the task is looked up.

    Raises:
        AssertionError: if ``preds`` and ``labels`` differ in length.
        KeyError: if the (stripped) task name is neither ``xnli`` nor ``pawsx``.
    """
    base_task = task_name.split("_fp16")[0]
    n_preds, n_labels = len(preds), len(labels)
    assert n_preds == n_labels, f"Predictions and labels have mismatched lengths {n_preds} and {n_labels}"
    # Both supported tasks report plain accuracy.
    if base_task not in ("xnli", "pawsx"):
        raise KeyError(base_task)
    return {"acc": simple_accuracy(preds, labels)}