From bd66e65f8d9fc2161f622c5b024f7767158f4386 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 21:58:48 +0100 Subject: [PATCH 1/8] Improve high_level module docstrings --- src/langsfer/high_level.py | 108 +++++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 34 deletions(-) diff --git a/src/langsfer/high_level.py b/src/langsfer/high_level.py index d7d8d47..1d06b52 100644 --- a/src/langsfer/high_level.py +++ b/src/langsfer/high_level.py @@ -1,7 +1,20 @@ -"""This module contains high-level user functions -for well-known methods described in papers and publications. -These are meant to make it easier for users who don't necessarily -want or need to care about all the details of the package. +""" +This module provides high-level user functions for well-known methods +described in research papers and publications, facilitating their +application for cross-lingual transfer learning in language models. + +These functions abstract away the complex details of the underlying methods +to offer an easy-to-use interface for users who want to implement language +model transfer techniques without needing to dive into the low-level +implementation details. + +The module supports various strategies such as: +- WECHSEL: Cross-lingual transfer using pre-trained embeddings and a bilingual dictionary. +- CLP Transfer: A cross-lingual and progressive transfer method for efficient language model training. +- FOCUS: Specializing pre-trained multilingual models through efficient token combinations using Sparsemax. + +Functions in this module are designed to work with tokenizers and pre-trained +embeddings from various models, including FastText and Transformers. """ import os @@ -46,30 +59,35 @@ def wechsel( Described in [WECHSEL: Effective initialization of subword embeddings for cross-lingual transfer of monolingual language models.](https://arxiv.org/abs/2112.06598) Minixhofer, Benjamin, Fabian Paischer, and Navid Rekabsaz. arXiv preprint arXiv:2112.06598 (2021). - It is a cross-lingual language transfer method that efficiently initializes the embedding parameters of a language model in a target language using the embedding parameters from an existing model in a source language, facilitating more efficient training in the new language. + The WECHSEL method efficiently initializes the embedding parameters of a language model in a target language + by leveraging the embedding parameters of a pre-trained model in a source language. This facilitates more efficient + training in the target language by aligning and transferring knowledge from the source language. The method requires as input: - - a tokenizer in the source language, - - a pre-trained language model in the source language, - - a tokenizer in the target language, + - tokenizer of the source language model, + - pre-trained language model as source, + - tokenizer of the target language model, - 2 monolingual fastText embeddings for source and target languages respectively. - They can be obtained in one of 2 ways: + They can be obtained in one of 2 ways: - using pre-trained fastText embeddings, - trainining fastText embeddings from scratch. Args: - source_tokenizer: Source model's tokenizer. - source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer. - target_tokenizer: Target model's tokenizer. - target_auxiliary_embeddings: FastText auxiliary embeddings in the target language. - source_auxiliary_embeddings: FastText auxiliary embeddings in the source language. 
- bilingual_dictionary: Dictionary mapping words in source language to words in target language. - bilingual_dictionary_file: Path to a bilingual dictionary file. - temperature: Softmax temperature to apply for weight computation. - k: Number of closest / most similar tokens to consider for weight computation. - batch_size: Size of the batches of non-overlapping token computations. + source_tokenizer: Tokenizer of the source language model. + source_embeddings_matrix: 2D matrix containing the weights of the source model's embedding layer. + target_tokenizer: Tokenizer of the target language model. + target_auxiliary_embeddings: FastText auxiliary embeddings for the target language. + source_auxiliary_embeddings: FastText auxiliary embeddings for the source language. + bilingual_dictionary: Optional dictionary mapping source language words to target language words. + bilingual_dictionary_file: Optional path to a file containing a bilingual dictionary. + temperature: Softmax temperature used to adjust weight computation. + k: Number of closest tokens to consider for weight computation. + batch_size: Number of tokens to process in each batch for non-overlapping token computations. + + Returns: + The embedding initializer object for the target model, based on WECHSEL. """ embeddings_initializer = WeightedAverageEmbeddingsInitialization( source_tokenizer=source_tokenizer, @@ -105,19 +123,26 @@ def clp_transfer( Described in [CLP-Transfer: Efficient language model training through cross-lingual and progressive transfer learning.](https://arxiv.org/abs/2301.09626) Ostendorff, Malte, and Georg Rehm. arXiv preprint arXiv:2301.09626 (2023). + CLP Transfer is a technique that combines cross-lingual and progressive transfer learning for efficient training + of language models. The method initializes the target embeddings by transferring knowledge from a source model + through embeddings and auxiliary information. + The method requires as input: - - a tokenizer in the source language, - - a pre-trained language model in the source language, - - a tokenizer in the target language, - - a helper pre-trained language model in the target language. + - tokenizer of the source language model, + - pre-trained language model as source, + - tokenizer of the target language model, + - helper pre-trained language model in the target language. Args: - source_tokenizer: Source model's tokenizer. - source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer. - target_tokenizer: Target model's tokenizer. - target_auxiliary_embeddings: Auxiliary embeddingsin the target language. - batch_size: Size of the batches of non-overlapping token computations. + source_tokenizer: Tokenizer of the source language model. + source_embeddings_matrix: 2D matrix containing the weights of the source model's embedding layer. + target_tokenizer: Tokenizer of the target language model. + target_auxiliary_embeddings: FastText auxiliary embeddings for the target language. + batch_size: Number of tokens to process in each batch for non-overlapping token computations. + + Returns: + The embedding initializer object for the target model, based on CLP-Transfer. """ embeddings_initializer = WeightedAverageEmbeddingsInitialization( source_tokenizer=source_tokenizer, @@ -146,13 +171,28 @@ def focus( Described in [FOCUS: Effective Embedding Initialization for Specializing Pretrained Multilingual Models on a Single Language.](https://arxiv.org/abs/2305.14481) Dobler, Konstantin, and Gerard de Melo. 
arXiv preprint arXiv:2305.14481 (2023). + The FOCUS method specializes pre-trained multilingual models by efficiently combining overlapping token embeddings + using Sparsemax weights. It utilizes auxiliary embeddings and calculates the target language embeddings based + on the source embeddings and the overlap between the source and target token sets. + + The method requires as input: + + - tokenizer of the source language model, + - pre-trained language model as source, + - tokenizer of the target language model, + - 2 monolingual fastText embeddings for source and target languages respectively + trained from scratch for both languages using pre-tokenized text with the respective language tokenizer. + Args: - source_tokenizer: Source model's tokenizer. - source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer. - target_tokenizer: Target model's tokenizer. - target_auxiliary_embeddings: FastText auxiliary embeddings in the target language. - source_auxiliary_embeddings: FastText auxiliary embeddings in the source language. - batch_size: Size of the batches of non-overlapping token computations. + source_tokenizer: Tokenizer of the source language model. + source_embeddings_matrix: 2D matrix containing the weights of the source model's embedding layer. + target_tokenizer: Tokenizer of the target language model. + target_auxiliary_embeddings: FastText auxiliary embeddings for the target language. + source_auxiliary_embeddings: FastText auxiliary embeddings for the source language. + batch_size: Number of tokens to process in each batch for non-overlapping token computations. + + Returns: + The embedding initializer object for the target model, based on FOCUS. """ embeddings_initializer = WeightedAverageEmbeddingsInitialization( source_tokenizer=source_tokenizer, From 058d54d257b462b866fc708faa1c8407c3e521b6 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 22:00:48 +0100 Subject: [PATCH 2/8] Add docstrings to initialization module --- src/langsfer/initialization.py | 119 ++++++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/src/langsfer/initialization.py b/src/langsfer/initialization.py index ac62ca8..ed01ca9 100644 --- a/src/langsfer/initialization.py +++ b/src/langsfer/initialization.py @@ -1,3 +1,19 @@ +"""This module provides classes for embedding initialization methods used +in cross-lingual transfer learning and language model specialization. + +These classes implement various strategies for initializing the embedding +layers of target language models based on source language embeddings. +They provide flexible and efficient ways to transfer knowledge from +pre-trained models in one language to models in another language. + +Classes in this module allow for fine-tuned control over the embedding +initialization process through several configurable strategies: +- Alignment strategies (e.g., bilingual dictionaries, identity alignment) +- Similarity measures (e.g., cosine similarity) +- Weight computation techniques (e.g., softmax, sparsemax) +- Token overlap strategies (e.g., exact match, fuzzy match) +""" + import logging from abc import ABC, abstractmethod @@ -24,13 +40,41 @@ class EmbeddingInitializer(ABC): + """Abstract base class for initializing embeddings. + + This class serves as the base for various embedding initialization strategies. + Subclasses should implement the `initialize` method to compute embeddings based on specific strategies. 
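+
+    Example:
+        A minimal illustration using the random-initialization subclass defined
+        below, with a small dummy source matrix and the German target tokenizer
+        used in the project README (downloaded from the Hugging Face Hub):
+
+        >>> import numpy as np
+        >>> from transformers import AutoTokenizer
+        >>> target_tokenizer = AutoTokenizer.from_pretrained("benjamin/roberta-base-wechsel-german")
+        >>> initializer = RandomEmbeddingsInitialization(np.random.rand(10, 8), target_tokenizer)
+        >>> target_embeddings_matrix = initializer.initialize(seed=16)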
+ """ + @abstractmethod def initialize( self, *, seed: int | None = None, show_progress: bool = False - ) -> NDArray: ... + ) -> NDArray: + """Abstract method to initialize the embeddings of target tokens. + + This method should be implemented by subclasses to compute and return embeddings. + + Args: + seed: An optional seed for the random number generator. + show_progress: If True, displays a progress bar for the initialization process. + + Returns: + A 2D array containing the initialized target embeddings. + """ + ... class RandomEmbeddingsInitialization(EmbeddingInitializer): + """Random initialization of embeddings using a normal distribution. + + This class initializes embeddings by generating random values based on the mean and + standard deviation of the source embeddings. + + Args: + source_embeddings_matrix: A 2D array containing the source embeddings matrix. + target_tokenizer: A tokenizer for the target language. + """ + def __init__( self, source_embeddings_matrix: NDArray, @@ -43,6 +87,17 @@ def __init__( def initialize( self, *, seed: int | None = None, show_progress: bool = False ) -> NDArray: + """Initialize the target embeddings using random values. + + Generates a random target embeddings matrix based on the mean and standard deviation of the source embeddings matrix. + + Args: + seed: An optional seed for the random number generator. + show_progress: If True, displays a progress bar for the initialization process. + + Returns: + A 2D array containing the randomly initialized target embeddings. + """ rng = np.random.default_rng(seed) target_embeddings_matrix = rng.normal( np.mean(self.source_embeddings_matrix, axis=0), @@ -57,6 +112,25 @@ def initialize( class WeightedAverageEmbeddingsInitialization(EmbeddingInitializer): + """Weighted average initialization of embeddings based on source embeddings. + + This class computes the target embeddings by first copying the embeddings of overlapping tokens + from the source model and then computing the embeddings of non-overlapping tokens as a weighted + average of the source tokens based on similarity. + + Args: + source_tokenizer: The tokenizer of the source language. + source_embeddings_matrix: A 2D array containing the source embeddings matrix. + target_tokenizer: The tokenizer of the target language. + target_auxiliary_embeddings: FastText auxiliary embeddings for the target language. + source_auxiliary_embeddings: Optional FastText auxiliary embeddings for the source language. + alignment_strategy: The strategy used to align source and target embeddings. + similarity_strategy: The strategy used to compute token similarities. + weights_strategy: The strategy used to compute token weights. + token_overlap_strategy: The strategy used to determine token overlap. + batch_size: The size of batches for non-overlapping token computations. + """ + def __init__( self, source_tokenizer: PreTrainedTokenizerBase, @@ -86,6 +160,19 @@ def __init__( def initialize( self, *, seed: int | None = None, show_progress: bool = False ) -> NDArray: + """Initialize target embeddings using weighted averages. + + This method computes the target embeddings by first copying the embeddings of + overlapping tokens from the source model and then calculating the embeddings for + non-overlapping tokens as a weighted average of source tokens based on cosine similarity. + + Args: + seed: An optional seed for the random number generator. + show_progress: If True, displays a progress bar for the initialization process. 
+ + Returns: + A 2D array containing the initialized target embeddings. + """ rng = np.random.default_rng(seed) # Initialize target embeddings as random @@ -138,6 +225,20 @@ def _compute_non_overlapping_token_embeddings( *, show_progress: bool = False, ) -> NDArray: + """Compute embeddings for non-overlapping tokens as weighted averages. + + This method calculates the embeddings for non-overlapping target tokens based on a weighted + average of the source embeddings, using cosine similarity to determine the weights. + + Args: + overlapping_target_token_ids: List of token IDs for overlapping target tokens. + overlapping_source_token_ids: List of token IDs for overlapping source tokens. + non_overlapping_target_token_ids: List of token IDs for non-overlapping target tokens. + show_progress: If True, displays a progress bar for the initialization process. + + Returns: + A 2D array containing the embeddings for the non-overlapping target tokens. + """ # Map source and target subword tokens to auxiliary token space target_subword_embeddings = self._map_tokens_into_auxiliary_embedding_space( self.target_tokenizer, @@ -212,6 +313,22 @@ def _map_tokens_into_auxiliary_embedding_space( tokenizer: PreTrainedTokenizerBase, embeddings: AuxiliaryEmbeddings, ) -> NDArray: + """Map tokens into the auxiliary embedding space. + + This method converts token IDs from a tokenizer into their corresponding vector + representations in an auxiliary embedding space. The embeddings are retrieved + from the provided auxiliary embeddings source. + + Args: + tokenizer: A pre-trained tokenizer. + embeddings: An object containing the auxiliary embeddings, which provides + vector representations for tokens. + + Returns: + A 2D array of shape (n_tokens, embedding_dim), + where each row corresponds to the embedding of a token from the tokenizer + in the auxiliary embedding space. + """ embeddings_matrix = np.zeros( (len(tokenizer), embeddings.embeddings_matrix.shape[1]) ) From 4465b806be105490c558842b923cbca61ee20083 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 22:06:51 +0100 Subject: [PATCH 3/8] Add docstrings to similarity module --- src/langsfer/similarity.py | 42 +++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/langsfer/similarity.py b/src/langsfer/similarity.py index 0c0cecf..ae1c4c0 100644 --- a/src/langsfer/similarity.py +++ b/src/langsfer/similarity.py @@ -1,3 +1,5 @@ +"""This module provides classes for computing similarities between 2D arrays.""" + from abc import ABC, abstractmethod from sklearn.metrics.pairwise import cosine_similarity @@ -8,10 +10,48 @@ class SimilarityStrategy(ABC): + """Abstract base class for similarity computation strategies. + + This class defines the interface for computing similarities between two 2D arrays. + Concrete subclasses must implement the `apply` method to define their specific similarity computation logic. + """ + @abstractmethod - def apply(self, v: NDArray, w: NDArray) -> NDArray: ... + def apply(self, v: NDArray, w: NDArray) -> NDArray: + """Abstract method to compute similarity between two 2D arrays. + + Args: + v: 2D array. + w: 2D array. + + Returns: + A 2D array containing the cosine similarity between the input arrays. + """ + ... class CosineSimilarity(SimilarityStrategy): + """Concrete implementation of the `SimilarityStrategy` that computes the cosine similarity + between two 2D arrays. 
+ + Cosine similarity is a measure of similarity between two vectors based on the cosine of + the angle between them. It is widely used in tasks such as document comparison, word + embedding similarity, and more. + """ + def apply(self, v: NDArray, w: NDArray) -> NDArray: + """Compute the cosine similarity between two vectors `v` and `w`. + + Cosine similarity is calculated as the dot product of `v` and `w` divided by + the product of their magnitudes. It returns a value between -1 and 1, where + 1 indicates that the vectors are identical, and -1 indicates that they are + diametrically opposed. + + Args: + v: 2D array. + w: 2D array. + + Returns: + A 2D array containing the cosine similarity between the input arrays. + """ return cosine_similarity(v, w) From d2abaa70f46b6d0e829f5abc2dda06178fed4573 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sun, 24 Nov 2024 12:02:29 +0100 Subject: [PATCH 4/8] Add docstrings to weights module --- src/langsfer/utils.py | 3 + src/langsfer/weights.py | 152 ++++++++++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 20 deletions(-) diff --git a/src/langsfer/utils.py b/src/langsfer/utils.py index 4fd740d..85333f5 100644 --- a/src/langsfer/utils.py +++ b/src/langsfer/utils.py @@ -58,6 +58,9 @@ def train_fasttext_model( window : The maximum distance between the current and predicted word within a sentence. min_count : The model ignores all words with total frequency lower than this. epochs : Number of iterations (epochs) over the corpus. + + Returns: + Trained FastText model. """ model = FastText(vector_size=vector_size, window=window, min_count=min_count) model.build_vocab(corpus_iterable=corpus_iterable) diff --git a/src/langsfer/weights.py b/src/langsfer/weights.py index 0c5168d..6edf38c 100644 --- a/src/langsfer/weights.py +++ b/src/langsfer/weights.py @@ -1,3 +1,9 @@ +"""This module provides various weight strategies for transforming input scores +into weights that are used to compute target embedding vectors as weighted averages. +The strategies are designed to modify the input scores based on different +criteria, such as ranking, sparsity, or normalization. +""" + from abc import ABC, abstractmethod from typing import Optional @@ -9,11 +15,39 @@ class WeightsStrategy(ABC): + """Abstract base class for weight computation strategies. + + This class defines the interface for applying a weight transformation to input scores. + Subclasses must implement the `_compute_weights` method to define their specific weight computation logic. + + Additionally, the `apply` is a concrete method that handles the application of the transformation, + with optional chaining of multiple weight strategies via the `compose` method. + + Attributes: + _next_strategy: Optionally references another `WeightsStrategy` that can be applied after the current one. + """ + _next_strategy: Optional["WeightsStrategy"] = None def apply(self, scores: NDArray) -> NDArray: + """Applies the weight transformation to the input scores. + + This method first checks that the input scores are two-dimensional, computes the weights using + the `_compute_weights` method, and optionally applies a next strategy if defined. + + Args: + scores: A 2D array of input scores to be transformed. + + Returns: + A 2D array of transformed weights. + + Raises: + RuntimeError: If the input scores or output weights are not 2-dimensional. 
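+
+        Example:
+            A minimal illustration chaining two of the concrete strategies defined
+            below in this module; only the output shape is shown here because the
+            exact values depend on the softmax temperature.
+
+            >>> weight_strategy = TopKWeights(k=2).compose(SoftmaxWeights())
+            >>> weight_strategy.apply(np.array([[1.0, 2.0, 3.0]])).shape
+            (1, 3)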
+ """ if scores.ndim != 2: - raise ValueError(f"scores must have 2 dimensions instead of {scores.ndim}") + raise RuntimeError( + f"scores must have 2 dimensions instead of {scores.ndim}" + ) weights = self._compute_weights(scores) if self._next_strategy is not None: weights = self._next_strategy.apply(weights) @@ -27,6 +61,20 @@ def apply(self, scores: NDArray) -> NDArray: def _compute_weights(self, scores: NDArray) -> NDArray: ... def compose(self, other: "WeightsStrategy") -> "WeightsStrategy": + """Chains another weight strategy to apply after the current strategy. + + The resulting strategy will apply the current strategy first, then apply the + `other` strategy to the transformed weights. + + Args: + other: Another `WeightsStrategy` to apply after the current one. + + Returns: + The current strategy, now with a chained `other` strategy. + + Raises: + ValueError: If `other` is not an instance of `WeightsStrategy`. + """ if not isinstance(other, WeightsStrategy): raise ValueError( f"other must be an instance of WeightsStrategy instead of {type(other)}" @@ -36,41 +84,95 @@ def compose(self, other: "WeightsStrategy") -> "WeightsStrategy": class IdentityWeights(WeightsStrategy): + """Weight strategy that returns the input scores unchanged. + + This strategy applies no transformation to the input scores and simply returns them as is. + + Example: + >>> weight_strategy = IdentityWeights() + >>> scores = np.array([[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]]) + >>> weight_strategy.apply(scores).tolist() + [[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]] + """ + def _compute_weights(self, scores: NDArray) -> NDArray: + """Returns the input scores without any modification. + + Args: + scores: A 2D array of input scores. + + Returns: + The same input scores as the output. + """ return scores class SoftmaxWeights(WeightsStrategy): + """Weight strategy that applies the softmax transformation to the input scores. + + Softmax normalizes the scores into a probability distribution, where each score is divided by the + sum of the exponentials of all scores in the row, resulting in values between 0 and 1. + + Args: + temperature: A scaling factor applied to the scores before applying softmax. + + Example: + >>> weight_strategy = SoftmaxWeights() + >>> scores = np.array([[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]]) + >>> weight_strategy.apply(scores).tolist() + [[0.09003058735208934, 0.24472848513183193, 0.6652409275160788], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]] + """ + def __init__(self, temperature: float = 1.0) -> None: self._epsilon = 1e-7 self.temperature = temperature + self._epsilon def _compute_weights(self, scores: NDArray) -> NDArray: + """Computes the softmax weights for the input scores. + + Softmax is applied by dividing the scores by the specified temperature to control the spread + of the probability distribution. + + Args: + scores: A 2D array of input scores. + + Returns: + A 2D array of transformed scores that represent a probability distribution. + """ weights = softmax(scores / self.temperature, axis=1) return weights class SparsemaxWeights(WeightsStrategy): - """Implements Sparsemax weight strategy. + """Weight strategy that applies the Sparsemax transformation to the input scores. - Described in Martins, Andre, and Ramon Astudillo. + Sparsemax is a sparse alternative to softmax, where less significant values are set to zero. 
+ This implementation follows the method described in the paper: [From softmax to sparsemax: A sparse model of attention and multi-label classification.](https://proceedings.mlr.press/v48/martins16) - International conference on machine learning. PMLR, 2016. The implementation is a slightly modified version of this code: https://github.com/AndreasMadsen/course-02456-sparsemax/blob/cd73efc1267b5c3b319fb3dc77774c99c10d5d82/python_reference/sparsemax.py#L4 The original code is license under the [MIT license.](https://github.com/AndreasMadsen/course-02456-sparsemax/blob/cd73efc1267b5c3b319fb3dc77774c99c10d5d82/LICENSE.md) Examples: - >>> from langsfer.weight import SparsemaxWeights - >>> import numpy as np - >>> weights_strategy = SparsemaxWeights() - >>> scores = np.array([[0.0, 1.0, 2.0], [10, 20, 30]]) - >>> weights_strategy.apply(scores).tolist() - [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]] + >>> weight_strategy = SparsemaxWeights() + >>> scores = np.array([[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]]) + >>> weight_strategy.apply(scores).tolist() + [[0.0, 0.0, 1.0], [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]] """ def _compute_weights(self, scores: NDArray) -> NDArray: + """Computes the Sparsemax weights for the input scores. + + Sparsemax is computed by first sorting the scores, calculating a threshold `tau`, and + then zeroing out values that are below this threshold. + + Args: + scores: A 2D array of input scores. + + Returns: + A 2D array of transformed weights with sparsity enforced. + """ # Translate by max for numerical stability scores = scores - scores.max(axis=-1, keepdims=True) @@ -93,29 +195,39 @@ def _compute_weights(self, scores: NDArray) -> NDArray: class TopKWeights(WeightsStrategy): - """Weight strategy that keeps the top-k highest input scores per row - and sets all the other ones to -np.inf in order for them to be ignored in future computations - e.g. if this strategy is followed by the softmax strategy then those values become 0. + """Weight strategy that retains only the top-k highest values per row and sets all other values to -np.inf. + + This strategy is useful in situations where only the top-k scores are relevant, and the remaining values + should be ignored in subsequent computations, e.g. if this strategy is followed by the softmax strategy then those values become 0. This implementation method is heavily inspired by the one provided in the following stackoverflow answer: https://stackoverflow.com/a/59405060 The original code is licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) + Args: + k: The number of top values to retain for each row. + Examples: - >>> from langsfer.weights import TopKWeights - >>> import numpy as np + >>> scores = np.array([[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]]) >>> weight_strategy = TopKWeights(k=1) - >>> weight_strategy.apply(np.array([[3, 1, 10]])).tolist() - [[-np.inf, -np.inf, 10]] - - Args: - k: Number of highest values per row to keep + >>> weight_strategy.apply(scores).tolist() + [[-inf, -inf, 3.0], [4.0, 4.0, 4.0]] """ def __init__(self, k: int = 10) -> None: self.k = k def _compute_weights(self, scores: NDArray) -> NDArray: + """Computes the Top-K weights for the input scores. + + The scores are first partitioned to find the top-k values per row. All other values are set to -np.inf. + + Args: + scores: A 2D array of input scores. + + Returns: + A 2D array of weights with only the top-k values per row kept and others replaced with -np.inf. 
+ """ # Get unsorted indices of top-k values topk_indices = np.argpartition(scores, -self.k, axis=1)[:, -self.k :] rows, _ = np.indices((scores.shape[0], self.k)) From cb05e5d8b7f0a7f8614c1548f97f8b5ceefb573d Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sun, 24 Nov 2024 12:20:34 +0100 Subject: [PATCH 5/8] Add docstrings to alignment module --- src/langsfer/alignment.py | 79 +++++++++++++++++++++++++++++++++++++-- src/langsfer/weights.py | 2 + 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/src/langsfer/alignment.py b/src/langsfer/alignment.py index 7427606..9261d33 100644 --- a/src/langsfer/alignment.py +++ b/src/langsfer/alignment.py @@ -1,3 +1,8 @@ +"""This module provides strategies for aligning embedding matrices using different techniques. + +The `AlignmentStrategy` class is an abstract base class that defines the interface for embedding alignment strategies. +""" + import logging import os import warnings @@ -16,18 +21,47 @@ class AlignmentStrategy(ABC): + """Abstract base class for defining strategies to align embedding matrices. + + Subclasses must implement the `apply` method to define the logic for aligning + the embedding matrix based on their specific alignment technique. + """ + @abstractmethod def apply(self, embedding_matrix: NDArray) -> NDArray: ... class IdentityAlignment(AlignmentStrategy): + """Alignment strategy that does not alter the input embedding matrix. + + This strategy simply returns the input embedding matrix unchanged. + + Example: + >>> identity_alignment = IdentityAlignment() + >>> aligned_embeddings = identity_alignment.apply(embedding_matrix) + >>> # aligned_embeddings will be the same as embedding_matrix + """ + def apply(self, embedding_matrix: NDArray) -> NDArray: + """Returns the input embedding matrix unchanged. + + Args: + embedding_matrix: 2D embedding matrix to be aligned. + + Returns: + The same embedding matrix as the output, without any modifications. + """ return embedding_matrix class BilingualDictionaryAlignment(AlignmentStrategy): """Alignment strategy that uses a bilingual dictionary to compute the alignment matrix. + This strategy uses word pairs from a bilingual dictionary to compute an alignment + matrix between the source and target embedding matrices. The dictionary maps words in the + source language to words in the target language. The alignment matrix is computed by + applying orthogonal Procrustes analysis to the word vector correspondences. + The bilingual dictionary maps words in the source language to words in the target language and is expected to be of the form: @@ -39,10 +73,10 @@ class BilingualDictionaryAlignment(AlignmentStrategy): ``` Args: - source_word_embeddings: Word embeddings of the source language - target_word_embeddings: Word embeddings of the target language - bilingual_dictionary: Dictionary mapping words in source language to words in target language - bilingual_dictionary_file: Path to a bilingual dictionary file + source_word_embeddings: Word embeddings of the source language. + target_word_embeddings: Word embeddings of the target language. + bilingual_dictionary: Dictionary mapping words in source language to words in target language. + bilingual_dictionary_file: Path to a bilingual dictionary file containing word pairs. """ def __init__( @@ -75,6 +109,23 @@ def __init__( def _load_bilingual_dictionary( file_path: str | os.PathLike, ) -> dict[str, list[str]]: + """Loads a bilingual dictionary from a file. 
+ + The file is expected to contain word pairs, one per line, separated by tabs, e.g.: + + ``` + english_word1 \t target_word1\n + english_word2 \t target_word2\n + ... + english_wordn \t target_wordn\n + ``` + + Args: + file_path: Path to the bilingual dictionary file. + + Returns: + A dictionary where the keys are source language words, and the values are lists of target language words. + """ bilingual_dictionary: dict[str, list[str]] = {} for line in open(file_path): @@ -91,6 +142,15 @@ def _load_bilingual_dictionary( return bilingual_dictionary def _compute_alignment_matrix(self) -> NDArray: + """Computes the alignment matrix using the bilingual dictionary. + + The method iterates over the bilingual dictionary, retrieving word vector correspondences from the + source and target language embeddings. It uses orthogonal Procrustes analysis to compute the + transformation matrix that aligns the source word embeddings with the target word embeddings. + + Returns: + A 2D array representing the alignment matrix. + """ logger.info( "Computing word embedding alignment matrix from bilingual dictionary" ) @@ -145,6 +205,17 @@ def _compute_alignment_matrix(self) -> NDArray: return alignment_matrix def apply(self, embedding_matrix: NDArray) -> NDArray: + """Applies the computed alignment matrix to the given embedding matrix. + + The embedding matrix is transformed by multiplying it with the alignment matrix + obtained from the bilingual dictionary. + + Args: + embedding_matrix: 2D embedding matrix to be aligned. + + Returns: + Aligned embedding matrix. + """ alignment_matrix = self._compute_alignment_matrix() aligned_embedding_matrix = embedding_matrix @ alignment_matrix return aligned_embedding_matrix diff --git a/src/langsfer/weights.py b/src/langsfer/weights.py index 6edf38c..08bc3a8 100644 --- a/src/langsfer/weights.py +++ b/src/langsfer/weights.py @@ -2,6 +2,8 @@ into weights that are used to compute target embedding vectors as weighted averages. The strategies are designed to modify the input scores based on different criteria, such as ranking, sparsity, or normalization. + +The `WeightsStrategy` class is an abstract base class that defines the interface for weight strategies. """ from abc import ABC, abstractmethod From 7755088ea1d03af53c8d261518eaa09269ad86f4 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sun, 24 Nov 2024 12:35:27 +0100 Subject: [PATCH 6/8] Add docstrings to embeddings module --- src/langsfer/embeddings.py | 216 ++++++++++++++++++++++++++++++++----- 1 file changed, 189 insertions(+), 27 deletions(-) diff --git a/src/langsfer/embeddings.py b/src/langsfer/embeddings.py index 43b67ed..a9cf79f 100644 --- a/src/langsfer/embeddings.py +++ b/src/langsfer/embeddings.py @@ -1,3 +1,8 @@ +""" +This module provides classes and methods for working with various types of token embeddings, +including FastText and Transformer-based embeddings. +""" + import gzip import logging import os @@ -25,34 +30,85 @@ class AuxiliaryEmbeddings(ABC): - """Base class for auxiliary embeddings.""" + """Abstract base class for auxiliary token embeddings. + + This class defines common methods for any embedding source, such as retrieving + the embeddings matrix, vocabulary, and specific token (character, subword, word) operations (e.g., token-to-ID, + ID-to-token, token-to-vector). + + Subclasses should implement the methods to provide specific functionality for + different embedding models (e.g., FastText, Transformers). + """ @property @abstractmethod - def embeddings_matrix(self) -> NDArray: ... 
+ def embeddings_matrix(self) -> NDArray: + """Returns the matrix of embeddings. + + Returns: + 2D array where each row represents the embedding of a token in the vocabulary. + """ + ... @property @abstractmethod - def vocabulary(self) -> list[str]: ... + def vocabulary(self) -> list[str]: + """Returns the vocabulary of the embeddings. + + Returns: + List of tokens in the vocabulary corresponding to the embeddings. + """ + ... @abstractmethod - def get_id_for_token(self, token: str) -> int: ... + def get_id_for_token(self, token: str) -> int: + """Retrieves the index (ID) for the given token. + + Args: + token: The token whose ID is to be retrieved. + + Returns: + ID corresponding to the token. + """ + ... @abstractmethod - def get_token_for_id(self, id_: int) -> str: ... + def get_token_for_id(self, id_: int) -> str: + """Retrieves the token corresponding to the given ID. + + Args: + id_: ID whose corresponding token is to be retrieved. + + Returns: + Token corresponding to the ID. + """ + ... @abstractmethod - def get_vector_for_token(self, token: str) -> str: ... + def get_vector_for_token(self, token: str) -> str: + """Retrieves the embedding vector for the given token. + + Args: + token: Tord whose embedding vector is to be retrieved. + + Returns: + Vector corresponding to the token. + """ + ... class FastTextEmbeddings(AuxiliaryEmbeddings): - """Loads embeddings from a pretrained FastText model from a local path or a url. + """Class for loading and working with FastText embeddings. - Args: - model: FastText model. + This class allows you to load pre-trained FastText embeddings either from a local file + or by downloading them from the FastText website. It provides functionality to access + the embeddings matrix, vocabulary, and individual token embeddings. - Attributes: - model: FastText model. + FastText embeddings are useful for handling subword information, which is beneficial + for morphologically rich languages or words that are out of vocabulary (OOV). + + Args: + model: A pre-trained FastText model loaded using Gensim. """ VALID_LANGUAGE_IDS = { @@ -223,6 +279,18 @@ def __init__(self, model: FastText) -> None: def from_model_name_or_path( cls, model_name_or_path: os.PathLike | str, *, force: bool = False ) -> None: + """Loads a FastText model from a local path or from a FastText website URL. + + If a valid language ID is provided, the model will be downloaded from FastText's + website. Otherwise, a model from the specified local path is loaded. + + Args: + model_name_or_path: The name or path of the model to load. + force: If True, forces the re-download of the model even if it exists locally. + + Returns: + An instance of the FastTextEmbeddings class with the loaded model. + """ if os.path.exists(model_name_or_path): if Path(model_name_or_path).suffix == ".bin": model = load_facebook_model(model_name_or_path) @@ -243,20 +311,23 @@ def from_model_name_or_path( def _download_model( language_id: str, *, force: bool = False, chunk_size: int = 2**13 ) -> Path: - """Download pre-trained common-crawl vectors from fastText's website + """Download pre-trained common-crawl vectors from fastText's website. + https://fasttext.cc/docs/en/crawl-vectors.html + The model is cached locally to avoid re-downloading in the future. 
+ Original code from: https://github.com/facebookresearch/fastText/blob/02c61efaa6d60d6bb17e6341b790fa199dfb8c83/python/fasttext_module/fasttext/util/util.py#L183 License: [MIT](https://github.com/facebookresearch/fastText/blob/02c61efaa6d60d6bb17e6341b790fa199dfb8c83/LICENSE) Args: - language_id: String representing the ID of the language, e.g. "ar", for which the model will be downloaded - force: If True, overwrite cached files - chunk_size: + language_id: String representing the language ID (e.g. "ar", "en") of the model to download. + force: If True, forces re-downloading the model even if it exists locally. + chunk_size: The size of the chunks to download at a time. Returns: - Path to downloaded and extracted model file + Path to downloaded and extracted FastText model file """ if language_id not in FastTextEmbeddings.VALID_LANGUAGE_IDS: raise Exception( @@ -293,29 +364,63 @@ def _download_model( @property def embeddings_matrix(self) -> NDArray: + """Returns the embeddings matrix of the loaded FastText model. + + Returns: + 2D array containing the embeddings for each token in the vocabulary. + """ return self._model.wv.vectors @property def vocabulary(self) -> list[str]: + """Returns the vocabulary of the FastText model. + + Returns: + lList of tokens in the model's vocabulary. + """ tokens: list[str] = list(self._model.wv.key_to_index.keys()) return tokens def get_id_for_token(self, token: str) -> int: + """Retrieves the index (ID) for the given token in the FastText model. + + Args: + token: Tord whose ID is to be retrieved. + + Returns: + The ID corresponding to the token. + """ return self._model.wv.get_index(token) def get_token_for_id(self, id_: int) -> str: + """Retrieves the token corresponding to the given ID. + + Args: + id_: ID whose corresponding token is to be retrieved. + + Returns: + Token corresponding to the ID. + """ return self._model.wv.index_to_key[id_] - def get_vector_for_token(self, token: str) -> str: + def get_vector_for_token(self, token: str) -> NDArray: + """Retrieves the embedding vector for the given token in the FastText model. + + Args: + token: Token whose embedding vector is to be retrieved. + + Returns: + Embedding vector corresponding to the token. + """ return self._model.wv.get_vector(token) @staticmethod def _reduce_matrix( - X_orig: NDArray, dim: int, eigv: NDArray | None + X_orig: NDArray, dim: int, eigv: NDArray | None = None ) -> tuple[NDArray, NDArray]: - """ - Reduces the dimension of a `(m, n)` matrix `X_orig` - to a `(m, dim)` matrix `X_reduced`. + """Reduces the dimensionality of the FastText embeddings matrix using PCA. + + Reduces the dimension of a `(m, n)` matrix `X_orig` to a `(m, dim)` matrix `X_reduced` using PCA. It uses only the first 100000 rows of `X_orig` to do the mapping. Matrix types are all `np.float32` in order to avoid unncessary copies. @@ -325,6 +430,14 @@ def _reduce_matrix( MIT License: https://github.com/facebookresearch/fastText/blob/02c61efaa6d60d6bb17e6341b790fa199dfb8c83/LICENSE + + Args: + X_orig: Original matrix of embeddings. + dim: Target dimensionality after reduction. + eigv: Optional eigenvector matrix, used for dimension reduction. + + Returns: + Tuple containing the reduced matrix and the eigenvector matrix. """ if eigv is None: mapping_size = 100000 @@ -339,8 +452,9 @@ def _reduce_matrix( return (X_reduced, eigv) def reduce_model_dimension(self, target_dim: int) -> None: - """Computes the PCA of the input and the output matrices - and sets the reduced ones. 
+ """Reduces the dimensionality of the FastText model using PCA. + + This method adjusts both the input and output matrices of the model to the target dimension. Original code taken from: https://github.com/facebookresearch/fastText/blob/02c61efaa6d60d6bb17e6341b790fa199dfb8c83/python/fasttext_module/fasttext/util/util.py @@ -362,13 +476,15 @@ def reduce_model_dimension(self, target_dim: int) -> None: class TransformersEmbeddings(AuxiliaryEmbeddings): - """Loads embeddings from a pretrained model from a local path or the HuggingFace Hub. + """Class for loading and working with Transformer-based model embeddings. - Loads the specified model and extracts the input embeddings - weights as a numpy array. + This class allows you to load pre-trained embeddings from transformer-based models + (such as BERT, GPT, etc.) from the HuggingFace model hub or a local path. It provides + functionality to access the embeddings matrix, vocabulary, and individual token embeddings. Args: - model_name_or_path: Name or path of model to load. + embeddings_matrix: Embeddings matrix as a NumPy array. + tokenizer: Tokenizer associated with the embeddings. """ def __init__( @@ -381,6 +497,18 @@ def __init__( def from_model_name_or_path( cls, model_name_or_path: os.PathLike | str, *, trust_remote_code: bool = False ) -> "TransformersEmbeddings": + """Loads a transformer model and extracts its embeddings matrix. + + The method supports models from HuggingFace's model hub and local paths. It also + loads the corresponding tokenizer. + + Args: + model_name_or_path: The model's name or local path. + trust_remote_code: Whether to trust code from the remote model. + + Returns: + An instance of the TransformersEmbeddings class. + """ model = AutoModel.from_pretrained( model_name_or_path, trust_remote_code=trust_remote_code ) @@ -409,6 +537,11 @@ def _get_unembedding_matrix(model: PreTrainedModel) -> NDArray: @property def embeddings_matrix(self) -> NDArray: + """Returns the embeddings matrix from the transformer model. + + Returns: + 2D array of embeddings for each token in the vocabulary. + """ return self._embeddings_matrix @property @@ -417,15 +550,44 @@ def tokenizer(self) -> PreTrainedTokenizerBase: @property def vocabulary(self) -> list[str]: + """Returns the vocabulary of the tokenizer. + + Returns: + List of tokens in the tokenizer's vocabulary. + """ tokens = list(self._tokenizer.vocab.keys()) return tokens def get_id_for_token(self, token: str) -> int: + """Retrieves the token ID for a given token using the tokenizer. + + Args: + token: Tord whose ID is to be retrieved. + + Returns: + The ID corresponding to the token. + """ return self._tokenizer.convert_tokens_to_ids(token) def get_token_for_id(self, id_: int) -> str: + """Retrieves the token corresponding to a given ID using the tokenizer. + + Args: + id_: ID whose corresponding token is to be retrieved. + + Returns: + The token corresponding to the ID. + """ return self._tokenizer.decode(id_).strip() def get_vector_for_token(self, token: str) -> str: + """Retrieves the embedding vector for the given token using the model. + + Args: + token: Token whose embedding vector is to be retrieved. + + Returns: + Embedding vector corresponding to the token. 
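+
+        Example:
+            Illustrative usage with the `roberta-base` model used in the project
+            README (downloaded from the Hugging Face Hub):
+
+            >>> embeddings = TransformersEmbeddings.from_model_name_or_path("roberta-base")
+            >>> vector = embeddings.get_vector_for_token("hello")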
+ """ id_ = self.get_id_for_token(token) return self._embeddings_matrix[id_] From de6eccf8401ee3e9aa25e1b9eb6e1fa8b72daef6 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sun, 24 Nov 2024 12:41:31 +0100 Subject: [PATCH 7/8] Add docstrings to token_overlap module --- src/langsfer/token_overlap.py | 87 +++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/src/langsfer/token_overlap.py b/src/langsfer/token_overlap.py index e6c742d..87685a0 100644 --- a/src/langsfer/token_overlap.py +++ b/src/langsfer/token_overlap.py @@ -1,3 +1,12 @@ +""" +This module contains strategies for finding token overlap between source and target tokenizer vocabularies. + +It defines several strategies for detecting the intersection and difference of tokens in two tokenizers' vocabularies. +These strategies are used to analyze and compare token vocabularies, which is essential for cross-lingual transfer and token alignment. + +Each strategy provides a method to apply the logic to two tokenizers and return the overlapping and non-overlapping tokens. +""" + import logging from abc import ABC, abstractmethod @@ -15,8 +24,10 @@ class TokenOverlapStrategy(ABC): - """Abstract strategy for finding the overlapping and non-overlapping (missing) tokens - between a source tokenizer vocabulary and target tokenizer vocabulary. + """Abstract base class for strategies that compute token overlap between source and target tokenizers. + + This class provides an abstract method `apply()` which should be implemented by subclasses to define the strategy + for finding overlapping and non-overlapping (missing) tokens between the vocabularies of two tokenizers. """ @abstractmethod @@ -28,11 +39,11 @@ def apply( class ExactMatchTokenOverlap(TokenOverlapStrategy): - """Token overlap strategy for finding the overlapping and non-overlapping (missing) tokens - that match exactly between a source tokenizer vocabulary and target tokenizer vocabulary. + """Strategy to find overlapping and non-overlapping tokens that match exactly between source and target tokenizers. - The source tokenizer's vocabulary to match is returned by the `_get_source_vocab` method - and target tokenizer's vocabulary ot match is returned by the `_get_target_vocab` method. + This class compares the vocabularies of the source and target tokenizers and finds the tokens that exactly match + in both vocabularies. The `_get_source_vocab` and `_get_target_vocab` methods are used to retrieve the vocabularies + of the source and target tokenizers respectively. """ def apply( @@ -40,16 +51,16 @@ def apply( source_tokenizer: PreTrainedTokenizerBase, target_tokenizer: PreTrainedTokenizerBase, ) -> tuple[list[str], list[str]]: - """Applies the strategy to the source and target tokenizer vocabularies. + """Finds the overlapping and non-overlapping tokens that exactly match between source and target tokenizers. Args: - source_tokenizer: Source tokenizer. - target_tokenizer: Target tokenizer. + source_tokenizer: Tokenizer for the source language model. + target_tokenizer: Tkenizer for the target language model. Returns: - A tuple containing: - overlapping_tokens: Sorted list of overlapping tokens. - missing_tokens: Sorted list of missing tokens. + Tuple containing: + - overlapping_tokens: A sorted list of tokens that appear in both the source and target vocabularies. + - non_overlapping_tokens: A sorted list of tokens that are in the target vocabulary but not in the source vocabulary. 
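+
+        Example:
+            Illustrative usage with the two tokenizers from the project README
+            (both are downloaded from the Hugging Face Hub):
+
+            >>> from transformers import AutoTokenizer
+            >>> source_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+            >>> target_tokenizer = AutoTokenizer.from_pretrained("benjamin/roberta-base-wechsel-german")
+            >>> overlapping, missing = ExactMatchTokenOverlap().apply(source_tokenizer, target_tokenizer)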
""" overlapping_tokens: list[str] = [] non_overlapping_tokens: list[str] = [] @@ -74,8 +85,10 @@ def _get_target_vocab(self, tokenizer: PreTrainedTokenizerBase) -> set[str]: class NoTokenOverlap(ExactMatchTokenOverlap): - """Subclass of ExactMatchTokenOverlap that returns an empty set as the source tokenizer's vocabulary - to guarantee that no overlapping token is found. + """Subclass of `ExactMatchTokenOverlap` that ensures no tokens overlap between source and target vocabularies. + + This class overrides the `_get_source_vocab` method to return an empty set, ensuring that no tokens from the source + vocabulary will be considered overlapping with the target vocabulary. """ def _get_source_vocab(self, tokenizer: PreTrainedTokenizerBase) -> set[str]: @@ -84,8 +97,10 @@ def _get_source_vocab(self, tokenizer: PreTrainedTokenizerBase) -> set[str]: class SpecialTokenOverlap(ExactMatchTokenOverlap): - """Subclass of ExactMatchTokenOverlap that returns an only special tokens from the source tokenizer's vocabulary - to guarantee that only overlapping special tokens are found. + """Subclass of `ExactMatchTokenOverlap` that finds only special tokens that overlap between the source and target vocabularies. + + This class overrides the `_get_source_vocab` method to return only special tokens from the source tokenizer's vocabulary, + ensuring that only special tokens are considered for overlap comparison. """ def _get_source_vocab(self, tokenizer: PreTrainedTokenizerBase) -> set[str]: @@ -99,10 +114,11 @@ def _get_source_vocab(self, tokenizer: PreTrainedTokenizerBase) -> set[str]: class FuzzyMatchTokenOverlap(TokenOverlapStrategy): - """Token overlap strategy for finding the overlapping and non-overlapping (missing) tokens - between a source tokenizer vocabulary and target tokenizer vocabulary whose canonicalized form matches. + """Strategy to find overlapping and non-overlapping tokens between source and target tokenizers using fuzzy matching. - Inspired by the fuzzy token matcher described and implemented in FOCUS. + This class uses a technique inspired by the FOCUS fuzzy token matcher to compare the canonical forms of tokens + between the source and target tokenizers. The canonical form of a token is a lowercased version without any tokenizer + prefixes (such as WordPiece's `##`, BPE's `Ġ`, or Unigram's `▁`). """ BPE_TOKEN_PREFIX = "Ġ" @@ -114,6 +130,19 @@ def apply( source_tokenizer: PreTrainedTokenizerBase, target_tokenizer: PreTrainedTokenizerBase, ) -> tuple[list[str], list[str]]: + """Finds the overlapping and non-overlapping tokens between source and target tokenizers based on their canonicalized forms. + + Args: + source_tokenizer: Tokenizer for the source language. + target_tokenizer: Tokenizer for the target language. + + Returns: + Tuple containing: + - overlapping_tokens: A list of tokens from the target tokenizer that + match canonicalized tokens from the source tokenizer. + - non_overlapping_tokens: A list of tokens from the target tokenizer that + do not match any canonicalized tokens from the source tokenizer. + """ canonical_source_vocab = self._canonicalize_vocab(source_tokenizer) canonical_target_vocab = self._canonicalize_vocab(target_tokenizer) canonical_source_tokens = set(x for x in canonical_source_vocab.values()) @@ -129,6 +158,17 @@ def apply( return overlapping_tokens, non_overlapping_tokens def _canonicalize_vocab(self, tokenizer: PreTrainedTokenizerBase) -> dict[str, str]: + """Canonicalizes the vocabulary of a tokenizer by converting each token to its canonical form. 
+ + This method processes the tokens of the tokenizer's vocabulary by removing any tokenizer-specific prefixes + and converting tokens to lowercase. + + Args: + tokenizer: Tokenizer whose vocabulary is to be canonicalized. + + Returns: + Dictionary mapping tokens to their canonicalized forms. + """ canonical_vocab: dict[str, str] = {} for token, token_idx in sorted(tokenizer.vocab.items(), key=lambda x: x[1]): @@ -139,6 +179,15 @@ def _canonicalize_vocab(self, tokenizer: PreTrainedTokenizerBase) -> dict[str, s def _canonicalize_token( self, tokenizer: PreTrainedTokenizerBase, token_id: int ) -> str: + """Converts a token to its canonical form by removing tokenizer-specific prefixes and converting to lowercase. + + Args: + tokenizer: Tokenizer used to convert the token ID to its canonical form. + token_id: ID of the token to canonicalize. + + Returns: + Canonical form of the token. + """ # We use `convert_ids_to_tokens` instead of `decode` # because the former adds the beginning of word prefix to tokens # and because it doesn't outright remove tokens like '\u2028' From a6c076dc66ebeb7b11a8493496f2cd3b4aff2885 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sun, 24 Nov 2024 13:01:03 +0100 Subject: [PATCH 8/8] Update readme --- README.md | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 094c500..2a41b1f 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,8 @@ The library currently implements the following methods: - [CLP-Transfer: Efficient language model training through cross-lingual and progressive transfer learning.](https://arxiv.org/abs/2301.09626) Ostendorff, Malte, and Georg Rehm. arXiv preprint arXiv:2301.09626 (2023). - [FOCUS: Effective Embedding Initialization for Specializing Pretrained Multilingual Models on a Single Language.](https://arxiv.org/abs/2305.14481) Dobler, Konstantin, and Gerard de Melo. arXiv preprint arXiv:2305.14481 (2023). +Langsfer is flexible enough to allow mixing and matching strategies between different embedding initialization schemes. For example, you can combine fuzzy token overlap with the CLP-Transfer method to refine the initialization process based on fuzzy matches between source and target tokens. This flexibility enables you to experiment with a variety of strategies for different language transfer tasks, making it easier to fine-tune models for your specific use case. + ## Quick Start ### Installation @@ -79,6 +81,7 @@ source_tokenizer = AutoTokenizer.from_pretrained("roberta-base") target_tokenizer = AutoTokenizer.from_pretrained("benjamin/roberta-base-wechsel-german") source_model = AutoModel.from_pretrained("roberta-base") +# For models with non-tied embeddings you can choose whether you should transfer the input and output embeddings separately. source_embeddings_matrix = source_model.get_input_embeddings().weight.detach().numpy() source_auxiliary_embeddings = FastTextEmbeddings.from_model_name_or_path("en") @@ -105,8 +108,7 @@ To initialize the target embeddings you would then use: target_embeddings_matrix = embedding_initializer.initialize(seed=16, show_progress=True) ``` -The result is an object of type `TransformersEmbeddings` that contain the initialized -embeddings in its `embeddings_matrix` field and the target tokenizer in its `tokenizer` field. +The result is a 2D arrays that contains the initialized embeddings matrix for the target language model. 
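As a quick sanity check, the returned matrix should have one row per target token and the same embedding dimension as the source model (using the variables from the snippets above):

```python
# Shape check: one row per token in the target tokenizer,
# same embedding width as the source model's embedding layer.
assert target_embeddings_matrix.shape[0] == len(target_tokenizer)
assert target_embeddings_matrix.shape[1] == source_embeddings_matrix.shape[1]
```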
We can then replace the source model's embeddings matrix with this newly initialized embeddings matrix:

```python
import torch
from transformers import AutoModel

target_model = AutoModel.from_pretrained("roberta-base")
target_model.resize_token_embeddings(len(target_tokenizer))
target_model.get_input_embeddings().weight.data = torch.as_tensor(target_embeddings_matrix)
target_model.save_pretrained("path/to/target_model")
```

+## Roadmap
+
+Here are some of the planned developments for Langsfer:
+
+- **Performance Optimization**: Improve the efficiency and usability of the library to streamline workflows
+  and improve computational performance.
+
+- **Model Training & Hugging Face Hub Publishing**: Train both small and large models with embeddings initialized using Langsfer
+  and publish the resulting models to the Hugging Face Hub for public access and use.
+
+- **Parameter-Efficient Fine-Tuning**: Investigate using techniques such as LoRA (Low-Rank Adaptation)
+  to enable parameter-efficient fine-tuning, making it easier to adapt models to specific languages with minimal overhead.
+
+- **Implement New Methods**: Extend Langsfer with additional language transfer methods, including:
+
+  - [OFA: A framework of initializing unseen subword embeddings for efficient large-scale multilingual continued pretraining.](https://arxiv.org/abs/2311.08849)
+    Liu, Y., Lin, P., Wang, M. and Schütze, H., 2023. arXiv preprint arXiv:2311.08849.
+  - [Zero-Shot Tokenizer Transfer.](https://arxiv.org/abs/2405.07883)
+    Minixhofer, B., Ponti, E.M. and Vulić, I., 2024. arXiv preprint arXiv:2405.07883.
+
+- **Comprehensive Benchmarking**: Run extensive benchmarks across all implemented methods to evaluate their performance, identify strengths
+  and weaknesses, and compare results to establish best practices for language transfer.
+

## Contributing

Refer to the [contributing guide](CONTRIBUTING.md) for instructions on how you can make contributions to this repository.