Merge pull request #4 from luozhouyang/dev
Add albert adapter
luozhouyang authored Jul 5, 2020
2 parents b519d62 + cb99ac2 commit fda24dd
Showing 12 changed files with 433 additions and 74 deletions.
72 changes: 68 additions & 4 deletions README.md
@@ -6,6 +6,21 @@

Transformer-based models implemented in tensorflow 2.x (Keras).

## Contents

- [transformers-keras](#transformers-keras)
- [Contents](#contents)
- [Installation](#installation)
- [Models](#models)
- [Transformer](#transformer)
- [BERT](#bert)
- [Pretraining a new BERT model](#pretraining-a-new-bert-model)
- [Load a pretrained BERT model](#load-a-pretrained-bert-model)
- [ALBERT](#albert)
- [Pretraining a new ALBERT model](#pretraining-a-new-albert-model)
- [Load a pretrained ALBERT model](#load-a-pretrained-albert-model)


## Installation

```bash
@@ -53,6 +68,14 @@ runner.train(train_files, epochs=10, callbacks=None)

## BERT

You can use `BERT` models in two ways:

* [Pretraining a new BERT model](#pretraining-a-new-bert-model)
* [Load a pretrained BERT model](#load-a-pretrained-bert-model)


### Pretraining a new BERT model

Use your own data to pretrain a BERT model.

```python
@@ -82,9 +105,9 @@ Tips:
> You can subclass `transformers_keras.tokenizers.BertTFRecordDatasetBuilder` to parse custom tfrecord examples as you need.
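
For orientation, here is a minimal sketch of what such a parser might look like with plain `tf.data` — the feature names, dtypes and max length below are assumptions for illustration, not the builder's actual schema:

```python
import tensorflow as tf

# Hypothetical feature spec -- the real names and shapes depend on how the
# tfrecord files were written (see bert_tfrecord_custom_generator.py).
MAX_LEN = 512
FEATURES = {
    'input_ids': tf.io.FixedLenFeature([MAX_LEN], tf.int64),
    'segment_ids': tf.io.FixedLenFeature([MAX_LEN], tf.int64),
    'input_mask': tf.io.FixedLenFeature([MAX_LEN], tf.int64),
}


def parse_example(serialized):
    # Decode one serialized tf.train.Example into a dict of dense int32 tensors.
    parsed = tf.io.parse_single_example(serialized, FEATURES)
    return {name: tf.cast(tensor, tf.int32) for name, tensor in parsed.items()}


dataset = tf.data.TFRecordDataset(['testdata/bert_custom_pretrain.tfrecord'])
dataset = dataset.map(parse_example).batch(2)
```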

### Load the pretrained model
### Load a pretrained BERT model

You can use an `Adapter` to load pretrained models.
You can use a `BertAdapter` to load pretrained models.

Here is an example.

@@ -94,8 +117,8 @@ from transformers_keras.adapters import BertAdapter
# download the pretrained model and extract it to some path
PRETRAINED_BERT_MODEL = '/path/to/chinese_L-12_H-768_A-12'

adapter = BertAdapter()
model = adapter.adapte(PRETRAINED_BERT_MODEL)
adapter = BertAdapter(strategy='chinese-bert-base')
model, vocab_file = adapter.adapte(PRETRAINED_BERT_MODEL)

print('model inputs: {}'.format(model.inputs))
print('model outputs: {}'.format(model.outputs))
@@ -109,11 +132,20 @@ model inputs: [<tf.Tensor 'input_ids:0' shape=(None, 512) dtype=int32>, <tf.Tens
model outputs: [<tf.Tensor 'predictions/Identity:0' shape=(512, 21128) dtype=float32>, <tf.Tensor 'relations/Identity:0' shape=(2,) dtype=float32>]
```
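
Once adapted, the model is an ordinary compiled Keras model. A minimal smoke-test sketch, continuing the example above (dummy all-zero/one inputs; the input order is assumed from the printed `model.inputs`):

```python
import numpy as np

# Dummy batch of one all-padding sequence, just to confirm the adapted graph
# runs end to end. Input order (input_ids, segment_ids, input_mask) is assumed.
batch_size, max_len = 1, 512
dummy_batch = [
    np.zeros((batch_size, max_len), dtype=np.int32),  # input_ids
    np.zeros((batch_size, max_len), dtype=np.int32),  # segment_ids
    np.ones((batch_size, max_len), dtype=np.int32),   # input_mask
]
predictions, relations = model.predict(dummy_batch)
print(predictions.shape, relations.shape)
```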

You can implement a custom `Strategy` to load pretrained models from anywhere.
The `transformers_keras.adapters.bert_adapter.ChineseBertBaseStrategy` is a good example.
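
For a checkpoint with a different layout, a skeletal (hypothetical, untested) sketch of such a strategy, wired through `PretrainedModelAdapter`, might look like this:

```python
from transformers_keras.adapters import AbstractStrategy, PretrainedModelAdapter


class MyCheckpointStrategy(AbstractStrategy):
    """Hypothetical strategy for a checkpoint laid out differently from the official release."""

    def mapping_config(self, pretrained_config_file):
        # Translate the checkpoint's config file into the kwargs the model expects.
        ...

    def build_model(self, model_config):
        # Build and compile the Keras model from model_config.
        ...

    def mapping_variables(self, model_config, model, ckpt):
        # Return a dict mapping model variable names to checkpoint variable names.
        ...

    def zip_weights(self, model, ckpt, variables_mapping):
        # Return a list of (model_weight, pretrained_value) pairs for batch_set_value.
        ...


# adapter = PretrainedModelAdapter(strategy=MyCheckpointStrategy())
# model, vocab_file = adapter.adapte('/path/to/custom_checkpoint_dir')
```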

Then, you can use this model to do anything you want!


## ALBERT

You can use `ALBERT` models in two ways:

* [Pretraining a new ALBERT model](#pretraining-a-new-albert-model)
* [Load a pretrained ALBERT model](#load-a-pretrained-albert-model)


### Pretraining a new ALBERT model

You should process your data into tfrecord format. Modify the script `transformers_keras/utils/bert_tfrecord_custom_generator.py` as needed.


@@ -139,3 +171,35 @@ train_files = ['testdata/bert_custom_pretrain.tfrecord']
runner.train(train_files, epochs=10, callbacks=None)

```

### Load a pretrained ALBERT model

You can use an `AlbertAdapter` to load pretrained models.

Here is an example.

```python
from transformers_keras.adapters import AlbertAdapter

# download the pretrained model and extract it to some path
PRETRAINED_ALBERT_MODEL = '/path/to/zh_albert_large'

adapter = AlbertAdapter(strategy='zh-albert-large')
model, vocab_file = adapter.adapte(PRETRAINED_ALBERT_MODEL)

print('model inputs: {}'.format(model.inputs))
print('model outputs: {}'.format(model.outputs))

```

will print:

```bash
model inputs: [<tf.Tensor 'input_ids:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'segment_ids:0' shape=(None, 512) dtype=int32>, <tf.Tensor 'input_mask:0' shape=(None, 512) dtype=int32>]
model outputs: [<tf.Tensor 'predictions/Identity:0' shape=(None, 512, 21128) dtype=float32>, <tf.Tensor 'relations/Identity:0' shape=(None, 2) dtype=float32>]
```

You can implement a custom `Strategy` to load pretrained models from anywhere.
The `transformers_keras.adapters.albert_adapter.ChineseAlbertLargeStrategy` is a good example.

Then, you can use this model to do anything you want!
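
The returned `vocab_file` is the path to the checkpoint's vocabulary. As a quick sanity check — assuming the standard one-token-per-line vocab format — its size should match the 21128-way `predictions` dimension printed above:

```python
# Continuing the example above: a standard vocab file has one token per line,
# so its length should match the last dimension of the 'predictions' output.
with open(vocab_file, encoding='utf-8') as f:
    vocab_size = sum(1 for _ in f)
print('vocab size:', vocab_size)  # expected: 21128 for this checkpoint
```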
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="transformers_keras",
version="0.1.2",
version="0.1.3",
description="Transformer-based models implemented in tensorflow 2.x(Keras)",
long_description=long_description,
long_description_content_type="text/markdown",
50 changes: 47 additions & 3 deletions transformers_keras/__init__.py
@@ -37,7 +37,7 @@
from .runners import AlbertRunner, BertRunner, TransformerRunner

__name__ = 'transformers_keras'
__version__ = '0.1.2'
__version__ = '0.1.3'

logging.basicConfig(format="%(asctime)s %(levelname)s %(filename)15s %(lineno)4d] %(message)s", level=logging.INFO)

@@ -55,8 +55,8 @@ def build_pretraining_bert_model(model_config):
    bert = Bert4PreTraining(**model_config)
    outputs = bert(inputs)

    predictions = tf.keras.layers.Lambda(lambda x: x[0], name='predictions')(outputs[0])
    relations = tf.keras.layers.Lambda(lambda x: x[1], name='relations')(outputs[1])
    predictions = tf.keras.layers.Lambda(lambda x: x, name='predictions')(outputs[0])
    relations = tf.keras.layers.Lambda(lambda x: x, name='relations')(outputs[1])

    model = tf.keras.Model(inputs=inputs, outputs=[predictions, relations])
    lr = model_config.get('learning_rate', 3e-5)
@@ -78,3 +78,47 @@ def build_pretraining_bert_model(model_config):
        })
    model.summary()
    return model


def build_pretraining_albert_model(model_config):
    max_sequence_length = model_config.get('max_positions', 512)
    input_ids = tf.keras.layers.Input(
        shape=(max_sequence_length,), dtype=tf.int32, name='input_ids')
    input_mask = tf.keras.layers.Input(
        shape=(max_sequence_length,), dtype=tf.int32, name='input_mask')
    segment_ids = tf.keras.layers.Input(
        shape=(max_sequence_length,), dtype=tf.int32, name='segment_ids')

    inputs = (input_ids, segment_ids, input_mask)
    albert = Albert4PreTraining(**model_config)
    predictions, relations, all_states, all_attn_weights = albert(inputs=inputs)

    predictions = tf.keras.layers.Lambda(lambda x: x, name='predictions')(predictions)
    relations = tf.keras.layers.Lambda(lambda x: x, name='relations')(relations)

    model = tf.keras.Model(
        inputs=[input_ids, segment_ids, input_mask], outputs=[predictions, relations])

    lr = model_config.get('learning_rate', 3e-5)
    epsilon = model_config.get('epsilon', 1e-12)
    clipnorm = model_config.get('clipnorm', 1.0)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr, epsilon=epsilon, clipnorm=clipnorm),
        loss={
            'predictions': MaskedSparseCategoricalCrossentropy(
                mask_id=0, from_logits=True, name='pred_loss'),
            'relations': tf.keras.losses.CategoricalCrossentropy(
                from_logits=True, name='rel_loss'),
        },
        metrics={
            'predictions': [
                MaskedSparseCategoricalAccuracy(
                    mask_id=0, from_logits=False, name='pred_acc'),
            ],
            'relations': [
                tf.keras.metrics.CategoricalAccuracy(name='rel_acc'),
            ]
        })
    model.summary()
    return model
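
The identity `Lambda` wrappers above exist only to give the two outputs stable layer names, so the `loss`/`metrics` dicts passed to `compile` can address them by key. A self-contained toy sketch of the same pattern (the shapes and layers below are illustrative only, unrelated to the real BERT/ALBERT graphs):

```python
import tensorflow as tf

# Toy two-output model: identity Lambda layers only rename the outputs so that
# compile() can route losses and metrics by key ('predictions', 'relations').
inputs = tf.keras.layers.Input(shape=(4,), name='x')
hidden = tf.keras.layers.Dense(8, activation='relu')(inputs)
predictions = tf.keras.layers.Lambda(lambda t: t, name='predictions')(tf.keras.layers.Dense(5)(hidden))
relations = tf.keras.layers.Lambda(lambda t: t, name='relations')(tf.keras.layers.Dense(2)(hidden))

toy = tf.keras.Model(inputs=inputs, outputs=[predictions, relations])
toy.compile(
    optimizer='adam',
    loss={
        'predictions': tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        'relations': tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    },
    metrics={
        'predictions': ['sparse_categorical_accuracy'],
        'relations': ['categorical_accuracy'],
    })
toy.summary()
```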
5 changes: 3 additions & 2 deletions transformers_keras/adapters/__init__.py
@@ -1,2 +1,3 @@
from .abstract_adapter import AbstractAdapter
from .bert_adapter import BertAdapter
from .abstract_adapter import AbstractAdapter, AbstractStrategy, PretrainedModelAdapter
from .albert_adapter import AlbertAdapter, ChineseAlbertLargeStrategy
from .bert_adapter import BertAdapter, ChineseBertBaseStrategy
94 changes: 93 additions & 1 deletion transformers_keras/adapters/abstract_adapter.py
@@ -1,9 +1,101 @@
import abc
import logging
import os

import tensorflow as tf


class AbstractStrategy(abc.ABC):
    """Pretrained model load strategy."""

    def mapping_config(self, pretrained_config_file):
        """Convert pretrained configs to model configs.
        Args:
            pretrained_config_file: File path of pretrained model's config
        Returns:
            A python dict, model config
        """
        raise NotImplementedError()

    def build_model(self, model_config):
        """Build and compile the model according to the model config.
        Args:
            model_config: A python dict, model's config
        Returns:
            A keras model, instance of `tf.keras.Model`, compiled.
        """
        raise NotImplementedError()

    def mapping_variables(self, model_config, model, ckpt):
        """Map pretrained variables to the model's variables.
        Args:
            model_config: A python dict, model's config
            model: A keras model, compiled
            ckpt: Python str, path of the pretrained checkpoint
        Returns:
            Python dict, variables' name mapping
        """
        raise NotImplementedError()

    def zip_weights(self, model, ckpt, variables_mapping):
        """Zip model weights with their pretrained values.
        Args:
            model: A keras model, compiled
            ckpt: Python str, path of the pretrained checkpoint
            variables_mapping: Python dict, variables' name mapping
        Returns:
            A list of tuples (model_weight, pretrained_weight)
        """
        raise NotImplementedError()


class AbstractAdapter(abc.ABC):

    @abc.abstractmethod
    def adapte(self, pretrain_model_dir, checkpoint, **kwargs):
    def adapte(self, pretrain_model_dir, **kwargs):
        raise NotImplementedError()

    def _parse_files(self, pretrain_model_dir):
        config_file, ckpt, vocab = None, None, None
        if not os.path.exists(pretrain_model_dir):
            logging.info('pretrain model dir: {} does not exist.'.format(pretrain_model_dir))
            return None, None, None
        for f in os.listdir(pretrain_model_dir):
            if str(f).endswith('config.json'):
                config_file = os.path.join(pretrain_model_dir, f)
            if 'vocab' in str(f):
                vocab = os.path.join(pretrain_model_dir, f)
            if 'ckpt' in str(f):
                n = '.'.join(str(f).split('.')[:-1])
                ckpt = os.path.join(pretrain_model_dir, n)
        return config_file, ckpt, vocab


class PretrainedModelAdapter(AbstractAdapter):
    """Base class of pretrained models' adapters."""

    def __init__(self, strategy: AbstractStrategy):
        """Init.
        Args:
            strategy: An instance of `AbstractStrategy`
        """
        super().__init__()
        self.strategy = strategy

    def adapte(self, pretrain_model_dir, **kwargs):
        config_file, ckpt, vocab_file = self._parse_files(pretrain_model_dir)
        model_config = self.strategy.mapping_config(config_file)
        model = self.strategy.build_model(model_config)
        names_mapping = self.strategy.mapping_variables(model_config, model, ckpt)
        weights_values = self.strategy.zip_weights(model, ckpt, names_mapping)
        tf.keras.backend.batch_set_value(weights_values)
        return model, vocab_file