From 254356005e87f005f2e41537a8bc8e4e247c8cda Mon Sep 17 00:00:00 2001 From: lvyufeng Date: Tue, 16 Apr 2024 16:00:16 +0800 Subject: [PATCH] update to ms2.2 --- 5-1.Transformer/Transformer.ipynb | 267 +++++------ 5-1.Transformer/Transformer_pytorch.ipynb | 486 ++++++++++++++----- 5-2.BERT/BERT.ipynb | 63 ++- 5-2.BERT/BERT_pytorch.ipynb | 538 +++++++++++----------- 4 files changed, 780 insertions(+), 574 deletions(-) diff --git a/5-1.Transformer/Transformer.ipynb b/5-1.Transformer/Transformer.ipynb index a7f811b..b98ce69 100644 --- a/5-1.Transformer/Transformer.ipynb +++ b/5-1.Transformer/Transformer.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "saving-region", "metadata": {}, "outputs": [], @@ -11,7 +11,7 @@ "import numpy as np\n", "import mindspore.nn as nn\n", "import mindspore.ops as ops\n", - "from mindspore import Tensor, ms_function\n", + "from mindspore import Tensor\n", "import matplotlib.pyplot as plt\n", "from layers import Dense, Embedding, Conv1d\n", "# S: Symbol that shows starting of decoding input\n", @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "burning-dining", "metadata": {}, "outputs": [], @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "resistant-giant", "metadata": {}, "outputs": [], @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "confirmed-console", "metadata": {}, "outputs": [], @@ -63,27 +63,27 @@ " batch_size, len_q = seq_q.shape\n", " batch_size, len_k = seq_k.shape\n", " \n", - " pad_attn_mask = ops.equal(seq_k, 0)\n", - " pad_attn_mask = pad_attn_mask.expand_dims(1) # batch_size x 1 x len_k(=len_q), one is masking\n", - "\n", - " return ops.BroadcastTo((batch_size, len_q, len_k))(pad_attn_mask) # batch_size x len_q x len_k" + " pad_attn_mask = seq_k.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), one is masking\n", + 
" return pad_attn_mask.broadcast_to((batch_size, len_q, len_k)) # batch_size x len_q x len_k" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "parallel-magic", "metadata": {}, "outputs": [], "source": [ - "def get_attn_subsequent_mask(subsequent_mask):\n", - " subsequent_mask = subsequent_mask.expand_dims(0)\n", + "def get_attn_subsequent_mask(seq):\n", + " attn_shape = [seq.shape[0], seq.shape[1], seq.shape[1]]\n", + " subsequent_mask = np.triu(np.ones(attn_shape), k=1)\n", + " subsequent_mask = Tensor.from_numpy(subsequent_mask).to(mindspore.uint8)\n", " return subsequent_mask" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "capable-target", "metadata": {}, "outputs": [], @@ -91,61 +91,57 @@ "class ScaledDotProductAttention(nn.Cell):\n", " def __init__(self, d_k):\n", " super().__init__()\n", - " self.scale = Tensor(d_k, mindspore.float32)\n", " self.softmax = nn.Softmax(axis=-1)\n", + " self.d_k = Tensor(d_k)\n", " \n", " def construct(self, Q, K, V, attn_mask):\n", - " K = K.transpose((0, 1, 3, 2))\n", - " scores = ops.matmul(Q, K) / ops.sqrt(self.scale) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", + " scores = ops.matmul(Q, K.swapaxes(-1, -2)) / ops.sqrt(self.d_k)# scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", " scores = scores.masked_fill(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.\n", - " attn = self.softmax(scores)\n", + " attn = ops.softmax(scores)\n", " context = ops.matmul(attn, V)\n", " return context, attn" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "interim-selection", "metadata": {}, "outputs": [], "source": [ "class MultiHeadAttention(nn.Cell):\n", - " def __init__(self, d_model, d_k, n_heads):\n", + " def __init__(self, d_model, d_k, d_v, n_heads):\n", " super().__init__()\n", " self.d_k = d_k\n", + " self.d_v = d_v\n", " self.n_heads = n_heads\n", " self.W_Q = 
Dense(d_model, d_k * n_heads)\n", " self.W_K = Dense(d_model, d_k * n_heads)\n", - " self.W_V = Dense(d_model, d_k * n_heads)\n", - " self.linear = Dense(n_heads * d_k, d_model)\n", + " self.W_V = Dense(d_model, d_v * n_heads)\n", + " self.linear = Dense(n_heads * d_v, d_model)\n", " self.layer_norm = nn.LayerNorm((d_model, ), epsilon=1e-5)\n", " self.attention = ScaledDotProductAttention(d_k)\n", " \n", " def construct(self, Q, K, V, attn_mask):\n", " # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]\n", " residual, batch_size = Q, Q.shape[0]\n", - " q_s = self.W_Q(Q).view((batch_size, -1, self.n_heads, self.d_k)) \n", - " k_s = self.W_K(K).view((batch_size, -1, self.n_heads, self.d_k)) \n", - " v_s = self.W_V(V).view((batch_size, -1, self.n_heads, self.d_k)) \n", " # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n", - " q_s = q_s.transpose((0, 2, 1, 3)) # q_s: [batch_size x n_heads x len_q x d_k]\n", - " k_s = k_s.transpose((0, 2, 1, 3)) # k_s: [batch_size x n_heads x len_k x d_k]\n", - " v_s = v_s.transpose((0, 2, 1, 3)) # v_s: [batch_size x n_heads x len_k x d_v]\n", + " q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).swapaxes(1,2) # q_s: [batch_size x n_heads x len_q x d_k]\n", + " k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).swapaxes(1,2) # k_s: [batch_size x n_heads x len_k x d_k]\n", + " v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).swapaxes(1,2) # v_s: [batch_size x n_heads x len_k x d_v]\n", + "\n", + " attn_mask = attn_mask.unsqueeze(1).tile((1, n_heads, 1, 1)) # attn_mask : [batch_size x n_heads x len_q x len_k]\n", "\n", - " attn_mask = attn_mask.expand_dims(1)\n", - " attn_mask = ops.tile(attn_mask, (1, self.n_heads, 1, 1)) # attn_mask : [batch_size x n_heads x len_q x len_k]\n", - " \n", " # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", " context, attn = 
self.attention(q_s, k_s, v_s, attn_mask)\n", - " context = context.transpose((0, 2, 1, 3)).view((batch_size, -1, self.n_heads * self.d_k)) # context: [batch_size x len_q x n_heads * d_v]\n", - " output = self.linear(context) \n", + " context = context.swapaxes(1, 2).view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n", + " output = self.linear(context)\n", " return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "official-fetish", "metadata": {}, "outputs": [], @@ -160,25 +156,22 @@ " \n", " def construct(self, inputs):\n", " residual = inputs # inputs : [batch_size, len_q, d_model]\n", - " output = inputs.transpose((0, 2, 1))\n", - " output = self.conv1(output)\n", - " output = self.relu(output)\n", - " output = self.conv2(output)\n", - " output = output.transpose((0, 2, 1))\n", + " output = self.relu(self.conv1(inputs.swapaxes(1, 2)))\n", + " output = self.conv2(output).swapaxes(1, 2)\n", " return self.layer_norm(output + residual)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "incredible-packet", "metadata": {}, "outputs": [], "source": [ "class EncoderLayer(nn.Cell):\n", - " def __init__(self, d_model, d_k, n_heads, d_ff):\n", + " def __init__(self, d_model, d_k, d_v, n_heads, d_ff):\n", " super().__init__()\n", - " self.enc_self_attn = MultiHeadAttention(d_model, d_k, n_heads)\n", + " self.enc_self_attn = MultiHeadAttention(d_model, d_k, d_v, n_heads)\n", " self.pos_ffn = PoswiseFeedForward(d_ff, d_model)\n", " \n", " def construct(self, enc_inputs, enc_self_attn_mask):\n", @@ -189,16 +182,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "greater-samba", "metadata": {}, "outputs": [], "source": [ "class DecoderLayer(nn.Cell):\n", - " def __init__(self, d_model, d_k, n_heads, d_ff):\n", + " def __init__(self, d_model, d_k, d_v, n_heads, 
d_ff):\n", " super().__init__()\n", - " self.dec_self_attn = MultiHeadAttention(d_model, d_k, n_heads)\n", - " self.dec_enc_attn = MultiHeadAttention(d_model, d_k, n_heads)\n", + " self.dec_self_attn = MultiHeadAttention(d_model, d_k, d_v, n_heads)\n", + " self.dec_enc_attn = MultiHeadAttention(d_model, d_k, d_v, n_heads)\n", " self.pos_ffn = PoswiseFeedForward(d_ff, d_model)\n", " \n", " def construct(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):\n", @@ -210,17 +203,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "atmospheric-armstrong", "metadata": {}, "outputs": [], "source": [ "class Encoder(nn.Cell):\n", - " def __init__(self, src_vocab_size, d_model, d_k, n_heads, d_ff, n_layers, src_len):\n", + " def __init__(self, src_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, src_len):\n", " super().__init__()\n", " self.src_emb = Embedding(src_vocab_size, d_model)\n", " self.pos_emb = Embedding.from_pretrained_embedding(get_sinusoid_encoding_table(src_len+1, d_model), freeze=True)\n", - " self.layers = nn.CellList([EncoderLayer(d_model, d_k, n_heads, d_ff) for _ in range(n_layers)])\n", + " self.layers = nn.CellList([EncoderLayer(d_model, d_k, d_v, n_heads, d_ff) for _ in range(n_layers)])\n", " # temp positional indexes\n", " self.pos = Tensor([[1, 2, 3, 4, 0]])\n", " \n", @@ -237,29 +230,23 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "seeing-germany", "metadata": {}, "outputs": [], "source": [ "class Decoder(nn.Cell):\n", - " def __init__(self, tgt_vocab_size, d_model, d_k, n_heads, d_ff, n_layers, tgt_len):\n", + " def __init__(self, tgt_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, tgt_len):\n", " super().__init__()\n", " self.tgt_emb = Embedding(tgt_vocab_size, d_model)\n", " self.pos_emb = Embedding.from_pretrained_embedding(get_sinusoid_encoding_table(tgt_len+1, d_model), freeze=True)\n", - " self.layers = 
nn.CellList([DecoderLayer(d_model, d_k, n_heads, d_ff) for _ in range(n_layers)])\n", - " \n", - " # temp positional indexes\n", - " self.pos = Tensor([[5, 1, 2, 3, 4]])\n", - " \n", - " ones = np.ones(shape=(tgt_len, tgt_len))\n", - " self.subsequent_mask = Tensor(np.triu(ones, k=1), dtype=mindspore.float32)\n", - " \n", + " self.layers = nn.CellList([DecoderLayer(d_model, d_k, d_v, n_heads, d_ff) for _ in range(n_layers)])\n", + "\n", " def construct(self, dec_inputs, enc_inputs, enc_outputs):\n", " # dec_inputs : [batch_size x target_len]\n", - " dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(self.pos)\n", + " dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(Tensor([[5,1,2,3,4]]))\n", " dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)\n", - " dec_self_attn_subsequent_mask = get_attn_subsequent_mask(self.subsequent_mask)\n", + " dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)\n", " dec_self_attn_mask = ops.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)\n", " \n", " dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)\n", @@ -274,16 +261,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "presidential-bailey", "metadata": {}, "outputs": [], "source": [ "class Transformer(nn.Cell):\n", - " def __init__(self, d_model, d_k, n_heads, d_ff, n_layers, src_vocab_size, tgt_vocab_size, src_len, tgt_len):\n", + " def __init__(self, d_model, d_k, d_v, n_heads, d_ff, n_layers, src_vocab_size, tgt_vocab_size, src_len, tgt_len):\n", " super(Transformer, self).__init__()\n", - " self.encoder = Encoder(src_vocab_size, d_model, d_k, n_heads, d_ff, n_layers, src_len)\n", - " self.decoder = Decoder(tgt_vocab_size, d_model, d_k, n_heads, d_ff, n_layers, tgt_len)\n", + " self.encoder = Encoder(src_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, src_len)\n", + " self.decoder = Decoder(tgt_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, tgt_len)\n", " 
self.projection = Dense(d_model, tgt_vocab_size, has_bias=False)\n", "\n", " def construct(self, enc_inputs, dec_inputs):\n", @@ -295,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "hairy-internet", "metadata": {}, "outputs": [], @@ -316,24 +303,24 @@ "\n", "d_model = 512 # Embedding Size\n", "d_ff = 2048 # FeedForward dimension\n", - "d_k = 64 # dimension of K(=Q), V\n", + "d_k = d_v = 64 # dimension of K(=Q), V\n", "n_layers = 6 # number of Encoder of Decoder Layer\n", "n_heads = 8 # number of heads in Multi-Head Attention" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "pacific-pollution", "metadata": {}, "outputs": [], "source": [ - "model = Transformer(d_model, d_k, n_heads, d_ff, n_layers, src_vocab_size, tgt_vocab_size, src_len, tgt_len)" + "model = Transformer(d_model, d_k, d_v, n_heads, d_ff, n_layers, src_vocab_size, tgt_vocab_size, src_len, tgt_len)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "activated-ordinary", "metadata": {}, "outputs": [], @@ -346,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "0f7e451f", "metadata": {}, "outputs": [], @@ -360,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "223f4a3d", "metadata": {}, "outputs": [], @@ -370,12 +357,12 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "3c96e055", "metadata": {}, "outputs": [], "source": [ - "@ms_function\n", + "@mindspore.jit\n", "def train_step(enc_inputs, dec_inputs, target_batch):\n", " loss, grads = grad_fn(enc_inputs, dec_inputs, target_batch)\n", " optimizer(grads)\n", @@ -384,34 +371,50 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "id": "human-reverse", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[ERROR] 
CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:30.934.784 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/3030654789.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:31.085.333 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/1296245474.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:31.085.953 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/1296245474.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:31.086.786 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/1296245474.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:35.478.043 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/2926503423.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:35.478.088 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/2926503423.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:35.663.493 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/3030654789.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:35.663.514 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/1296245474.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:35.665.780 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/1296245474.py]\n", + "[ERROR] CORE(1265263,7f03d442f4c0,python):2024-04-16-15:48:35.667.354 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1265263/1296245474.py]\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": 
[ - "Epoch: 0001 cost = 2.596976\n", - "Epoch: 0002 cost = 1.083525\n", - "Epoch: 0003 cost = 0.804770\n", - "Epoch: 0004 cost = 0.334796\n", - "Epoch: 0005 cost = 0.180342\n", - "Epoch: 0006 cost = 0.042456\n", - "Epoch: 0007 cost = 0.009277\n", - "Epoch: 0008 cost = 0.003621\n", - "Epoch: 0009 cost = 0.002399\n", - "Epoch: 0010 cost = 0.002193\n", - "Epoch: 0011 cost = 0.002270\n", - "Epoch: 0012 cost = 0.002314\n", - "Epoch: 0013 cost = 0.002193\n", - "Epoch: 0014 cost = 0.001918\n", - "Epoch: 0015 cost = 0.001599\n", - "Epoch: 0016 cost = 0.001310\n", - "Epoch: 0017 cost = 0.001075\n", - "Epoch: 0018 cost = 0.000890\n", - "Epoch: 0019 cost = 0.000741\n", - "Epoch: 0020 cost = 0.000619\n" + "Epoch: 0001 cost = 2.367968\n", + "Epoch: 0002 cost = 0.938194\n", + "Epoch: 0003 cost = 0.512835\n", + "Epoch: 0004 cost = 0.167661\n", + "Epoch: 0005 cost = 0.048837\n", + "Epoch: 0006 cost = 0.010902\n", + "Epoch: 0007 cost = 0.003625\n", + "Epoch: 0008 cost = 0.001862\n", + "Epoch: 0009 cost = 0.001336\n", + "Epoch: 0010 cost = 0.001196\n", + "Epoch: 0011 cost = 0.001198\n", + "Epoch: 0012 cost = 0.001234\n", + "Epoch: 0013 cost = 0.001247\n", + "Epoch: 0014 cost = 0.001202\n", + "Epoch: 0015 cost = 0.001100\n", + "Epoch: 0016 cost = 0.000960\n", + "Epoch: 0017 cost = 0.000809\n", + "Epoch: 0018 cost = 0.000667\n", + "Epoch: 0019 cost = 0.000544\n", + "Epoch: 0020 cost = 0.000444\n" ] } ], @@ -427,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "id": "rubber-difficulty", "metadata": {}, "outputs": [ @@ -443,12 +446,12 @@ "# Test\n", "predict, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)\n", "predict = predict.asnumpy().argmax(1)\n", - "print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])\n" + "print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "id": "postal-picking", 
"metadata": {}, "outputs": [], @@ -466,72 +469,10 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "dried-depth", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "first head of last state enc_self_attns\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/lvyufeng/miniconda3/envs/ms1.9/lib/python3.7/site-packages/ipykernel_launcher.py:7: UserWarning: FixedFormatter should only be used together with FixedLocator\n", - " import sys\n", - "/home/lvyufeng/miniconda3/envs/ms1.9/lib/python3.7/site-packages/ipykernel_launcher.py:8: UserWarning: FixedFormatter should only be used together with FixedLocator\n", - " \n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "first head of last state dec_self_attns\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "first head of last state dec_enc_attns\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print('first head of last state enc_self_attns')\n", "showgraph(enc_self_attns)\n", @@ -546,7 +487,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.13 ('ms1.9')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -560,7 +501,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/5-1.Transformer/Transformer_pytorch.ipynb b/5-1.Transformer/Transformer_pytorch.ipynb index f0077c8..b9378a3 100644 --- a/5-1.Transformer/Transformer_pytorch.ipynb +++ b/5-1.Transformer/Transformer_pytorch.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "muslim-material", "metadata": {}, "outputs": [], @@ -19,14 +19,30 @@ "\n", "# S: Symbol that shows starting of decoding input\n", "# E: Symbol that shows starting of decoding output\n", - "# P: Symbol that will fill in blank sequence if current batch data size is short than time steps\n", - "\n", + "# P: Symbol that will fill in blank sequence if current batch data size is short than time steps" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "15708426-6f32-4baf-9646-2b41bf94f84a", + "metadata": {}, + "outputs": [], + "source": [ "def make_batch(sentences):\n", " input_batch = [[src_vocab[n] for n in sentences[0].split()]]\n", " output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]\n", " target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]\n", - " return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)\n", - "\n", + " return torch.LongTensor(input_batch), torch.LongTensor(output_batch), torch.LongTensor(target_batch)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "853b2237-f05f-4f5a-b2b6-10377cd17244", + 
"metadata": {}, + "outputs": [], + "source": [ "def get_sinusoid_encoding_table(n_position, d_model):\n", " def cal_angle(position, hid_idx):\n", " return position / np.power(10000, 2 * (hid_idx // 2) / d_model)\n", @@ -36,100 +52,171 @@ " sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])\n", " sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i\n", " sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1\n", - " return torch.FloatTensor(sinusoid_table)\n", - "\n", + " return torch.FloatTensor(sinusoid_table)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "392a3a9d-f02c-450c-bfe8-0432e2ef4ee7", + "metadata": {}, + "outputs": [], + "source": [ "def get_attn_pad_mask(seq_q, seq_k):\n", " batch_size, len_q = seq_q.size()\n", " batch_size, len_k = seq_k.size()\n", " # eq(zero) is PAD token\n", " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), one is masking\n", - " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k\n", - "\n", + " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ef758198-6584-46f8-8884-63a63d7bd1d7", + "metadata": {}, + "outputs": [], + "source": [ "def get_attn_subsequent_mask(seq):\n", " attn_shape = [seq.size(0), seq.size(1), seq.size(1)]\n", " subsequent_mask = np.triu(np.ones(attn_shape), k=1)\n", " subsequent_mask = torch.from_numpy(subsequent_mask).byte()\n", - " return subsequent_mask\n", - "\n", + " return subsequent_mask" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "711b197f-55b0-4453-ad53-9dc371272784", + "metadata": {}, + "outputs": [], + "source": [ "class ScaledDotProductAttention(nn.Module):\n", - " def __init__(self):\n", + " def __init__(self, d_k):\n", " super(ScaledDotProductAttention, self).__init__()\n", + " self.softmax = nn.Softmax(dim=-1)\n", + " self.d_k 
= d_k\n", "\n", " def forward(self, Q, K, V, attn_mask):\n", - " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)# scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", + " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(self.d_k)# scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", " scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.\n", - " attn = nn.Softmax(dim=-1)(scores)\n", + " attn = self.softmax(scores)\n", " context = torch.matmul(attn, V)\n", - " return context, attn\n", - "\n", + " return context, attn" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "56ec0a47-4018-48a1-8826-f58b31438502", + "metadata": {}, + "outputs": [], + "source": [ "class MultiHeadAttention(nn.Module):\n", - " def __init__(self):\n", + " def __init__(self, d_model, d_k, d_v, n_heads):\n", " super(MultiHeadAttention, self).__init__()\n", + " self.d_k = d_k\n", + " self.d_v = d_v\n", + " self.n_heads = n_heads\n", " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n", " self.W_K = nn.Linear(d_model, d_k * n_heads)\n", " self.W_V = nn.Linear(d_model, d_v * n_heads)\n", " self.linear = nn.Linear(n_heads * d_v, d_model)\n", " self.layer_norm = nn.LayerNorm(d_model)\n", + " self.attention = ScaledDotProductAttention(d_k)\n", "\n", " def forward(self, Q, K, V, attn_mask):\n", " # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]\n", " residual, batch_size = Q, Q.size(0)\n", " # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n", - " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]\n", - " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]\n", - " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]\n", + " q_s = 
self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]\n", + " k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]\n", + " v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]\n", "\n", " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]\n", "\n", " # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", - " context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", + " context, attn = self.attention(q_s, k_s, v_s, attn_mask)\n", " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n", " output = self.linear(context)\n", - " return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]\n", - "\n", - "class PoswiseFeedForwardNet(nn.Module):\n", - " def __init__(self):\n", - " super(PoswiseFeedForwardNet, self).__init__()\n", + " return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ff8f89d4-809f-4e45-8f37-e02784fb96ab", + "metadata": {}, + "outputs": [], + "source": [ + "class PoswiseFeedForward(nn.Module):\n", + " def __init__(self, d_ff, d_model):\n", + " super(PoswiseFeedForward, self).__init__()\n", " self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)\n", " self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)\n", " self.layer_norm = nn.LayerNorm(d_model)\n", + " self.relu = nn.ReLU()\n", "\n", " def forward(self, inputs):\n", " residual = inputs # inputs : [batch_size, len_q, d_model]\n", - " output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))\n", + " output = 
self.relu(self.conv1(inputs.transpose(1, 2)))\n", " output = self.conv2(output).transpose(1, 2)\n", - " return self.layer_norm(output + residual)\n", - "\n", + " return self.layer_norm(output + residual)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "855a975d-37f5-4c9a-a352-debc00eb0916", + "metadata": {}, + "outputs": [], + "source": [ "class EncoderLayer(nn.Module):\n", - " def __init__(self):\n", - " super(EncoderLayer, self).__init__()\n", - " self.enc_self_attn = MultiHeadAttention()\n", - " self.pos_ffn = PoswiseFeedForwardNet()\n", + " def __init__(self, d_model, d_k, d_v, n_heads, d_ff):\n", + " super().__init__()\n", + " self.enc_self_attn = MultiHeadAttention(d_model, d_k, d_v, n_heads)\n", + " self.pos_ffn = PoswiseFeedForward(d_ff, d_model)\n", "\n", " def forward(self, enc_inputs, enc_self_attn_mask):\n", " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V\n", " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n", - " return enc_outputs, attn\n", - "\n", + " return enc_outputs, attn" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c90050db-a25c-4516-b653-29ef9e82b968", + "metadata": {}, + "outputs": [], + "source": [ "class DecoderLayer(nn.Module):\n", - " def __init__(self):\n", - " super(DecoderLayer, self).__init__()\n", - " self.dec_self_attn = MultiHeadAttention()\n", - " self.dec_enc_attn = MultiHeadAttention()\n", - " self.pos_ffn = PoswiseFeedForwardNet()\n", + " def __init__(self, d_model, d_k, d_v, n_heads, d_ff):\n", + " super().__init__()\n", + " self.dec_self_attn = MultiHeadAttention(d_model, d_k, d_v, n_heads)\n", + " self.dec_enc_attn = MultiHeadAttention(d_model, d_k, d_v, n_heads)\n", + " self.pos_ffn = PoswiseFeedForward(d_ff, d_model)\n", "\n", " def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):\n", " dec_outputs, dec_self_attn = 
self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)\n", " dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)\n", " dec_outputs = self.pos_ffn(dec_outputs)\n", - " return dec_outputs, dec_self_attn, dec_enc_attn\n", - "\n", + " return dec_outputs, dec_self_attn, dec_enc_attn" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "497cafe9-e842-48fc-9081-52c8ec010569", + "metadata": {}, + "outputs": [], + "source": [ "class Encoder(nn.Module):\n", - " def __init__(self):\n", + " def __init__(self, src_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, src_len):\n", " super(Encoder, self).__init__()\n", " self.src_emb = nn.Embedding(src_vocab_size, d_model)\n", " self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1, d_model),freeze=True)\n", - " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n", + " self.layers = nn.ModuleList([EncoderLayer(d_model, d_k, d_v, n_heads, d_ff) for _ in range(n_layers)])\n", "\n", " def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]\n", " enc_outputs = self.src_emb(enc_inputs) + self.pos_emb(torch.LongTensor([[1,2,3,4,0]]))\n", @@ -138,14 +225,22 @@ " for layer in self.layers:\n", " enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)\n", " enc_self_attns.append(enc_self_attn)\n", - " return enc_outputs, enc_self_attns\n", - "\n", + " return enc_outputs, enc_self_attns" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7b18e014-4e63-472a-8fb0-8635aa1541b7", + "metadata": {}, + "outputs": [], + "source": [ "class Decoder(nn.Module):\n", - " def __init__(self):\n", + " def __init__(self, tgt_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, tgt_len):\n", " super(Decoder, self).__init__()\n", " self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)\n", " self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1, 
d_model),freeze=True)\n", - " self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])\n", + " self.layers = nn.ModuleList([DecoderLayer(d_model, d_k, d_v, n_heads, d_ff) for _ in range(n_layers)])\n", "\n", " def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]\n", " dec_outputs = self.tgt_emb(dec_inputs) + self.pos_emb(torch.LongTensor([[5,1,2,3,4]]))\n", @@ -160,20 +255,152 @@ " dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)\n", " dec_self_attns.append(dec_self_attn)\n", " dec_enc_attns.append(dec_enc_attn)\n", - " return dec_outputs, dec_self_attns, dec_enc_attns\n", - "\n", + " return dec_outputs, dec_self_attns, dec_enc_attns" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "73941748-7a78-4d14-90a8-0663ec9deadf", + "metadata": {}, + "outputs": [], + "source": [ "class Transformer(nn.Module):\n", - " def __init__(self):\n", + " def __init__(self, d_model, d_k, d_v, n_heads, d_ff, n_layers, src_vocab_size, tgt_vocab_size, src_len, tgt_len):\n", " super(Transformer, self).__init__()\n", - " self.encoder = Encoder()\n", - " self.decoder = Decoder()\n", + " self.encoder = Encoder(src_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, src_len)\n", + " self.decoder = Decoder(tgt_vocab_size, d_model, d_k, d_v, n_heads, d_ff, n_layers, tgt_len)\n", " self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)\n", + "\n", " def forward(self, enc_inputs, dec_inputs):\n", " enc_outputs, enc_self_attns = self.encoder(enc_inputs)\n", " dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)\n", " dec_logits = self.projection(dec_outputs) # dec_logits : [batch_size x src_vocab_size x tgt_vocab_size]\n", - " return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns\n", + " return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, 
dec_self_attns, dec_enc_attns" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "be4ea15c-5147-4917-ad73-bf42f253f5a9", + "metadata": {}, + "outputs": [], + "source": [ + "sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']\n", "\n", + "# Transformer Parameters\n", + "# Padding Should be Zero\n", + "src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}\n", + "src_vocab_size = len(src_vocab)\n", + "\n", + "tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}\n", + "number_dict = {i: w for i, w in enumerate(tgt_vocab)}\n", + "tgt_vocab_size = len(tgt_vocab)\n", + "\n", + "src_len = 6 # length of source\n", + "tgt_len = 5 # length of target\n", + "\n", + "d_model = 512 # Embedding Size\n", + "d_ff = 2048 # FeedForward dimension\n", + "d_k = d_v = 64 # dimension of K(=Q), V\n", + "n_layers = 6 # number of Encoder of Decoder Layer\n", + "n_heads = 8 # number of heads in Multi-Head Attention" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "83f611f7-126c-4dbe-ab90-a9d203dbadee", + "metadata": {}, + "outputs": [], + "source": [ + "model = Transformer(d_model, d_k, d_v, n_heads, d_ff, n_layers, src_vocab_size, tgt_vocab_size, src_len, tgt_len)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b818b9a6-9df9-459c-ac47-1d864e7b141b", + "metadata": {}, + "outputs": [], + "source": [ + "criterion = nn.CrossEntropyLoss()\n", + "optimizer = optim.Adam(model.parameters(), lr=0.0001)\n", + "\n", + "enc_inputs, dec_inputs, target_batch = make_batch(sentences)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "39e82729-1316-4377-bb53-58859cdca6bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch: 0001 cost = 2.031222\n", + "Epoch: 0002 cost = 1.260098\n", + "Epoch: 0003 cost = 0.785420\n", + "Epoch: 0004 cost = 0.526297\n", + "Epoch: 0005 cost = 0.073096\n", + "Epoch: 0006 cost = 
0.020095\n", + "Epoch: 0007 cost = 0.012649\n", + "Epoch: 0008 cost = 0.010072\n", + "Epoch: 0009 cost = 0.007779\n", + "Epoch: 0010 cost = 0.005323\n", + "Epoch: 0011 cost = 0.003494\n", + "Epoch: 0012 cost = 0.002412\n", + "Epoch: 0013 cost = 0.001798\n", + "Epoch: 0014 cost = 0.001434\n", + "Epoch: 0015 cost = 0.001191\n", + "Epoch: 0016 cost = 0.001008\n", + "Epoch: 0017 cost = 0.000860\n", + "Epoch: 0018 cost = 0.000732\n", + "Epoch: 0019 cost = 0.000623\n", + "Epoch: 0020 cost = 0.000529\n" + ] + } + ], + "source": [ + "for epoch in range(20):\n", + " optimizer.zero_grad()\n", + " outputs, _, _, _ = model(enc_inputs, dec_inputs)\n", + " loss = criterion(outputs, target_batch.contiguous().view(-1))\n", + " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", + " loss.backward()\n", + " optimizer.step()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a2685684-a92a-4019-aa95-041d97bf7096", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ich mochte ein bier P -> ['i', 'want', 'a', 'beer', 'E']\n" + ] + } + ], + "source": [ + "# Test\n", + "predict, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)\n", + "predict = predict.data.max(1, keepdim=True)[1]\n", + "print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8f8d43a7-f750-40d3-8bc7-f27d7336cecb", + "metadata": {}, + "outputs": [], + "source": [ "def showgraph(attn):\n", " attn = attn[-1].squeeze(0)[0]\n", " attn = attn.squeeze(0).data.numpy()\n", @@ -183,63 +410,112 @@ " ax.matshow(attn, cmap='viridis')\n", " ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)\n", " ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})\n", - " plt.show()\n", - "\n", - "if __name__ == '__main__':\n", - " sentences = ['ich mochte ein bier P', 'S i want a beer', 
'i want a beer E']\n", - "\n", - " # Transformer Parameters\n", - " # Padding Should be Zero\n", - " src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}\n", - " src_vocab_size = len(src_vocab)\n", - "\n", - " tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}\n", - " number_dict = {i: w for i, w in enumerate(tgt_vocab)}\n", - " tgt_vocab_size = len(tgt_vocab)\n", - "\n", - " src_len = 5 # length of source\n", - " tgt_len = 5 # length of target\n", - "\n", - " d_model = 512 # Embedding Size\n", - " d_ff = 2048 # FeedForward dimension\n", - " d_k = d_v = 64 # dimension of K(=Q), V\n", - " n_layers = 6 # number of Encoder of Decoder Layer\n", - " n_heads = 8 # number of heads in Multi-Head Attention\n", - "\n", - " model = Transformer()\n", - "\n", - " criterion = nn.CrossEntropyLoss()\n", - " optimizer = optim.Adam(model.parameters(), lr=0.0001)\n", - "\n", - " enc_inputs, dec_inputs, target_batch = make_batch(sentences)\n", - "\n", - " for epoch in range(20):\n", - " optimizer.zero_grad()\n", - " outputs, _, _, _ = model(enc_inputs, dec_inputs)\n", - " loss = criterion(outputs, target_batch.contiguous().view(-1))\n", - " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " # Test\n", - " predict, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)\n", - " predict = predict.data.max(1, keepdim=True)[1]\n", - " print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])\n", - "\n", - " print('first head of last state enc_self_attns')\n", - " showgraph(enc_self_attns)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "090b2177-65bf-4169-a2f4-afe67ab1a602", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first head of last state enc_self_attns\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/tmp/ipykernel_1262415/2392112591.py:8: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)\n", + "/tmp/ipykernel_1262415/2392112591.py:9: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first head of last state dec_self_attns\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1262415/2392112591.py:8: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)\n", + "/tmp/ipykernel_1262415/2392112591.py:9: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first head of last state dec_enc_attns\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1262415/2392112591.py:8: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " ax.set_xticklabels(['']+sentences[0].split(), fontdict={'fontsize': 14}, rotation=90)\n", + "/tmp/ipykernel_1262415/2392112591.py:9: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " ax.set_yticklabels(['']+sentences[2].split(), fontdict={'fontsize': 14})\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print('first head of last state enc_self_attns')\n", + "showgraph(enc_self_attns)\n", "\n", - " print('first head of last state dec_self_attns')\n", - " showgraph(dec_self_attns)\n", + "print('first head of last state dec_self_attns')\n", + "showgraph(dec_self_attns)\n", "\n", - " print('first head of last state dec_enc_attns')\n", - " showgraph(dec_enc_attns)" + "print('first head of last state dec_enc_attns')\n", + "showgraph(dec_enc_attns)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -253,7 +529,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.5" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/5-2.BERT/BERT.ipynb b/5-2.BERT/BERT.ipynb index 6265d39..5735667 100644 --- a/5-2.BERT/BERT.ipynb +++ b/5-2.BERT/BERT.ipynb @@ -13,8 +13,7 @@ "import mindspore.nn as nn\n", "import mindspore.ops as ops\n", "import mindspore.numpy as mnp\n", - "from layers import Dense, Embedding\n", - "from mindspore import ms_function" + "from layers import Dense, Embedding" ] }, { @@ -104,7 +103,7 @@ "\n", " def construct(self, x, seg):\n", " seq_len = x.shape[1]\n", - " pos = mnp.arange(seq_len, dtype=mindspore.int64)\n", + " pos = ops.arange(seq_len, dtype=mindspore.int64)\n", " pos = pos.expand_dims(0).expand_as(x) # (seq_len,) -> (batch_size, seq_len)\n", " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n", " return self.norm(embedding)" @@ -226,7 +225,7 @@ " n_vocab, n_dim = embed_weight.shape\n", " self.decoder = Dense(n_dim, n_vocab, has_bias=False)\n", " self.decoder.weight = embed_weight\n", - " self.decoder_bias = mindspore.Parameter(mnp.zeros(n_vocab), 'decoder_bias')\n", + " self.decoder_bias = mindspore.Parameter(ops.zeros(n_vocab), 'decoder_bias')\n", "\n", " def construct(self, input_ids, 
segment_ids, masked_pos):\n", " output = self.embedding(input_ids, segment_ids)\n", @@ -340,7 +339,7 @@ "metadata": {}, "outputs": [], "source": [ - "@ms_function\n", + "@mindspore.jit\n", "def train_step(input_ids, segment_ids, masked_pos, masked_tokens, isNext):\n", " loss, grads = grad_fn(input_ids, segment_ids, masked_pos, masked_tokens, isNext)\n", " optimizer(grads)\n", @@ -353,20 +352,28 @@ "id": "81bf550b-8239-440d-9fda-c556dee4552c", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[ERROR] CORE(1267049,7f74549fd4c0,python):2024-04-16-15:56:16.580.126 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1267049/3083615623.py]\n", + "[ERROR] CORE(1267049,7f74549fd4c0,python):2024-04-16-15:56:16.580.172 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_1267049/3083615623.py]\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Epoch: 0010 cost = 54.808880\n", - "Epoch: 0020 cost = 43.177929\n", - "Epoch: 0030 cost = 24.757572\n", - "Epoch: 0040 cost = 17.764444\n", - "Epoch: 0050 cost = 10.951277\n", - "Epoch: 0060 cost = 7.573495\n", - "Epoch: 0070 cost = 5.663257\n", - "Epoch: 0080 cost = 4.734429\n", - "Epoch: 0090 cost = 3.930883\n", - "Epoch: 0100 cost = 3.159842\n" + "Epoch: 0010 cost = 46.552399\n", + "Epoch: 0020 cost = 19.055964\n", + "Epoch: 0030 cost = 15.114850\n", + "Epoch: 0040 cost = 9.543916\n", + "Epoch: 0050 cost = 6.100155\n", + "Epoch: 0060 cost = 2.962293\n", + "Epoch: 0070 cost = 3.004694\n", + "Epoch: 0080 cost = 2.631464\n", + "Epoch: 0090 cost = 2.321460\n", + "Epoch: 0100 cost = 2.230808\n" ] } ], @@ -383,28 +390,10 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "f7da833d-9efb-475f-9aa1-93be15e3ea73", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Hello, how are you? 
I am Romeo.\n", - "Hello, Romeo My name is Juliet. Nice to meet you.\n", - "Nice meet you too. How are you today?\n", - "Great. My baseball team won the competition.\n", - "Oh Congratulations, Juliet\n", - "Thanks you Romeo\n", - "['[CLS]', '[MASK]', 'congratulations', '[MASK]', '[SEP]', 'nice', 'meet', 'you', 'too', 'how', 'are', 'you', 'today', '[SEP]']\n", - "masked tokens list : [Tensor(shape=[], dtype=Int64, value= 28), Tensor(shape=[], dtype=Int64, value= 12)]\n", - "predict masked tokens list : []\n", - "isNext : False\n", - "predict isNext : False\n" - ] - } - ], + "outputs": [], "source": [ "# Predict mask tokens ans isNext\n", "input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(mindspore.Tensor, zip(batch[0]))\n", @@ -424,7 +413,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.13 ('ms1.8')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -438,7 +427,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.13" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/5-2.BERT/BERT_pytorch.ipynb b/5-2.BERT/BERT_pytorch.ipynb index 215f860..e483616 100644 --- a/5-2.BERT/BERT_pytorch.ipynb +++ b/5-2.BERT/BERT_pytorch.ipynb @@ -1,271 +1,271 @@ { - "cells": [ - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# code by Tae Hwan Jung(Jeff Jung) @graykode\n", - "# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch\n", - "# https://github.com/JayParks/transformer, https://github.com/dhlee347/pytorchic-bert\n", - "import math\n", - "import re\n", - "from random import *\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "\n", - "# sample IsNext and NotNext to be same in small batch size\n", - "def make_batch():\n", - " batch = []\n", - " positive = negative = 0\n", - " while positive != batch_size/2 or negative != batch_size/2:\n", - " 
tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences\n", - " tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]\n", - " input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]\n", - " segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n", - "\n", - " # MASK LM\n", - " n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence\n", - " cand_maked_pos = [i for i, token in enumerate(input_ids)\n", - " if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]\n", - " shuffle(cand_maked_pos)\n", - " masked_tokens, masked_pos = [], []\n", - " for pos in cand_maked_pos[:n_pred]:\n", - " masked_pos.append(pos)\n", - " masked_tokens.append(input_ids[pos])\n", - " if random() < 0.8: # 80%\n", - " input_ids[pos] = word_dict['[MASK]'] # make mask\n", - " elif random() < 0.5: # 10%\n", - " index = randint(0, vocab_size - 1) # random index in vocabulary\n", - " input_ids[pos] = word_dict[number_dict[index]] # replace\n", - "\n", - " # Zero Paddings\n", - " n_pad = maxlen - len(input_ids)\n", - " input_ids.extend([0] * n_pad)\n", - " segment_ids.extend([0] * n_pad)\n", - "\n", - " # Zero Padding (100% - 15%) tokens\n", - " if max_pred > n_pred:\n", - " n_pad = max_pred - n_pred\n", - " masked_tokens.extend([0] * n_pad)\n", - " masked_pos.extend([0] * n_pad)\n", - "\n", - " if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:\n", - " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n", - " positive += 1\n", - " elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:\n", - " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n", - " negative += 1\n", - " return batch\n", - "# Proprecessing Finished\n", - "\n", - "def get_attn_pad_mask(seq_q, seq_k):\n", - " batch_size, len_q = 
seq_q.size()\n", - " batch_size, len_k = seq_k.size()\n", - " # eq(zero) is PAD token\n", - " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), one is masking\n", - " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k\n", - "\n", - "def gelu(x):\n", - " \"Implementation of the gelu activation function by Hugging Face\"\n", - " return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))\n", - "\n", - "class Embedding(nn.Module):\n", - " def __init__(self):\n", - " super(Embedding, self).__init__()\n", - " self.tok_embed = nn.Embedding(vocab_size, d_model) # token embedding\n", - " self.pos_embed = nn.Embedding(maxlen, d_model) # position embedding\n", - " self.seg_embed = nn.Embedding(n_segments, d_model) # segment(token type) embedding\n", - " self.norm = nn.LayerNorm(d_model)\n", - "\n", - " def forward(self, x, seg):\n", - " seq_len = x.size(1)\n", - " pos = torch.arange(seq_len, dtype=torch.long)\n", - " pos = pos.unsqueeze(0).expand_as(x) # (seq_len,) -> (batch_size, seq_len)\n", - " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n", - " return self.norm(embedding)\n", - "\n", - "class ScaledDotProductAttention(nn.Module):\n", - " def __init__(self):\n", - " super(ScaledDotProductAttention, self).__init__()\n", - "\n", - " def forward(self, Q, K, V, attn_mask):\n", - " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", - " scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.\n", - " attn = nn.Softmax(dim=-1)(scores)\n", - " context = torch.matmul(attn, V)\n", - " return context, attn\n", - "\n", - "class MultiHeadAttention(nn.Module):\n", - " def __init__(self):\n", - " super(MultiHeadAttention, self).__init__()\n", - " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n", - " self.W_K = nn.Linear(d_model, d_k * n_heads)\n", - " self.W_V = nn.Linear(d_model, 
d_v * n_heads)\n", - " def forward(self, Q, K, V, attn_mask):\n", - " # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]\n", - " residual, batch_size = Q, Q.size(0)\n", - " # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n", - " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]\n", - " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]\n", - " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]\n", - "\n", - " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]\n", - "\n", - " # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", - " context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", - " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n", - " output = nn.Linear(n_heads * d_v, d_model)(context)\n", - " return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]\n", - "\n", - "class PoswiseFeedForwardNet(nn.Module):\n", - " def __init__(self):\n", - " super(PoswiseFeedForwardNet, self).__init__()\n", - " self.fc1 = nn.Linear(d_model, d_ff)\n", - " self.fc2 = nn.Linear(d_ff, d_model)\n", - "\n", - " def forward(self, x):\n", - " # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)\n", - " return self.fc2(gelu(self.fc1(x)))\n", - "\n", - "class EncoderLayer(nn.Module):\n", - " def __init__(self):\n", - " super(EncoderLayer, self).__init__()\n", - " self.enc_self_attn = MultiHeadAttention()\n", - " self.pos_ffn = PoswiseFeedForwardNet()\n", - "\n", - " def forward(self, enc_inputs, enc_self_attn_mask):\n", - " 
enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V\n", - " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n", - " return enc_outputs, attn\n", - "\n", - "class BERT(nn.Module):\n", - " def __init__(self):\n", - " super(BERT, self).__init__()\n", - " self.embedding = Embedding()\n", - " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n", - " self.fc = nn.Linear(d_model, d_model)\n", - " self.activ1 = nn.Tanh()\n", - " self.linear = nn.Linear(d_model, d_model)\n", - " self.activ2 = gelu\n", - " self.norm = nn.LayerNorm(d_model)\n", - " self.classifier = nn.Linear(d_model, 2)\n", - " # decoder is shared with embedding layer\n", - " embed_weight = self.embedding.tok_embed.weight\n", - " n_vocab, n_dim = embed_weight.size()\n", - " self.decoder = nn.Linear(n_dim, n_vocab, bias=False)\n", - " self.decoder.weight = embed_weight\n", - " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n", - "\n", - " def forward(self, input_ids, segment_ids, masked_pos):\n", - " output = self.embedding(input_ids, segment_ids)\n", - " enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n", - " for layer in self.layers:\n", - " output, enc_self_attn = layer(output, enc_self_attn_mask)\n", - " # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]\n", - " # it will be decided by first token(CLS)\n", - " h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]\n", - " logits_clsf = self.classifier(h_pooled) # [batch_size, 2]\n", - "\n", - " masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]\n", - " # get masked position from final output of transformer.\n", - " h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]\n", - " h_masked = self.norm(self.activ2(self.linear(h_masked)))\n", - " logits_lm = 
self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]\n", - "\n", - " return logits_lm, logits_clsf\n", - "\n", - "if __name__ == '__main__':\n", - " # BERT Parameters\n", - " maxlen = 30 # maximum of length\n", - " batch_size = 6\n", - " max_pred = 5 # max tokens of prediction\n", - " n_layers = 6 # number of Encoder of Encoder Layer\n", - " n_heads = 12 # number of heads in Multi-Head Attention\n", - " d_model = 768 # Embedding Size\n", - " d_ff = 768 * 4 # 4*d_model, FeedForward dimension\n", - " d_k = d_v = 64 # dimension of K(=Q), V\n", - " n_segments = 2\n", - "\n", - " text = (\n", - " 'Hello, how are you? I am Romeo.\\n'\n", - " 'Hello, Romeo My name is Juliet. Nice to meet you.\\n'\n", - " 'Nice meet you too. How are you today?\\n'\n", - " 'Great. My baseball team won the competition.\\n'\n", - " 'Oh Congratulations, Juliet\\n'\n", - " 'Thanks you Romeo'\n", - " )\n", - " sentences = re.sub(\"[.,!?\\\\-]\", '', text.lower()).split('\\n') # filter '.', ',', '?', '!'\n", - " word_list = list(set(\" \".join(sentences).split()))\n", - " word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}\n", - " for i, w in enumerate(word_list):\n", - " word_dict[w] = i + 4\n", - " number_dict = {i: w for i, w in enumerate(word_dict)}\n", - " vocab_size = len(word_dict)\n", - "\n", - " token_list = list()\n", - " for sentence in sentences:\n", - " arr = [word_dict[s] for s in sentence.split()]\n", - " token_list.append(arr)\n", - "\n", - " model = BERT()\n", - " criterion = nn.CrossEntropyLoss()\n", - " optimizer = optim.Adam(model.parameters(), lr=0.001)\n", - "\n", - " batch = make_batch()\n", - " input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))\n", - "\n", - " for epoch in range(100):\n", - " optimizer.zero_grad()\n", - " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", - " loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM\n", - " loss_lm = 
(loss_lm.float()).mean()\n", - " loss_clsf = criterion(logits_clsf, isNext) # for sentence classification\n", - " loss = loss_lm + loss_clsf\n", - " if (epoch + 1) % 10 == 0:\n", - " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", - " loss.backward()\n", - " optimizer.step()\n", - "\n", - " # Predict mask tokens ans isNext\n", - " input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))\n", - " print(text)\n", - " print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])\n", - "\n", - " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", - " logits_lm = logits_lm.data.max(2)[1][0].data.numpy()\n", - " print('masked tokens list : ',[pos.item() for pos in masked_tokens[0] if pos.item() != 0])\n", - " print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])\n", - "\n", - " logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]\n", - " print('isNext : ', True if isNext else False)\n", - " print('predict isNext : ',True if logits_clsf else False)\n" - ], - "outputs": [], - "execution_count": null - } - ], - "metadata": { - "anaconda-cloud": {}, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# code by Tae Hwan Jung(Jeff Jung) @graykode\n", + "# Reference : https://github.com/jadore801120/attention-is-all-you-need-pytorch\n", + "# https://github.com/JayParks/transformer, https://github.com/dhlee347/pytorchic-bert\n", + "import math\n", + "import re\n", + "from random import *\n", + "import numpy as np\n", + "import 
torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "\n", + "# sample IsNext and NotNext to be same in small batch size\n", + "def make_batch():\n", + " batch = []\n", + " positive = negative = 0\n", + " while positive != batch_size/2 or negative != batch_size/2:\n", + " tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences\n", + " tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]\n", + " input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]\n", + " segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n", + "\n", + " # MASK LM\n", + " n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence\n", + " cand_maked_pos = [i for i, token in enumerate(input_ids)\n", + " if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]\n", + " shuffle(cand_maked_pos)\n", + " masked_tokens, masked_pos = [], []\n", + " for pos in cand_maked_pos[:n_pred]:\n", + " masked_pos.append(pos)\n", + " masked_tokens.append(input_ids[pos])\n", + " if random() < 0.8: # 80%\n", + " input_ids[pos] = word_dict['[MASK]'] # make mask\n", + " elif random() < 0.5: # 10%\n", + " index = randint(0, vocab_size - 1) # random index in vocabulary\n", + " input_ids[pos] = word_dict[number_dict[index]] # replace\n", + "\n", + " # Zero Paddings\n", + " n_pad = maxlen - len(input_ids)\n", + " input_ids.extend([0] * n_pad)\n", + " segment_ids.extend([0] * n_pad)\n", + "\n", + " # Zero Padding (100% - 15%) tokens\n", + " if max_pred > n_pred:\n", + " n_pad = max_pred - n_pred\n", + " masked_tokens.extend([0] * n_pad)\n", + " masked_pos.extend([0] * n_pad)\n", + "\n", + " if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:\n", + " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n", + " positive += 1\n", + " elif tokens_a_index + 1 != 
tokens_b_index and negative < batch_size/2:\n", + " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n", + " negative += 1\n", + " return batch\n", + "# Proprecessing Finished\n", + "\n", + "def get_attn_pad_mask(seq_q, seq_k):\n", + " batch_size, len_q = seq_q.size()\n", + " batch_size, len_k = seq_k.size()\n", + " # eq(zero) is PAD token\n", + " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), one is masking\n", + " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k\n", + "\n", + "def gelu(x):\n", + " \"Implementation of the gelu activation function by Hugging Face\"\n", + " return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))\n", + "\n", + "class Embedding(nn.Module):\n", + " def __init__(self):\n", + " super(Embedding, self).__init__()\n", + " self.tok_embed = nn.Embedding(vocab_size, d_model) # token embedding\n", + " self.pos_embed = nn.Embedding(maxlen, d_model) # position embedding\n", + " self.seg_embed = nn.Embedding(n_segments, d_model) # segment(token type) embedding\n", + " self.norm = nn.LayerNorm(d_model)\n", + "\n", + " def forward(self, x, seg):\n", + " seq_len = x.size(1)\n", + " pos = torch.arange(seq_len, dtype=torch.long)\n", + " pos = pos.unsqueeze(0).expand_as(x) # (seq_len,) -> (batch_size, seq_len)\n", + " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n", + " return self.norm(embedding)\n", + "\n", + "class ScaledDotProductAttention(nn.Module):\n", + " def __init__(self):\n", + " super(ScaledDotProductAttention, self).__init__()\n", + "\n", + " def forward(self, Q, K, V, attn_mask):\n", + " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", + " scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.\n", + " attn = nn.Softmax(dim=-1)(scores)\n", + " context = torch.matmul(attn, V)\n", + " return 
context, attn\n", + "\n", + "class MultiHeadAttention(nn.Module):\n", + " def __init__(self):\n", + " super(MultiHeadAttention, self).__init__()\n", + " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n", + " self.W_K = nn.Linear(d_model, d_k * n_heads)\n", + " self.W_V = nn.Linear(d_model, d_v * n_heads)\n", + " def forward(self, Q, K, V, attn_mask):\n", + " # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]\n", + " residual, batch_size = Q, Q.size(0)\n", + " # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)\n", + " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]\n", + " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]\n", + " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]\n", + "\n", + " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]\n", + "\n", + " # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]\n", + " context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", + " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n", + " output = nn.Linear(n_heads * d_v, d_model)(context)\n", + " return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]\n", + "\n", + "class PoswiseFeedForwardNet(nn.Module):\n", + " def __init__(self):\n", + " super(PoswiseFeedForwardNet, self).__init__()\n", + " self.fc1 = nn.Linear(d_model, d_ff)\n", + " self.fc2 = nn.Linear(d_ff, d_model)\n", + "\n", + " def forward(self, x):\n", + " # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)\n", + " return 
self.fc2(gelu(self.fc1(x)))\n", + "\n", + "class EncoderLayer(nn.Module):\n", + " def __init__(self):\n", + " super(EncoderLayer, self).__init__()\n", + " self.enc_self_attn = MultiHeadAttention()\n", + " self.pos_ffn = PoswiseFeedForwardNet()\n", + "\n", + " def forward(self, enc_inputs, enc_self_attn_mask):\n", + " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V\n", + " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n", + " return enc_outputs, attn\n", + "\n", + "class BERT(nn.Module):\n", + " def __init__(self):\n", + " super(BERT, self).__init__()\n", + " self.embedding = Embedding()\n", + " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n", + " self.fc = nn.Linear(d_model, d_model)\n", + " self.activ1 = nn.Tanh()\n", + " self.linear = nn.Linear(d_model, d_model)\n", + " self.activ2 = gelu\n", + " self.norm = nn.LayerNorm(d_model)\n", + " self.classifier = nn.Linear(d_model, 2)\n", + " # decoder is shared with embedding layer\n", + " embed_weight = self.embedding.tok_embed.weight\n", + " n_vocab, n_dim = embed_weight.size()\n", + " self.decoder = nn.Linear(n_dim, n_vocab, bias=False)\n", + " self.decoder.weight = embed_weight\n", + " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n", + "\n", + " def forward(self, input_ids, segment_ids, masked_pos):\n", + " output = self.embedding(input_ids, segment_ids)\n", + " enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n", + " for layer in self.layers:\n", + " output, enc_self_attn = layer(output, enc_self_attn_mask)\n", + " # output : [batch_size, len, d_model], attn : [batch_size, n_heads, len, len]\n", + " # the classification logits are computed from the first token (CLS)\n", + " h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]\n", + " logits_clsf = self.classifier(h_pooled) # [batch_size, 2]\n", + "\n", + " masked_pos = masked_pos[:, :, None].expand(-1, -1, 
output.size(-1)) # [batch_size, max_pred, d_model]\n", + " # get masked position from final output of transformer.\n", + " h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]\n", + " h_masked = self.norm(self.activ2(self.linear(h_masked)))\n", + " logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]\n", + "\n", + " return logits_lm, logits_clsf\n", + "\n", + "if __name__ == '__main__':\n", + " # BERT Parameters\n", + " maxlen = 30 # maximum sequence length\n", + " batch_size = 6\n", + " max_pred = 5 # max tokens of prediction\n", + " n_layers = 6 # number of Encoder Layers\n", + " n_heads = 12 # number of heads in Multi-Head Attention\n", + " d_model = 768 # Embedding Size\n", + " d_ff = 768 * 4 # 4*d_model, FeedForward dimension\n", + " d_k = d_v = 64 # dimension of K(=Q), V\n", + " n_segments = 2\n", + "\n", + " text = (\n", + " 'Hello, how are you? I am Romeo.\\n'\n", + " 'Hello, Romeo My name is Juliet. Nice to meet you.\\n'\n", + " 'Nice meet you too. How are you today?\\n'\n", + " 'Great. 
My baseball team won the competition.\\n'\n", + " 'Oh Congratulations, Juliet\\n'\n", + " 'Thanks you Romeo'\n", + " )\n", + " sentences = re.sub(\"[.,!?\\\\-]\", '', text.lower()).split('\\n') # filter '.', ',', '!', '?', '-'\n", + " word_list = list(set(\" \".join(sentences).split()))\n", + " word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}\n", + " for i, w in enumerate(word_list):\n", + " word_dict[w] = i + 4\n", + " number_dict = {i: w for i, w in enumerate(word_dict)}\n", + " vocab_size = len(word_dict)\n", + "\n", + " token_list = list()\n", + " for sentence in sentences:\n", + " arr = [word_dict[s] for s in sentence.split()]\n", + " token_list.append(arr)\n", + "\n", + " model = BERT()\n", + " criterion = nn.CrossEntropyLoss()\n", + " optimizer = optim.Adam(model.parameters(), lr=0.001)\n", + "\n", + " batch = make_batch()\n", + " input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))\n", + "\n", + " for epoch in range(100):\n", + " optimizer.zero_grad()\n", + " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", + " loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM\n", + " loss_lm = (loss_lm.float()).mean()\n", + " loss_clsf = criterion(logits_clsf, isNext) # for sentence classification\n", + " loss = loss_lm + loss_clsf\n", + " if (epoch + 1) % 10 == 0:\n", + " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " # Predict masked tokens and isNext\n", + " input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))\n", + " print(text)\n", + " print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])\n", + "\n", + " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", + " logits_lm = logits_lm.data.max(2)[1][0].data.numpy()\n", + " print('masked tokens list : ',[pos.item() for pos in masked_tokens[0] 
if pos.item() != 0])\n", + " print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])\n", + "\n", + " logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]\n", + " print('isNext : ', True if isNext else False)\n", + " print('predict isNext : ',True if logits_clsf else False)\n" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}