From e08cf123f4b63db165fc04dd854aca057dd8609d Mon Sep 17 00:00:00 2001
From: ddilbaz <ddilbaz@tenstorrent.com>
Date: Fri, 28 Feb 2025 18:07:30 +0000
Subject: [PATCH] Remove tests from latest version, rebase

---
 env/activate                                  |     3 +-
 mlir_tests/Autoencoder (linear).mlir          |    86 -
 mlir_tests/DETR.mlir                          |  3266 -----
 mlir_tests/GLPN-KITTI.mlir                    | 10953 ----------------
 mlir_tests/MLPMixer.mlir                      |  2147 ---
 mlir_tests/Mnist.mlir                         |    65 -
 mlir_tests/MobileNetSSD.mlir                  |  1643 ---
 mlir_tests/MobileNetV2.mlir                   |  1089 --
 mlir_tests/OpenPose V2.mlir                   |   867 --
 mlir_tests/Perceiver IO.mlir                  |  5604 --------
 mlir_tests/ResNet18.mlir                      |   403 -
 mlir_tests/ResNet50.mlir                      |  1011 --
 mlir_tests/SegFormer.mlir                     |  2515 ----
 mlir_tests/SqueezeBERT.mlir                   |    76 -
 mlir_tests/ViLT.mlir                          |    61 -
 mlir_tests/YOLOv3.mlir                        |  1752 ---
 mlir_tests/alibaba-damomgp-str-base.mlir      |  2556 ----
 mlir_tests/distilbert-base-uncased.mlir       |  1380 --
 .../microsoftbeit-base-patch16-224.mlir       |  2490 ----
 .../microsoftbeit-large-patch16-224.mlir      |  4866 -------
 mlir_tests/pytests/test_autoencoder_linear.py |    47 -
 .../pytests/test_beit_base_patch16_224.py     |    49 -
 .../pytests/test_beit_large_patch16_224.py    |    49 -
 mlir_tests/pytests/test_detr.py               |    47 -
 .../pytests/test_distilbert_base_uncased.py   |    47 -
 mlir_tests/pytests/test_glpn_kitti.py         |    47 -
 mlir_tests/pytests/test_mgp_str_base.py       |    49 -
 mlir_tests/pytests/test_mlpmixer.py           |    47 -
 mlir_tests/pytests/test_mnist.py              |    47 -
 mlir_tests/pytests/test_mobilenetssd.py       |    47 -
 mlir_tests/pytests/test_mobilenetv2.py        |    47 -
 mlir_tests/pytests/test_openposev2.py         |    47 -
 mlir_tests/pytests/test_perceiverio.py        |    47 -
 mlir_tests/pytests/test_resnet18.py           |    47 -
 mlir_tests/pytests/test_resnet50.py           |    47 -
 mlir_tests/pytests/test_segformer.py          |    47 -
 mlir_tests/pytests/test_squeezebert.py        |    47 -
 mlir_tests/pytests/test_vilt.py               |    47 -
 mlir_tests/pytests/test_yolov3.py             |    47 -
 tests/models/resnet/test_resnet.py            |     4 +-
 tt_torch/dynamo/executor.py                   |    15 +-
 tt_torch/dynamo/test_mlir.py                  |   170 -
 tt_torch/dynamo/torch_backend.py              |   221 +-
 43 files changed, 120 insertions(+), 44022 deletions(-)
 delete mode 100644 mlir_tests/Autoencoder (linear).mlir
 delete mode 100644 mlir_tests/DETR.mlir
 delete mode 100644 mlir_tests/GLPN-KITTI.mlir
 delete mode 100644 mlir_tests/MLPMixer.mlir
 delete mode 100644 mlir_tests/Mnist.mlir
 delete mode 100644 mlir_tests/MobileNetSSD.mlir
 delete mode 100644 mlir_tests/MobileNetV2.mlir
 delete mode 100644 mlir_tests/OpenPose V2.mlir
 delete mode 100644 mlir_tests/Perceiver IO.mlir
 delete mode 100644 mlir_tests/ResNet18.mlir
 delete mode 100644 mlir_tests/ResNet50.mlir
 delete mode 100644 mlir_tests/SegFormer.mlir
 delete mode 100644 mlir_tests/SqueezeBERT.mlir
 delete mode 100644 mlir_tests/ViLT.mlir
 delete mode 100644 mlir_tests/YOLOv3.mlir
 delete mode 100644 mlir_tests/alibaba-damomgp-str-base.mlir
 delete mode 100644 mlir_tests/distilbert-base-uncased.mlir
 delete mode 100644 mlir_tests/microsoftbeit-base-patch16-224.mlir
 delete mode 100644 mlir_tests/microsoftbeit-large-patch16-224.mlir
 delete mode 100644 mlir_tests/pytests/test_autoencoder_linear.py
 delete mode 100644 mlir_tests/pytests/test_beit_base_patch16_224.py
 delete mode 100644 mlir_tests/pytests/test_beit_large_patch16_224.py
 delete mode 100644 mlir_tests/pytests/test_detr.py
 delete mode 100644 mlir_tests/pytests/test_distilbert_base_uncased.py
 delete mode 100644 mlir_tests/pytests/test_glpn_kitti.py
 delete mode 100644 mlir_tests/pytests/test_mgp_str_base.py
 delete mode 100644 mlir_tests/pytests/test_mlpmixer.py
 delete mode 100644 mlir_tests/pytests/test_mnist.py
 delete mode 100644 mlir_tests/pytests/test_mobilenetssd.py
 delete mode 100644 mlir_tests/pytests/test_mobilenetv2.py
 delete mode 100644 mlir_tests/pytests/test_openposev2.py
 delete mode 100644 mlir_tests/pytests/test_perceiverio.py
 delete mode 100644 mlir_tests/pytests/test_resnet18.py
 delete mode 100644 mlir_tests/pytests/test_resnet50.py
 delete mode 100644 mlir_tests/pytests/test_segformer.py
 delete mode 100644 mlir_tests/pytests/test_squeezebert.py
 delete mode 100644 mlir_tests/pytests/test_vilt.py
 delete mode 100644 mlir_tests/pytests/test_yolov3.py
 delete mode 100644 tt_torch/dynamo/test_mlir.py

diff --git a/env/activate b/env/activate
index 24de56b8..6b9d6a6f 100644
--- a/env/activate
+++ b/env/activate
@@ -28,6 +28,7 @@ else
   pip install --upgrade pip
 
   python3.11 -m pip install -r requirements.txt
+
   mkdir -p $TT_TORCH_HOME/dist
   if [ ! -f $TT_TORCH_HOME/dist/torchvision*.whl ]; then
     cd $TT_TORCH_HOME/third_party
@@ -39,11 +40,9 @@ else
 
     cp dist/*.whl $TT_TORCH_HOME/dist
     cd $TT_TORCH_HOME
-
   fi
   pip install $TT_TORCH_HOME/dist/torchvision*.whl
   pip install stablehlo -f https://github.com/openxla/stablehlo/releases/expanded_assets/dev-wheels
-
 fi
 export TTTORCH_ENV_ACTIVATED=1
 export TTMLIR_ENV_ACTIVATED=1
diff --git a/mlir_tests/Autoencoder (linear).mlir b/mlir_tests/Autoencoder (linear).mlir
deleted file mode 100644
index 4acdcd97..00000000
--- a/mlir_tests/Autoencoder (linear).mlir	
+++ /dev/null
@@ -1,86 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x784xbf16>, %arg1: tensor<784x128xf32>, %arg2: tensor<128xf32>, %arg3: tensor<128x64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64x12xf32>, %arg6: tensor<12xf32>, %arg7: tensor<12x3xf32>, %arg8: tensor<3xf32>, %arg9: tensor<3x12xf32>, %arg10: tensor<12xf32>, %arg11: tensor<12x64xf32>, %arg12: tensor<64xf32>, %arg13: tensor<64x128xf32>, %arg14: tensor<128xf32>, %arg15: tensor<128x784xf32>, %arg16: tensor<784xf32>) -> tensor<1x784xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<1x128xbf16>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<1x64xbf16>
-    %cst_1 = stablehlo.constant dense<0.000000e+00> : tensor<1x12xbf16>
-    %cst_2 = arith.constant dense<1> : tensor<1xi64>
-    %0 = stablehlo.convert %arg0 : (tensor<1x784xbf16>) -> tensor<1x784xf32>
-    %1 = stablehlo.dot_general %0, %arg1, contracting_dims = [1] x [0] : (tensor<1x784xf32>, tensor<784x128xf32>) -> tensor<1x128xf32>
-    %2 = stablehlo.convert %cst_2 : (tensor<1xi64>) -> tensor<1xf32>
-    %3 = stablehlo.reshape %2 : (tensor<1xf32>) -> tensor<f32>
-    %4 = stablehlo.broadcast_in_dim %1, dims = [0, 1] : (tensor<1x128xf32>) -> tensor<1x128xf32>
-    %5 = stablehlo.broadcast_in_dim %3, dims = [] : (tensor<f32>) -> tensor<1x128xf32>
-    %6 = stablehlo.multiply %4, %5 : tensor<1x128xf32>
-    %7 = stablehlo.broadcast_in_dim %6, dims = [0, 1] : (tensor<1x128xf32>) -> tensor<1x128xf32>
-    %8 = stablehlo.broadcast_in_dim %arg2, dims = [1] : (tensor<128xf32>) -> tensor<1x128xf32>
-    %9 = stablehlo.add %7, %8 : tensor<1x128xf32>
-    %10 = stablehlo.convert %9 : (tensor<1x128xf32>) -> tensor<1x128xbf16>
-    %11 = stablehlo.maximum %10, %cst : tensor<1x128xbf16>
-    %12 = stablehlo.convert %11 : (tensor<1x128xbf16>) -> tensor<1x128xf32>
-    %13 = stablehlo.dot_general %12, %arg3, contracting_dims = [1] x [0] : (tensor<1x128xf32>, tensor<128x64xf32>) -> tensor<1x64xf32>
-    %14 = stablehlo.broadcast_in_dim %13, dims = [0, 1] : (tensor<1x64xf32>) -> tensor<1x64xf32>
-    %15 = stablehlo.broadcast_in_dim %3, dims = [] : (tensor<f32>) -> tensor<1x64xf32>
-    %16 = stablehlo.multiply %14, %15 : tensor<1x64xf32>
-    %17 = stablehlo.broadcast_in_dim %16, dims = [0, 1] : (tensor<1x64xf32>) -> tensor<1x64xf32>
-    %18 = stablehlo.broadcast_in_dim %arg4, dims = [1] : (tensor<64xf32>) -> tensor<1x64xf32>
-    %19 = stablehlo.add %17, %18 : tensor<1x64xf32>
-    %20 = stablehlo.convert %19 : (tensor<1x64xf32>) -> tensor<1x64xbf16>
-    %21 = stablehlo.maximum %20, %cst_0 : tensor<1x64xbf16>
-    %22 = stablehlo.convert %21 : (tensor<1x64xbf16>) -> tensor<1x64xf32>
-    %23 = stablehlo.dot_general %22, %arg5, contracting_dims = [1] x [0] : (tensor<1x64xf32>, tensor<64x12xf32>) -> tensor<1x12xf32>
-    %24 = stablehlo.broadcast_in_dim %23, dims = [0, 1] : (tensor<1x12xf32>) -> tensor<1x12xf32>
-    %25 = stablehlo.broadcast_in_dim %3, dims = [] : (tensor<f32>) -> tensor<1x12xf32>
-    %26 = stablehlo.multiply %24, %25 : tensor<1x12xf32>
-    %27 = stablehlo.broadcast_in_dim %26, dims = [0, 1] : (tensor<1x12xf32>) -> tensor<1x12xf32>
-    %28 = stablehlo.broadcast_in_dim %arg6, dims = [1] : (tensor<12xf32>) -> tensor<1x12xf32>
-    %29 = stablehlo.add %27, %28 : tensor<1x12xf32>
-    %30 = stablehlo.convert %29 : (tensor<1x12xf32>) -> tensor<1x12xbf16>
-    %31 = stablehlo.maximum %30, %cst_1 : tensor<1x12xbf16>
-    %32 = stablehlo.convert %31 : (tensor<1x12xbf16>) -> tensor<1x12xf32>
-    %33 = stablehlo.dot_general %32, %arg7, contracting_dims = [1] x [0] : (tensor<1x12xf32>, tensor<12x3xf32>) -> tensor<1x3xf32>
-    %34 = stablehlo.broadcast_in_dim %33, dims = [0, 1] : (tensor<1x3xf32>) -> tensor<1x3xf32>
-    %35 = stablehlo.broadcast_in_dim %3, dims = [] : (tensor<f32>) -> tensor<1x3xf32>
-    %36 = stablehlo.multiply %34, %35 : tensor<1x3xf32>
-    %37 = stablehlo.broadcast_in_dim %36, dims = [0, 1] : (tensor<1x3xf32>) -> tensor<1x3xf32>
-    %38 = stablehlo.broadcast_in_dim %arg8, dims = [1] : (tensor<3xf32>) -> tensor<1x3xf32>
-    %39 = stablehlo.add %37, %38 : tensor<1x3xf32>
-    %40 = stablehlo.convert %39 : (tensor<1x3xf32>) -> tensor<1x3xbf16>
-    %41 = stablehlo.convert %40 : (tensor<1x3xbf16>) -> tensor<1x3xf32>
-    %42 = stablehlo.dot_general %41, %arg9, contracting_dims = [1] x [0] : (tensor<1x3xf32>, tensor<3x12xf32>) -> tensor<1x12xf32>
-    %43 = stablehlo.broadcast_in_dim %42, dims = [0, 1] : (tensor<1x12xf32>) -> tensor<1x12xf32>
-    %44 = stablehlo.multiply %43, %25 : tensor<1x12xf32>
-    %45 = stablehlo.broadcast_in_dim %44, dims = [0, 1] : (tensor<1x12xf32>) -> tensor<1x12xf32>
-    %46 = stablehlo.broadcast_in_dim %arg10, dims = [1] : (tensor<12xf32>) -> tensor<1x12xf32>
-    %47 = stablehlo.add %45, %46 : tensor<1x12xf32>
-    %48 = stablehlo.convert %47 : (tensor<1x12xf32>) -> tensor<1x12xbf16>
-    %49 = stablehlo.maximum %48, %cst_1 : tensor<1x12xbf16>
-    %50 = stablehlo.convert %49 : (tensor<1x12xbf16>) -> tensor<1x12xf32>
-    %51 = stablehlo.dot_general %50, %arg11, contracting_dims = [1] x [0] : (tensor<1x12xf32>, tensor<12x64xf32>) -> tensor<1x64xf32>
-    %52 = stablehlo.broadcast_in_dim %51, dims = [0, 1] : (tensor<1x64xf32>) -> tensor<1x64xf32>
-    %53 = stablehlo.multiply %52, %15 : tensor<1x64xf32>
-    %54 = stablehlo.broadcast_in_dim %53, dims = [0, 1] : (tensor<1x64xf32>) -> tensor<1x64xf32>
-    %55 = stablehlo.broadcast_in_dim %arg12, dims = [1] : (tensor<64xf32>) -> tensor<1x64xf32>
-    %56 = stablehlo.add %54, %55 : tensor<1x64xf32>
-    %57 = stablehlo.convert %56 : (tensor<1x64xf32>) -> tensor<1x64xbf16>
-    %58 = stablehlo.maximum %57, %cst_0 : tensor<1x64xbf16>
-    %59 = stablehlo.convert %58 : (tensor<1x64xbf16>) -> tensor<1x64xf32>
-    %60 = stablehlo.dot_general %59, %arg13, contracting_dims = [1] x [0] : (tensor<1x64xf32>, tensor<64x128xf32>) -> tensor<1x128xf32>
-    %61 = stablehlo.broadcast_in_dim %60, dims = [0, 1] : (tensor<1x128xf32>) -> tensor<1x128xf32>
-    %62 = stablehlo.multiply %61, %5 : tensor<1x128xf32>
-    %63 = stablehlo.broadcast_in_dim %62, dims = [0, 1] : (tensor<1x128xf32>) -> tensor<1x128xf32>
-    %64 = stablehlo.broadcast_in_dim %arg14, dims = [1] : (tensor<128xf32>) -> tensor<1x128xf32>
-    %65 = stablehlo.add %63, %64 : tensor<1x128xf32>
-    %66 = stablehlo.convert %65 : (tensor<1x128xf32>) -> tensor<1x128xbf16>
-    %67 = stablehlo.maximum %66, %cst : tensor<1x128xbf16>
-    %68 = stablehlo.convert %67 : (tensor<1x128xbf16>) -> tensor<1x128xf32>
-    %69 = stablehlo.dot_general %68, %arg15, contracting_dims = [1] x [0] : (tensor<1x128xf32>, tensor<128x784xf32>) -> tensor<1x784xf32>
-    %70 = stablehlo.broadcast_in_dim %69, dims = [0, 1] : (tensor<1x784xf32>) -> tensor<1x784xf32>
-    %71 = stablehlo.broadcast_in_dim %3, dims = [] : (tensor<f32>) -> tensor<1x784xf32>
-    %72 = stablehlo.multiply %70, %71 : tensor<1x784xf32>
-    %73 = stablehlo.broadcast_in_dim %72, dims = [0, 1] : (tensor<1x784xf32>) -> tensor<1x784xf32>
-    %74 = stablehlo.broadcast_in_dim %arg16, dims = [1] : (tensor<784xf32>) -> tensor<1x784xf32>
-    %75 = stablehlo.add %73, %74 : tensor<1x784xf32>
-    %76 = stablehlo.convert %75 : (tensor<1x784xf32>) -> tensor<1x784xbf16>
-    return %76 : tensor<1x784xbf16>
-  }
-}
diff --git a/mlir_tests/DETR.mlir b/mlir_tests/DETR.mlir
deleted file mode 100644
index 0cf6070b..00000000
--- a/mlir_tests/DETR.mlir
+++ /dev/null
@@ -1,3266 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x720x1280xbf16>, %arg1: tensor<64x3x7x7xbf16>, %arg2: tensor<64x64x1x1xbf16>, %arg3: tensor<64x64x3x3xbf16>, %arg4: tensor<256x64x1x1xbf16>, %arg5: tensor<256x64x1x1xbf16>, %arg6: tensor<64x256x1x1xbf16>, %arg7: tensor<64x64x3x3xbf16>, %arg8: tensor<256x64x1x1xbf16>, %arg9: tensor<64x256x1x1xbf16>, %arg10: tensor<64x64x3x3xbf16>, %arg11: tensor<256x64x1x1xbf16>, %arg12: tensor<128x256x1x1xbf16>, %arg13: tensor<128x128x3x3xbf16>, %arg14: tensor<512x128x1x1xbf16>, %arg15: tensor<512x256x1x1xbf16>, %arg16: tensor<128x512x1x1xbf16>, %arg17: tensor<128x128x3x3xbf16>, %arg18: tensor<512x128x1x1xbf16>, %arg19: tensor<128x512x1x1xbf16>, %arg20: tensor<128x128x3x3xbf16>, %arg21: tensor<512x128x1x1xbf16>, %arg22: tensor<128x512x1x1xbf16>, %arg23: tensor<128x128x3x3xbf16>, %arg24: tensor<512x128x1x1xbf16>, %arg25: tensor<256x512x1x1xbf16>, %arg26: tensor<256x256x3x3xbf16>, %arg27: tensor<1024x256x1x1xbf16>, %arg28: tensor<1024x512x1x1xbf16>, %arg29: tensor<256x1024x1x1xbf16>, %arg30: tensor<256x256x3x3xbf16>, %arg31: tensor<1024x256x1x1xbf16>, %arg32: tensor<256x1024x1x1xbf16>, %arg33: tensor<256x256x3x3xbf16>, %arg34: tensor<1024x256x1x1xbf16>, %arg35: tensor<256x1024x1x1xbf16>, %arg36: tensor<256x256x3x3xbf16>, %arg37: tensor<1024x256x1x1xbf16>, %arg38: tensor<256x1024x1x1xbf16>, %arg39: tensor<256x256x3x3xbf16>, %arg40: tensor<1024x256x1x1xbf16>, %arg41: tensor<256x1024x1x1xbf16>, %arg42: tensor<256x256x3x3xbf16>, %arg43: tensor<1024x256x1x1xbf16>, %arg44: tensor<512x1024x1x1xbf16>, %arg45: tensor<512x512x3x3xbf16>, %arg46: tensor<2048x512x1x1xbf16>, %arg47: tensor<2048x1024x1x1xbf16>, %arg48: tensor<512x2048x1x1xbf16>, %arg49: tensor<512x512x3x3xbf16>, %arg50: tensor<2048x512x1x1xbf16>, %arg51: tensor<512x2048x1x1xbf16>, %arg52: tensor<512x512x3x3xbf16>, %arg53: tensor<2048x512x1x1xbf16>, %arg54: tensor<256x2048x1x1xbf16>, %arg55: tensor<256xbf16>, %arg56: tensor<256xbf16>, %arg57: tensor<256xbf16>, %arg58: tensor<256xbf16>, %arg59: tensor<256xbf16>, %arg60: tensor<256xbf16>, %arg61: tensor<256xbf16>, %arg62: tensor<256xbf16>, %arg63: tensor<256xbf16>, %arg64: tensor<256xbf16>, %arg65: tensor<256xbf16>, %arg66: tensor<256xbf16>, %arg67: tensor<256xbf16>, %arg68: tensor<256xbf16>, %arg69: tensor<256xbf16>, %arg70: tensor<256xbf16>, %arg71: tensor<256xbf16>, %arg72: tensor<256xbf16>, %arg73: tensor<256xbf16>, %arg74: tensor<256xbf16>, %arg75: tensor<256xbf16>, %arg76: tensor<256xbf16>, %arg77: tensor<256xbf16>, %arg78: tensor<256xbf16>, %arg79: tensor<256xbf16>, %arg80: tensor<256xbf16>, %arg81: tensor<256xbf16>, %arg82: tensor<256xbf16>, %arg83: tensor<256xbf16>, %arg84: tensor<256xbf16>, %arg85: tensor<256xbf16>, %arg86: tensor<256xbf16>, %arg87: tensor<256xbf16>, %arg88: tensor<256xbf16>, %arg89: tensor<256xbf16>, %arg90: tensor<256xbf16>, %arg91: tensor<256xbf16>, %arg92: tensor<256xbf16>, %arg93: tensor<256xbf16>, %arg94: tensor<256xbf16>, %arg95: tensor<256xbf16>, %arg96: tensor<256xbf16>, %arg97: tensor<256xbf16>, %arg98: tensor<256xbf16>, %arg99: tensor<256xbf16>, %arg100: tensor<256xbf16>, %arg101: tensor<256xbf16>, %arg102: tensor<256xbf16>, %arg103: tensor<256xbf16>, %arg104: tensor<256xbf16>, %arg105: tensor<256xbf16>, %arg106: tensor<256xbf16>, %arg107: tensor<256xbf16>, %arg108: tensor<256xbf16>, %arg109: tensor<256xbf16>, %arg110: tensor<256xbf16>, %arg111: tensor<256xbf16>, %arg112: tensor<256xbf16>, %arg113: tensor<256xbf16>, %arg114: tensor<256xbf16>, %arg115: tensor<256xbf16>, %arg116: tensor<92xbf16>, %arg117: tensor<256xbf16>, %arg118: tensor<256xbf16>, %arg119: tensor<4xbf16>, %arg120: tensor<3x720x1280xbf16>, %arg121: tensor<1x3x720x1280xbf16>, %arg122: tensor<1x64x1x1xbf16>, %arg123: tensor<1x64x1x1xbf16>, %arg124: tensor<1x64x1x1xbf16>, %arg125: tensor<1x64x1x1xbf16>, %arg126: tensor<1x64x1x1xbf16>, %arg127: tensor<1x64x1x1xbf16>, %arg128: tensor<1x256x1x1xbf16>, %arg129: tensor<1x256x1x1xbf16>, %arg130: tensor<1x256x1x1xbf16>, %arg131: tensor<1x256x1x1xbf16>, %arg132: tensor<1x64x1x1xbf16>, %arg133: tensor<1x64x1x1xbf16>, %arg134: tensor<1x64x1x1xbf16>, %arg135: tensor<1x64x1x1xbf16>, %arg136: tensor<1x256x1x1xbf16>, %arg137: tensor<1x256x1x1xbf16>, %arg138: tensor<1x64x1x1xbf16>, %arg139: tensor<1x64x1x1xbf16>, %arg140: tensor<1x64x1x1xbf16>, %arg141: tensor<1x64x1x1xbf16>, %arg142: tensor<1x256x1x1xbf16>, %arg143: tensor<1x256x1x1xbf16>, %arg144: tensor<1x128x1x1xbf16>, %arg145: tensor<1x128x1x1xbf16>, %arg146: tensor<1x128x1x1xbf16>, %arg147: tensor<1x128x1x1xbf16>, %arg148: tensor<1x512x1x1xbf16>, %arg149: tensor<1x512x1x1xbf16>, %arg150: tensor<1x512x1x1xbf16>, %arg151: tensor<1x512x1x1xbf16>, %arg152: tensor<1x128x1x1xbf16>, %arg153: tensor<1x128x1x1xbf16>, %arg154: tensor<1x128x1x1xbf16>, %arg155: tensor<1x128x1x1xbf16>, %arg156: tensor<1x512x1x1xbf16>, %arg157: tensor<1x512x1x1xbf16>, %arg158: tensor<1x128x1x1xbf16>, %arg159: tensor<1x128x1x1xbf16>, %arg160: tensor<1x128x1x1xbf16>, %arg161: tensor<1x128x1x1xbf16>, %arg162: tensor<1x512x1x1xbf16>, %arg163: tensor<1x512x1x1xbf16>, %arg164: tensor<1x128x1x1xbf16>, %arg165: tensor<1x128x1x1xbf16>, %arg166: tensor<1x128x1x1xbf16>, %arg167: tensor<1x128x1x1xbf16>, %arg168: tensor<1x512x1x1xbf16>, %arg169: tensor<1x512x1x1xbf16>, %arg170: tensor<1x256x1x1xbf16>, %arg171: tensor<1x256x1x1xbf16>, %arg172: tensor<1x256x1x1xbf16>, %arg173: tensor<1x256x1x1xbf16>, %arg174: tensor<1x1024x1x1xbf16>, %arg175: tensor<1x1024x1x1xbf16>, %arg176: tensor<1x1024x1x1xbf16>, %arg177: tensor<1x1024x1x1xbf16>, %arg178: tensor<1x256x1x1xbf16>, %arg179: tensor<1x256x1x1xbf16>, %arg180: tensor<1x256x1x1xbf16>, %arg181: tensor<1x256x1x1xbf16>, %arg182: tensor<1x1024x1x1xbf16>, %arg183: tensor<1x1024x1x1xbf16>, %arg184: tensor<1x256x1x1xbf16>, %arg185: tensor<1x256x1x1xbf16>, %arg186: tensor<1x256x1x1xbf16>, %arg187: tensor<1x256x1x1xbf16>, %arg188: tensor<1x1024x1x1xbf16>, %arg189: tensor<1x1024x1x1xbf16>, %arg190: tensor<1x256x1x1xbf16>, %arg191: tensor<1x256x1x1xbf16>, %arg192: tensor<1x256x1x1xbf16>, %arg193: tensor<1x256x1x1xbf16>, %arg194: tensor<1x1024x1x1xbf16>, %arg195: tensor<1x1024x1x1xbf16>, %arg196: tensor<1x256x1x1xbf16>, %arg197: tensor<1x256x1x1xbf16>, %arg198: tensor<1x256x1x1xbf16>, %arg199: tensor<1x256x1x1xbf16>, %arg200: tensor<1x1024x1x1xbf16>, %arg201: tensor<1x1024x1x1xbf16>, %arg202: tensor<1x256x1x1xbf16>, %arg203: tensor<1x256x1x1xbf16>, %arg204: tensor<1x256x1x1xbf16>, %arg205: tensor<1x256x1x1xbf16>, %arg206: tensor<1x1024x1x1xbf16>, %arg207: tensor<1x1024x1x1xbf16>, %arg208: tensor<1x512x1x1xbf16>, %arg209: tensor<1x512x1x1xbf16>, %arg210: tensor<1x512x1x1xbf16>, %arg211: tensor<1x512x1x1xbf16>, %arg212: tensor<1x2048x1x1xbf16>, %arg213: tensor<1x2048x1x1xbf16>, %arg214: tensor<1x2048x1x1xbf16>, %arg215: tensor<1x2048x1x1xbf16>, %arg216: tensor<1x512x1x1xbf16>, %arg217: tensor<1x512x1x1xbf16>, %arg218: tensor<1x512x1x1xbf16>, %arg219: tensor<1x512x1x1xbf16>, %arg220: tensor<1x2048x1x1xbf16>, %arg221: tensor<1x2048x1x1xbf16>, %arg222: tensor<1x512x1x1xbf16>, %arg223: tensor<1x512x1x1xbf16>, %arg224: tensor<1x512x1x1xbf16>, %arg225: tensor<1x512x1x1xbf16>, %arg226: tensor<1x2048x1x1xbf16>, %arg227: tensor<1x2048x1x1xbf16>, %arg228: tensor<920x1x256xbf16>, %arg229: tensor<256x256xbf16>, %arg230: tensor<256xbf16>, %arg231: tensor<256x256xbf16>, %arg232: tensor<256xbf16>, %arg233: tensor<256x256xbf16>, %arg234: tensor<256xbf16>, %arg235: tensor<8x1x920xbf16>, %arg236: tensor<256x256xf32>, %arg237: tensor<256xf32>, %arg238: tensor<256x2048xf32>, %arg239: tensor<2048xf32>, %arg240: tensor<2048x256xf32>, %arg241: tensor<256xf32>, %arg242: tensor<256x256xf32>, %arg243: tensor<256xf32>, %arg244: tensor<256x256xf32>, %arg245: tensor<256xf32>, %arg246: tensor<256x256xf32>, %arg247: tensor<256xf32>, %arg248: tensor<8x1x920xbf16>, %arg249: tensor<256x256xf32>, %arg250: tensor<256xf32>, %arg251: tensor<256x2048xf32>, %arg252: tensor<2048xf32>, %arg253: tensor<2048x256xf32>, %arg254: tensor<256xf32>, %arg255: tensor<256x256xf32>, %arg256: tensor<256xf32>, %arg257: tensor<256x256xf32>, %arg258: tensor<256xf32>, %arg259: tensor<256x256xf32>, %arg260: tensor<256xf32>, %arg261: tensor<8x1x920xbf16>, %arg262: tensor<256x256xf32>, %arg263: tensor<256xf32>, %arg264: tensor<256x2048xf32>, %arg265: tensor<2048xf32>, %arg266: tensor<2048x256xf32>, %arg267: tensor<256xf32>, %arg268: tensor<256x256xf32>, %arg269: tensor<256xf32>, %arg270: tensor<256x256xf32>, %arg271: tensor<256xf32>, %arg272: tensor<256x256xf32>, %arg273: tensor<256xf32>, %arg274: tensor<8x1x920xbf16>, %arg275: tensor<256x256xf32>, %arg276: tensor<256xf32>, %arg277: tensor<256x2048xf32>, %arg278: tensor<2048xf32>, %arg279: tensor<2048x256xf32>, %arg280: tensor<256xf32>, %arg281: tensor<256x256xf32>, %arg282: tensor<256xf32>, %arg283: tensor<256x256xf32>, %arg284: tensor<256xf32>, %arg285: tensor<256x256xf32>, %arg286: tensor<256xf32>, %arg287: tensor<8x1x920xbf16>, %arg288: tensor<256x256xf32>, %arg289: tensor<256xf32>, %arg290: tensor<256x2048xf32>, %arg291: tensor<2048xf32>, %arg292: tensor<2048x256xf32>, %arg293: tensor<256xf32>, %arg294: tensor<256x256xf32>, %arg295: tensor<256xf32>, %arg296: tensor<256x256xf32>, %arg297: tensor<256xf32>, %arg298: tensor<256x256xf32>, %arg299: tensor<256xf32>, %arg300: tensor<8x1x920xbf16>, %arg301: tensor<256x256xf32>, %arg302: tensor<256xf32>, %arg303: tensor<256x2048xf32>, %arg304: tensor<2048xf32>, %arg305: tensor<2048x256xf32>, %arg306: tensor<256xf32>, %arg307: tensor<256x256xf32>, %arg308: tensor<256xf32>, %arg309: tensor<256x256xf32>, %arg310: tensor<256xf32>, %arg311: tensor<8x1x920xbf16>, %arg312: tensor<8x100x32xbf16>, %arg313: tensor<256x256xf32>, %arg314: tensor<256xf32>, %arg315: tensor<100x1x256xbf16>, %arg316: tensor<256x2048xf32>, %arg317: tensor<2048xf32>, %arg318: tensor<2048x256xf32>, %arg319: tensor<256xf32>, %arg320: tensor<100x1x256xbf16>, %arg321: tensor<256x256xf32>, %arg322: tensor<256xf32>, %arg323: tensor<256x256xf32>, %arg324: tensor<256xf32>, %arg325: tensor<256x256xf32>, %arg326: tensor<256xf32>, %arg327: tensor<256x256xf32>, %arg328: tensor<256xf32>, %arg329: tensor<256x256xf32>, %arg330: tensor<256xf32>, %arg331: tensor<256x256xf32>, %arg332: tensor<256xf32>, %arg333: tensor<256x256xf32>, %arg334: tensor<256xf32>, %arg335: tensor<8x1x920xbf16>, %arg336: tensor<256x256xf32>, %arg337: tensor<256xf32>, %arg338: tensor<256x2048xf32>, %arg339: tensor<2048xf32>, %arg340: tensor<2048x256xf32>, %arg341: tensor<256xf32>, %arg342: tensor<256x256xf32>, %arg343: tensor<256xf32>, %arg344: tensor<256x256xf32>, %arg345: tensor<256xf32>, %arg346: tensor<256x256xf32>, %arg347: tensor<256xf32>, %arg348: tensor<256x256xf32>, %arg349: tensor<256xf32>, %arg350: tensor<256x256xf32>, %arg351: tensor<256xf32>, %arg352: tensor<256x256xf32>, %arg353: tensor<256xf32>, %arg354: tensor<256x256xf32>, %arg355: tensor<256xf32>, %arg356: tensor<8x1x920xbf16>, %arg357: tensor<256x256xf32>, %arg358: tensor<256xf32>, %arg359: tensor<256x2048xf32>, %arg360: tensor<2048xf32>, %arg361: tensor<2048x256xf32>, %arg362: tensor<256xf32>, %arg363: tensor<256x256xf32>, %arg364: tensor<256xf32>, %arg365: tensor<256x256xf32>, %arg366: tensor<256xf32>, %arg367: tensor<256x256xf32>, %arg368: tensor<256xf32>, %arg369: tensor<256x256xf32>, %arg370: tensor<256xf32>, %arg371: tensor<256x256xf32>, %arg372: tensor<256xf32>, %arg373: tensor<256x256xf32>, %arg374: tensor<256xf32>, %arg375: tensor<256x256xf32>, %arg376: tensor<256xf32>, %arg377: tensor<8x1x920xbf16>, %arg378: tensor<256x256xf32>, %arg379: tensor<256xf32>, %arg380: tensor<256x2048xf32>, %arg381: tensor<2048xf32>, %arg382: tensor<2048x256xf32>, %arg383: tensor<256xf32>, %arg384: tensor<256x256xf32>, %arg385: tensor<256xf32>, %arg386: tensor<256x256xf32>, %arg387: tensor<256xf32>, %arg388: tensor<256x256xf32>, %arg389: tensor<256xf32>, %arg390: tensor<256x256xf32>, %arg391: tensor<256xf32>, %arg392: tensor<256x256xf32>, %arg393: tensor<256xf32>, %arg394: tensor<256x256xf32>, %arg395: tensor<256xf32>, %arg396: tensor<256x256xf32>, %arg397: tensor<256xf32>, %arg398: tensor<8x1x920xbf16>, %arg399: tensor<256x256xf32>, %arg400: tensor<256xf32>, %arg401: tensor<256x2048xf32>, %arg402: tensor<2048xf32>, %arg403: tensor<2048x256xf32>, %arg404: tensor<256xf32>, %arg405: tensor<256x256xf32>, %arg406: tensor<256xf32>, %arg407: tensor<256x256xf32>, %arg408: tensor<256xf32>, %arg409: tensor<256x256xf32>, %arg410: tensor<256xf32>, %arg411: tensor<256x256xf32>, %arg412: tensor<256xf32>, %arg413: tensor<256x256xf32>, %arg414: tensor<256xf32>, %arg415: tensor<256x256xf32>, %arg416: tensor<256xf32>, %arg417: tensor<256x256xf32>, %arg418: tensor<256xf32>, %arg419: tensor<8x1x920xbf16>, %arg420: tensor<256x256xf32>, %arg421: tensor<256xf32>, %arg422: tensor<256x2048xf32>, %arg423: tensor<2048xf32>, %arg424: tensor<2048x256xf32>, %arg425: tensor<256xf32>, %arg426: tensor<256x92xbf16>, %arg427: tensor<256x256xbf16>, %arg428: tensor<256x256xbf16>, %arg429: tensor<256x4xbf16>) -> (tensor<1x100x92xbf16>, tensor<1x100x4xbf16>) {
-    %c = stablehlo.constant dense<0> : tensor<1x1xi64>
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<1x64x360x640xbf16>
-    %cst_0 = stablehlo.constant dense<0xFF80> : tensor<bf16>
-    %cst_1 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x180x320xbf16>
-    %cst_2 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x180x320xbf16>
-    %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor<1x128x180x320xbf16>
-    %cst_4 = stablehlo.constant dense<0.000000e+00> : tensor<1x128x90x160xbf16>
-    %cst_5 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x90x160xbf16>
-    %cst_6 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x90x160xbf16>
-    %cst_7 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x45x80xbf16>
-    %cst_8 = stablehlo.constant dense<0.000000e+00> : tensor<1x1024x45x80xbf16>
-    %cst_9 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x45x80xbf16>
-    %cst_10 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x23x40xbf16>
-    %cst_11 = stablehlo.constant dense<0.000000e+00> : tensor<1x2048x23x40xbf16>
-    %cst_12 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_13 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_14 = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_15 = stablehlo.constant dense<0.000000e+00> : tensor<920x1x2048xbf16>
-    %cst_16 = stablehlo.constant dense<0.000000e+00> : tensor<100x1x2048xbf16>
-    %cst_17 = stablehlo.constant dense<0.000000e+00> : tensor<6x1x100x256xbf16>
-    %cst_18 = arith.constant dense<0.17677669529663689> : tensor<1xf64>
-    %cst_19 = arith.constant dense<1> : tensor<1xi64>
-    %cst_20 = arith.constant dense<256> : tensor<1xi64>
-    %cst_21 = arith.constant dense<1.000000e-05> : tensor<1xf64>
-    %0 = stablehlo.reshape %arg0 : (tensor<1x3x720x1280xbf16>) -> tensor<3x720x1280xbf16>
-    %1 = stablehlo.reshape %0 : (tensor<3x720x1280xbf16>) -> tensor<1x3x720x1280xbf16>
-    %2 = "stablehlo.scatter"(%arg121, %c, %1) <{indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [1, 2, 3], inserted_window_dims = [0], scatter_dims_to_operand_dims = [0], index_vector_dim = 1>, unique_indices = false}> ({
-    ^bb0(%arg430: tensor<bf16>, %arg431: tensor<bf16>):
-      stablehlo.return %arg431 : tensor<bf16>
-    }) : (tensor<1x3x720x1280xbf16>, tensor<1x1xi64>, tensor<1x3x720x1280xbf16>) -> tensor<1x3x720x1280xbf16>
-    %3 = stablehlo.convolution(%2, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x720x1280xbf16>, tensor<64x3x7x7xbf16>) -> tensor<1x64x360x640xbf16>
-    %4 = stablehlo.broadcast_in_dim %3, dims = [0, 1, 2, 3] : (tensor<1x64x360x640xbf16>) -> tensor<1x64x360x640xbf16>
-    %5 = stablehlo.broadcast_in_dim %arg122, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x360x640xbf16>
-    %6 = stablehlo.multiply %4, %5 : tensor<1x64x360x640xbf16>
-    %7 = stablehlo.broadcast_in_dim %6, dims = [0, 1, 2, 3] : (tensor<1x64x360x640xbf16>) -> tensor<1x64x360x640xbf16>
-    %8 = stablehlo.broadcast_in_dim %arg123, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x360x640xbf16>
-    %9 = stablehlo.add %7, %8 : tensor<1x64x360x640xbf16>
-    %10 = stablehlo.maximum %9, %cst : tensor<1x64x360x640xbf16>
-    %11 = "stablehlo.reduce_window"(%10, %cst_0) <{padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = array<i64: 1, 1, 1, 1>, window_dimensions = array<i64: 1, 1, 3, 3>, window_strides = array<i64: 1, 1, 2, 2>}> ({
-    ^bb0(%arg430: tensor<bf16>, %arg431: tensor<bf16>):
-      %3230 = stablehlo.maximum %arg430, %arg431 : tensor<bf16>
-      stablehlo.return %3230 : tensor<bf16>
-    }) : (tensor<1x64x360x640xbf16>, tensor<bf16>) -> tensor<1x64x180x320xbf16>
-    %12 = stablehlo.convolution(%11, %arg2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<64x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %13 = stablehlo.broadcast_in_dim %12, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %14 = stablehlo.broadcast_in_dim %arg124, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %15 = stablehlo.multiply %13, %14 : tensor<1x64x180x320xbf16>
-    %16 = stablehlo.broadcast_in_dim %15, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %17 = stablehlo.broadcast_in_dim %arg125, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %18 = stablehlo.add %16, %17 : tensor<1x64x180x320xbf16>
-    %19 = stablehlo.maximum %18, %cst_1 : tensor<1x64x180x320xbf16>
-    %20 = stablehlo.convolution(%19, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x180x320xbf16>
-    %21 = stablehlo.broadcast_in_dim %20, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %22 = stablehlo.broadcast_in_dim %arg126, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %23 = stablehlo.multiply %21, %22 : tensor<1x64x180x320xbf16>
-    %24 = stablehlo.broadcast_in_dim %23, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %25 = stablehlo.broadcast_in_dim %arg127, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %26 = stablehlo.add %24, %25 : tensor<1x64x180x320xbf16>
-    %27 = stablehlo.maximum %26, %cst_1 : tensor<1x64x180x320xbf16>
-    %28 = stablehlo.convolution(%27, %arg4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %29 = stablehlo.broadcast_in_dim %28, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %30 = stablehlo.broadcast_in_dim %arg128, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %31 = stablehlo.multiply %29, %30 : tensor<1x256x180x320xbf16>
-    %32 = stablehlo.broadcast_in_dim %31, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %33 = stablehlo.broadcast_in_dim %arg129, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %34 = stablehlo.add %32, %33 : tensor<1x256x180x320xbf16>
-    %35 = stablehlo.convolution(%11, %arg5) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %36 = stablehlo.broadcast_in_dim %35, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %37 = stablehlo.broadcast_in_dim %arg130, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %38 = stablehlo.multiply %36, %37 : tensor<1x256x180x320xbf16>
-    %39 = stablehlo.broadcast_in_dim %38, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %40 = stablehlo.broadcast_in_dim %arg131, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %41 = stablehlo.add %39, %40 : tensor<1x256x180x320xbf16>
-    %42 = stablehlo.add %34, %41 : tensor<1x256x180x320xbf16>
-    %43 = stablehlo.maximum %42, %cst_2 : tensor<1x256x180x320xbf16>
-    %44 = stablehlo.convolution(%43, %arg6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x180x320xbf16>, tensor<64x256x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %45 = stablehlo.broadcast_in_dim %44, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %46 = stablehlo.broadcast_in_dim %arg132, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %47 = stablehlo.multiply %45, %46 : tensor<1x64x180x320xbf16>
-    %48 = stablehlo.broadcast_in_dim %47, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %49 = stablehlo.broadcast_in_dim %arg133, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %50 = stablehlo.add %48, %49 : tensor<1x64x180x320xbf16>
-    %51 = stablehlo.maximum %50, %cst_1 : tensor<1x64x180x320xbf16>
-    %52 = stablehlo.convolution(%51, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x180x320xbf16>
-    %53 = stablehlo.broadcast_in_dim %52, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %54 = stablehlo.broadcast_in_dim %arg134, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %55 = stablehlo.multiply %53, %54 : tensor<1x64x180x320xbf16>
-    %56 = stablehlo.broadcast_in_dim %55, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %57 = stablehlo.broadcast_in_dim %arg135, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %58 = stablehlo.add %56, %57 : tensor<1x64x180x320xbf16>
-    %59 = stablehlo.maximum %58, %cst_1 : tensor<1x64x180x320xbf16>
-    %60 = stablehlo.convolution(%59, %arg8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %61 = stablehlo.broadcast_in_dim %60, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %62 = stablehlo.broadcast_in_dim %arg136, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %63 = stablehlo.multiply %61, %62 : tensor<1x256x180x320xbf16>
-    %64 = stablehlo.broadcast_in_dim %63, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %65 = stablehlo.broadcast_in_dim %arg137, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %66 = stablehlo.add %64, %65 : tensor<1x256x180x320xbf16>
-    %67 = stablehlo.add %66, %43 : tensor<1x256x180x320xbf16>
-    %68 = stablehlo.maximum %67, %cst_2 : tensor<1x256x180x320xbf16>
-    %69 = stablehlo.convolution(%68, %arg9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x180x320xbf16>, tensor<64x256x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %70 = stablehlo.broadcast_in_dim %69, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %71 = stablehlo.broadcast_in_dim %arg138, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %72 = stablehlo.multiply %70, %71 : tensor<1x64x180x320xbf16>
-    %73 = stablehlo.broadcast_in_dim %72, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %74 = stablehlo.broadcast_in_dim %arg139, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %75 = stablehlo.add %73, %74 : tensor<1x64x180x320xbf16>
-    %76 = stablehlo.maximum %75, %cst_1 : tensor<1x64x180x320xbf16>
-    %77 = stablehlo.convolution(%76, %arg10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x180x320xbf16>
-    %78 = stablehlo.broadcast_in_dim %77, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %79 = stablehlo.broadcast_in_dim %arg140, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %80 = stablehlo.multiply %78, %79 : tensor<1x64x180x320xbf16>
-    %81 = stablehlo.broadcast_in_dim %80, dims = [0, 1, 2, 3] : (tensor<1x64x180x320xbf16>) -> tensor<1x64x180x320xbf16>
-    %82 = stablehlo.broadcast_in_dim %arg141, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xbf16>) -> tensor<1x64x180x320xbf16>
-    %83 = stablehlo.add %81, %82 : tensor<1x64x180x320xbf16>
-    %84 = stablehlo.maximum %83, %cst_1 : tensor<1x64x180x320xbf16>
-    %85 = stablehlo.convolution(%84, %arg11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x180x320xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %86 = stablehlo.broadcast_in_dim %85, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %87 = stablehlo.broadcast_in_dim %arg142, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %88 = stablehlo.multiply %86, %87 : tensor<1x256x180x320xbf16>
-    %89 = stablehlo.broadcast_in_dim %88, dims = [0, 1, 2, 3] : (tensor<1x256x180x320xbf16>) -> tensor<1x256x180x320xbf16>
-    %90 = stablehlo.broadcast_in_dim %arg143, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x180x320xbf16>
-    %91 = stablehlo.add %89, %90 : tensor<1x256x180x320xbf16>
-    %92 = stablehlo.add %91, %68 : tensor<1x256x180x320xbf16>
-    %93 = stablehlo.maximum %92, %cst_2 : tensor<1x256x180x320xbf16>
-    %94 = stablehlo.convolution(%93, %arg12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x180x320xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x180x320xbf16>
-    %95 = stablehlo.broadcast_in_dim %94, dims = [0, 1, 2, 3] : (tensor<1x128x180x320xbf16>) -> tensor<1x128x180x320xbf16>
-    %96 = stablehlo.broadcast_in_dim %arg144, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x180x320xbf16>
-    %97 = stablehlo.multiply %95, %96 : tensor<1x128x180x320xbf16>
-    %98 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x128x180x320xbf16>) -> tensor<1x128x180x320xbf16>
-    %99 = stablehlo.broadcast_in_dim %arg145, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x180x320xbf16>
-    %100 = stablehlo.add %98, %99 : tensor<1x128x180x320xbf16>
-    %101 = stablehlo.maximum %100, %cst_3 : tensor<1x128x180x320xbf16>
-    %102 = stablehlo.convolution(%101, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x180x320xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x90x160xbf16>
-    %103 = stablehlo.broadcast_in_dim %102, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %104 = stablehlo.broadcast_in_dim %arg146, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %105 = stablehlo.multiply %103, %104 : tensor<1x128x90x160xbf16>
-    %106 = stablehlo.broadcast_in_dim %105, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %107 = stablehlo.broadcast_in_dim %arg147, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %108 = stablehlo.add %106, %107 : tensor<1x128x90x160xbf16>
-    %109 = stablehlo.maximum %108, %cst_4 : tensor<1x128x90x160xbf16>
-    %110 = stablehlo.convolution(%109, %arg14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x90x160xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %111 = stablehlo.broadcast_in_dim %110, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %112 = stablehlo.broadcast_in_dim %arg148, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %113 = stablehlo.multiply %111, %112 : tensor<1x512x90x160xbf16>
-    %114 = stablehlo.broadcast_in_dim %113, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %115 = stablehlo.broadcast_in_dim %arg149, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %116 = stablehlo.add %114, %115 : tensor<1x512x90x160xbf16>
-    %117 = stablehlo.convolution(%93, %arg15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x180x320xbf16>, tensor<512x256x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %118 = stablehlo.broadcast_in_dim %117, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %119 = stablehlo.broadcast_in_dim %arg150, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %120 = stablehlo.multiply %118, %119 : tensor<1x512x90x160xbf16>
-    %121 = stablehlo.broadcast_in_dim %120, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %122 = stablehlo.broadcast_in_dim %arg151, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %123 = stablehlo.add %121, %122 : tensor<1x512x90x160xbf16>
-    %124 = stablehlo.add %116, %123 : tensor<1x512x90x160xbf16>
-    %125 = stablehlo.maximum %124, %cst_5 : tensor<1x512x90x160xbf16>
-    %126 = stablehlo.convolution(%125, %arg16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x90x160xbf16>, tensor<128x512x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %127 = stablehlo.broadcast_in_dim %126, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %128 = stablehlo.broadcast_in_dim %arg152, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %129 = stablehlo.multiply %127, %128 : tensor<1x128x90x160xbf16>
-    %130 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %131 = stablehlo.broadcast_in_dim %arg153, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %132 = stablehlo.add %130, %131 : tensor<1x128x90x160xbf16>
-    %133 = stablehlo.maximum %132, %cst_4 : tensor<1x128x90x160xbf16>
-    %134 = stablehlo.convolution(%133, %arg17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x90x160xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x90x160xbf16>
-    %135 = stablehlo.broadcast_in_dim %134, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %136 = stablehlo.broadcast_in_dim %arg154, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %137 = stablehlo.multiply %135, %136 : tensor<1x128x90x160xbf16>
-    %138 = stablehlo.broadcast_in_dim %137, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %139 = stablehlo.broadcast_in_dim %arg155, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %140 = stablehlo.add %138, %139 : tensor<1x128x90x160xbf16>
-    %141 = stablehlo.maximum %140, %cst_4 : tensor<1x128x90x160xbf16>
-    %142 = stablehlo.convolution(%141, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x90x160xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %143 = stablehlo.broadcast_in_dim %142, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %144 = stablehlo.broadcast_in_dim %arg156, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %145 = stablehlo.multiply %143, %144 : tensor<1x512x90x160xbf16>
-    %146 = stablehlo.broadcast_in_dim %145, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %147 = stablehlo.broadcast_in_dim %arg157, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %148 = stablehlo.add %146, %147 : tensor<1x512x90x160xbf16>
-    %149 = stablehlo.add %148, %125 : tensor<1x512x90x160xbf16>
-    %150 = stablehlo.maximum %149, %cst_5 : tensor<1x512x90x160xbf16>
-    %151 = stablehlo.convolution(%150, %arg19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x90x160xbf16>, tensor<128x512x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %152 = stablehlo.broadcast_in_dim %151, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %153 = stablehlo.broadcast_in_dim %arg158, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %154 = stablehlo.multiply %152, %153 : tensor<1x128x90x160xbf16>
-    %155 = stablehlo.broadcast_in_dim %154, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %156 = stablehlo.broadcast_in_dim %arg159, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %157 = stablehlo.add %155, %156 : tensor<1x128x90x160xbf16>
-    %158 = stablehlo.maximum %157, %cst_4 : tensor<1x128x90x160xbf16>
-    %159 = stablehlo.convolution(%158, %arg20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x90x160xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x90x160xbf16>
-    %160 = stablehlo.broadcast_in_dim %159, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %161 = stablehlo.broadcast_in_dim %arg160, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %162 = stablehlo.multiply %160, %161 : tensor<1x128x90x160xbf16>
-    %163 = stablehlo.broadcast_in_dim %162, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %164 = stablehlo.broadcast_in_dim %arg161, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %165 = stablehlo.add %163, %164 : tensor<1x128x90x160xbf16>
-    %166 = stablehlo.maximum %165, %cst_4 : tensor<1x128x90x160xbf16>
-    %167 = stablehlo.convolution(%166, %arg21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x90x160xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %168 = stablehlo.broadcast_in_dim %167, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %169 = stablehlo.broadcast_in_dim %arg162, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %170 = stablehlo.multiply %168, %169 : tensor<1x512x90x160xbf16>
-    %171 = stablehlo.broadcast_in_dim %170, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %172 = stablehlo.broadcast_in_dim %arg163, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %173 = stablehlo.add %171, %172 : tensor<1x512x90x160xbf16>
-    %174 = stablehlo.add %173, %150 : tensor<1x512x90x160xbf16>
-    %175 = stablehlo.maximum %174, %cst_5 : tensor<1x512x90x160xbf16>
-    %176 = stablehlo.convolution(%175, %arg22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x90x160xbf16>, tensor<128x512x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %177 = stablehlo.broadcast_in_dim %176, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %178 = stablehlo.broadcast_in_dim %arg164, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %179 = stablehlo.multiply %177, %178 : tensor<1x128x90x160xbf16>
-    %180 = stablehlo.broadcast_in_dim %179, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %181 = stablehlo.broadcast_in_dim %arg165, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %182 = stablehlo.add %180, %181 : tensor<1x128x90x160xbf16>
-    %183 = stablehlo.maximum %182, %cst_4 : tensor<1x128x90x160xbf16>
-    %184 = stablehlo.convolution(%183, %arg23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x90x160xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x90x160xbf16>
-    %185 = stablehlo.broadcast_in_dim %184, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %186 = stablehlo.broadcast_in_dim %arg166, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %187 = stablehlo.multiply %185, %186 : tensor<1x128x90x160xbf16>
-    %188 = stablehlo.broadcast_in_dim %187, dims = [0, 1, 2, 3] : (tensor<1x128x90x160xbf16>) -> tensor<1x128x90x160xbf16>
-    %189 = stablehlo.broadcast_in_dim %arg167, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xbf16>) -> tensor<1x128x90x160xbf16>
-    %190 = stablehlo.add %188, %189 : tensor<1x128x90x160xbf16>
-    %191 = stablehlo.maximum %190, %cst_4 : tensor<1x128x90x160xbf16>
-    %192 = stablehlo.convolution(%191, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x90x160xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %193 = stablehlo.broadcast_in_dim %192, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %194 = stablehlo.broadcast_in_dim %arg168, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %195 = stablehlo.multiply %193, %194 : tensor<1x512x90x160xbf16>
-    %196 = stablehlo.broadcast_in_dim %195, dims = [0, 1, 2, 3] : (tensor<1x512x90x160xbf16>) -> tensor<1x512x90x160xbf16>
-    %197 = stablehlo.broadcast_in_dim %arg169, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x90x160xbf16>
-    %198 = stablehlo.add %196, %197 : tensor<1x512x90x160xbf16>
-    %199 = stablehlo.add %198, %175 : tensor<1x512x90x160xbf16>
-    %200 = stablehlo.maximum %199, %cst_5 : tensor<1x512x90x160xbf16>
-    %201 = stablehlo.convolution(%200, %arg25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x90x160xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x90x160xbf16>
-    %202 = stablehlo.broadcast_in_dim %201, dims = [0, 1, 2, 3] : (tensor<1x256x90x160xbf16>) -> tensor<1x256x90x160xbf16>
-    %203 = stablehlo.broadcast_in_dim %arg170, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x90x160xbf16>
-    %204 = stablehlo.multiply %202, %203 : tensor<1x256x90x160xbf16>
-    %205 = stablehlo.broadcast_in_dim %204, dims = [0, 1, 2, 3] : (tensor<1x256x90x160xbf16>) -> tensor<1x256x90x160xbf16>
-    %206 = stablehlo.broadcast_in_dim %arg171, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x90x160xbf16>
-    %207 = stablehlo.add %205, %206 : tensor<1x256x90x160xbf16>
-    %208 = stablehlo.maximum %207, %cst_6 : tensor<1x256x90x160xbf16>
-    %209 = stablehlo.convolution(%208, %arg26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x90x160xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x45x80xbf16>
-    %210 = stablehlo.broadcast_in_dim %209, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %211 = stablehlo.broadcast_in_dim %arg172, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %212 = stablehlo.multiply %210, %211 : tensor<1x256x45x80xbf16>
-    %213 = stablehlo.broadcast_in_dim %212, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %214 = stablehlo.broadcast_in_dim %arg173, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %215 = stablehlo.add %213, %214 : tensor<1x256x45x80xbf16>
-    %216 = stablehlo.maximum %215, %cst_7 : tensor<1x256x45x80xbf16>
-    %217 = stablehlo.convolution(%216, %arg27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %218 = stablehlo.broadcast_in_dim %217, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %219 = stablehlo.broadcast_in_dim %arg174, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %220 = stablehlo.multiply %218, %219 : tensor<1x1024x45x80xbf16>
-    %221 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %222 = stablehlo.broadcast_in_dim %arg175, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %223 = stablehlo.add %221, %222 : tensor<1x1024x45x80xbf16>
-    %224 = stablehlo.convolution(%200, %arg28) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x90x160xbf16>, tensor<1024x512x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %225 = stablehlo.broadcast_in_dim %224, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %226 = stablehlo.broadcast_in_dim %arg176, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %227 = stablehlo.multiply %225, %226 : tensor<1x1024x45x80xbf16>
-    %228 = stablehlo.broadcast_in_dim %227, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %229 = stablehlo.broadcast_in_dim %arg177, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %230 = stablehlo.add %228, %229 : tensor<1x1024x45x80xbf16>
-    %231 = stablehlo.add %223, %230 : tensor<1x1024x45x80xbf16>
-    %232 = stablehlo.maximum %231, %cst_8 : tensor<1x1024x45x80xbf16>
-    %233 = stablehlo.convolution(%232, %arg29) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x45x80xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %234 = stablehlo.broadcast_in_dim %233, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %235 = stablehlo.broadcast_in_dim %arg178, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %236 = stablehlo.multiply %234, %235 : tensor<1x256x45x80xbf16>
-    %237 = stablehlo.broadcast_in_dim %236, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %238 = stablehlo.broadcast_in_dim %arg179, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %239 = stablehlo.add %237, %238 : tensor<1x256x45x80xbf16>
-    %240 = stablehlo.maximum %239, %cst_7 : tensor<1x256x45x80xbf16>
-    %241 = stablehlo.convolution(%240, %arg30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x45x80xbf16>
-    %242 = stablehlo.broadcast_in_dim %241, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %243 = stablehlo.broadcast_in_dim %arg180, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %244 = stablehlo.multiply %242, %243 : tensor<1x256x45x80xbf16>
-    %245 = stablehlo.broadcast_in_dim %244, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %246 = stablehlo.broadcast_in_dim %arg181, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %247 = stablehlo.add %245, %246 : tensor<1x256x45x80xbf16>
-    %248 = stablehlo.maximum %247, %cst_7 : tensor<1x256x45x80xbf16>
-    %249 = stablehlo.convolution(%248, %arg31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %250 = stablehlo.broadcast_in_dim %249, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %251 = stablehlo.broadcast_in_dim %arg182, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %252 = stablehlo.multiply %250, %251 : tensor<1x1024x45x80xbf16>
-    %253 = stablehlo.broadcast_in_dim %252, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %254 = stablehlo.broadcast_in_dim %arg183, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %255 = stablehlo.add %253, %254 : tensor<1x1024x45x80xbf16>
-    %256 = stablehlo.add %255, %232 : tensor<1x1024x45x80xbf16>
-    %257 = stablehlo.maximum %256, %cst_8 : tensor<1x1024x45x80xbf16>
-    %258 = stablehlo.convolution(%257, %arg32) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x45x80xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %259 = stablehlo.broadcast_in_dim %258, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %260 = stablehlo.broadcast_in_dim %arg184, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %261 = stablehlo.multiply %259, %260 : tensor<1x256x45x80xbf16>
-    %262 = stablehlo.broadcast_in_dim %261, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %263 = stablehlo.broadcast_in_dim %arg185, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %264 = stablehlo.add %262, %263 : tensor<1x256x45x80xbf16>
-    %265 = stablehlo.maximum %264, %cst_7 : tensor<1x256x45x80xbf16>
-    %266 = stablehlo.convolution(%265, %arg33) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x45x80xbf16>
-    %267 = stablehlo.broadcast_in_dim %266, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %268 = stablehlo.broadcast_in_dim %arg186, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %269 = stablehlo.multiply %267, %268 : tensor<1x256x45x80xbf16>
-    %270 = stablehlo.broadcast_in_dim %269, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %271 = stablehlo.broadcast_in_dim %arg187, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %272 = stablehlo.add %270, %271 : tensor<1x256x45x80xbf16>
-    %273 = stablehlo.maximum %272, %cst_7 : tensor<1x256x45x80xbf16>
-    %274 = stablehlo.convolution(%273, %arg34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %275 = stablehlo.broadcast_in_dim %274, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %276 = stablehlo.broadcast_in_dim %arg188, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %277 = stablehlo.multiply %275, %276 : tensor<1x1024x45x80xbf16>
-    %278 = stablehlo.broadcast_in_dim %277, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %279 = stablehlo.broadcast_in_dim %arg189, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %280 = stablehlo.add %278, %279 : tensor<1x1024x45x80xbf16>
-    %281 = stablehlo.add %280, %257 : tensor<1x1024x45x80xbf16>
-    %282 = stablehlo.maximum %281, %cst_8 : tensor<1x1024x45x80xbf16>
-    %283 = stablehlo.convolution(%282, %arg35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x45x80xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %284 = stablehlo.broadcast_in_dim %283, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %285 = stablehlo.broadcast_in_dim %arg190, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %286 = stablehlo.multiply %284, %285 : tensor<1x256x45x80xbf16>
-    %287 = stablehlo.broadcast_in_dim %286, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %288 = stablehlo.broadcast_in_dim %arg191, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %289 = stablehlo.add %287, %288 : tensor<1x256x45x80xbf16>
-    %290 = stablehlo.maximum %289, %cst_7 : tensor<1x256x45x80xbf16>
-    %291 = stablehlo.convolution(%290, %arg36) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x45x80xbf16>
-    %292 = stablehlo.broadcast_in_dim %291, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %293 = stablehlo.broadcast_in_dim %arg192, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %294 = stablehlo.multiply %292, %293 : tensor<1x256x45x80xbf16>
-    %295 = stablehlo.broadcast_in_dim %294, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %296 = stablehlo.broadcast_in_dim %arg193, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %297 = stablehlo.add %295, %296 : tensor<1x256x45x80xbf16>
-    %298 = stablehlo.maximum %297, %cst_7 : tensor<1x256x45x80xbf16>
-    %299 = stablehlo.convolution(%298, %arg37) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %300 = stablehlo.broadcast_in_dim %299, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %301 = stablehlo.broadcast_in_dim %arg194, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %302 = stablehlo.multiply %300, %301 : tensor<1x1024x45x80xbf16>
-    %303 = stablehlo.broadcast_in_dim %302, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %304 = stablehlo.broadcast_in_dim %arg195, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %305 = stablehlo.add %303, %304 : tensor<1x1024x45x80xbf16>
-    %306 = stablehlo.add %305, %282 : tensor<1x1024x45x80xbf16>
-    %307 = stablehlo.maximum %306, %cst_8 : tensor<1x1024x45x80xbf16>
-    %308 = stablehlo.convolution(%307, %arg38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x45x80xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %309 = stablehlo.broadcast_in_dim %308, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %310 = stablehlo.broadcast_in_dim %arg196, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %311 = stablehlo.multiply %309, %310 : tensor<1x256x45x80xbf16>
-    %312 = stablehlo.broadcast_in_dim %311, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %313 = stablehlo.broadcast_in_dim %arg197, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %314 = stablehlo.add %312, %313 : tensor<1x256x45x80xbf16>
-    %315 = stablehlo.maximum %314, %cst_7 : tensor<1x256x45x80xbf16>
-    %316 = stablehlo.convolution(%315, %arg39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x45x80xbf16>
-    %317 = stablehlo.broadcast_in_dim %316, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %318 = stablehlo.broadcast_in_dim %arg198, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %319 = stablehlo.multiply %317, %318 : tensor<1x256x45x80xbf16>
-    %320 = stablehlo.broadcast_in_dim %319, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %321 = stablehlo.broadcast_in_dim %arg199, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %322 = stablehlo.add %320, %321 : tensor<1x256x45x80xbf16>
-    %323 = stablehlo.maximum %322, %cst_7 : tensor<1x256x45x80xbf16>
-    %324 = stablehlo.convolution(%323, %arg40) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %325 = stablehlo.broadcast_in_dim %324, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %326 = stablehlo.broadcast_in_dim %arg200, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %327 = stablehlo.multiply %325, %326 : tensor<1x1024x45x80xbf16>
-    %328 = stablehlo.broadcast_in_dim %327, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %329 = stablehlo.broadcast_in_dim %arg201, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %330 = stablehlo.add %328, %329 : tensor<1x1024x45x80xbf16>
-    %331 = stablehlo.add %330, %307 : tensor<1x1024x45x80xbf16>
-    %332 = stablehlo.maximum %331, %cst_8 : tensor<1x1024x45x80xbf16>
-    %333 = stablehlo.convolution(%332, %arg41) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x45x80xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %334 = stablehlo.broadcast_in_dim %333, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %335 = stablehlo.broadcast_in_dim %arg202, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %336 = stablehlo.multiply %334, %335 : tensor<1x256x45x80xbf16>
-    %337 = stablehlo.broadcast_in_dim %336, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %338 = stablehlo.broadcast_in_dim %arg203, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %339 = stablehlo.add %337, %338 : tensor<1x256x45x80xbf16>
-    %340 = stablehlo.maximum %339, %cst_7 : tensor<1x256x45x80xbf16>
-    %341 = stablehlo.convolution(%340, %arg42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x45x80xbf16>
-    %342 = stablehlo.broadcast_in_dim %341, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %343 = stablehlo.broadcast_in_dim %arg204, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %344 = stablehlo.multiply %342, %343 : tensor<1x256x45x80xbf16>
-    %345 = stablehlo.broadcast_in_dim %344, dims = [0, 1, 2, 3] : (tensor<1x256x45x80xbf16>) -> tensor<1x256x45x80xbf16>
-    %346 = stablehlo.broadcast_in_dim %arg205, dims = [0, 1, 2, 3] : (tensor<1x256x1x1xbf16>) -> tensor<1x256x45x80xbf16>
-    %347 = stablehlo.add %345, %346 : tensor<1x256x45x80xbf16>
-    %348 = stablehlo.maximum %347, %cst_7 : tensor<1x256x45x80xbf16>
-    %349 = stablehlo.convolution(%348, %arg43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x45x80xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %350 = stablehlo.broadcast_in_dim %349, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %351 = stablehlo.broadcast_in_dim %arg206, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %352 = stablehlo.multiply %350, %351 : tensor<1x1024x45x80xbf16>
-    %353 = stablehlo.broadcast_in_dim %352, dims = [0, 1, 2, 3] : (tensor<1x1024x45x80xbf16>) -> tensor<1x1024x45x80xbf16>
-    %354 = stablehlo.broadcast_in_dim %arg207, dims = [0, 1, 2, 3] : (tensor<1x1024x1x1xbf16>) -> tensor<1x1024x45x80xbf16>
-    %355 = stablehlo.add %353, %354 : tensor<1x1024x45x80xbf16>
-    %356 = stablehlo.add %355, %332 : tensor<1x1024x45x80xbf16>
-    %357 = stablehlo.maximum %356, %cst_8 : tensor<1x1024x45x80xbf16>
-    %358 = stablehlo.convolution(%357, %arg44) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x45x80xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x45x80xbf16>
-    %359 = stablehlo.broadcast_in_dim %358, dims = [0, 1, 2, 3] : (tensor<1x512x45x80xbf16>) -> tensor<1x512x45x80xbf16>
-    %360 = stablehlo.broadcast_in_dim %arg208, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x45x80xbf16>
-    %361 = stablehlo.multiply %359, %360 : tensor<1x512x45x80xbf16>
-    %362 = stablehlo.broadcast_in_dim %361, dims = [0, 1, 2, 3] : (tensor<1x512x45x80xbf16>) -> tensor<1x512x45x80xbf16>
-    %363 = stablehlo.broadcast_in_dim %arg209, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x45x80xbf16>
-    %364 = stablehlo.add %362, %363 : tensor<1x512x45x80xbf16>
-    %365 = stablehlo.maximum %364, %cst_9 : tensor<1x512x45x80xbf16>
-    %366 = stablehlo.convolution(%365, %arg45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x45x80xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x23x40xbf16>
-    %367 = stablehlo.broadcast_in_dim %366, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %368 = stablehlo.broadcast_in_dim %arg210, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %369 = stablehlo.multiply %367, %368 : tensor<1x512x23x40xbf16>
-    %370 = stablehlo.broadcast_in_dim %369, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %371 = stablehlo.broadcast_in_dim %arg211, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %372 = stablehlo.add %370, %371 : tensor<1x512x23x40xbf16>
-    %373 = stablehlo.maximum %372, %cst_10 : tensor<1x512x23x40xbf16>
-    %374 = stablehlo.convolution(%373, %arg46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x23x40xbf16>, tensor<2048x512x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %375 = stablehlo.broadcast_in_dim %374, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %376 = stablehlo.broadcast_in_dim %arg212, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %377 = stablehlo.multiply %375, %376 : tensor<1x2048x23x40xbf16>
-    %378 = stablehlo.broadcast_in_dim %377, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %379 = stablehlo.broadcast_in_dim %arg213, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %380 = stablehlo.add %378, %379 : tensor<1x2048x23x40xbf16>
-    %381 = stablehlo.convolution(%357, %arg47) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x45x80xbf16>, tensor<2048x1024x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %382 = stablehlo.broadcast_in_dim %381, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %383 = stablehlo.broadcast_in_dim %arg214, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %384 = stablehlo.multiply %382, %383 : tensor<1x2048x23x40xbf16>
-    %385 = stablehlo.broadcast_in_dim %384, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %386 = stablehlo.broadcast_in_dim %arg215, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %387 = stablehlo.add %385, %386 : tensor<1x2048x23x40xbf16>
-    %388 = stablehlo.add %380, %387 : tensor<1x2048x23x40xbf16>
-    %389 = stablehlo.maximum %388, %cst_11 : tensor<1x2048x23x40xbf16>
-    %390 = stablehlo.convolution(%389, %arg48) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2048x23x40xbf16>, tensor<512x2048x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %391 = stablehlo.broadcast_in_dim %390, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %392 = stablehlo.broadcast_in_dim %arg216, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %393 = stablehlo.multiply %391, %392 : tensor<1x512x23x40xbf16>
-    %394 = stablehlo.broadcast_in_dim %393, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %395 = stablehlo.broadcast_in_dim %arg217, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %396 = stablehlo.add %394, %395 : tensor<1x512x23x40xbf16>
-    %397 = stablehlo.maximum %396, %cst_10 : tensor<1x512x23x40xbf16>
-    %398 = stablehlo.convolution(%397, %arg49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x23x40xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x23x40xbf16>
-    %399 = stablehlo.broadcast_in_dim %398, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %400 = stablehlo.broadcast_in_dim %arg218, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %401 = stablehlo.multiply %399, %400 : tensor<1x512x23x40xbf16>
-    %402 = stablehlo.broadcast_in_dim %401, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %403 = stablehlo.broadcast_in_dim %arg219, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %404 = stablehlo.add %402, %403 : tensor<1x512x23x40xbf16>
-    %405 = stablehlo.maximum %404, %cst_10 : tensor<1x512x23x40xbf16>
-    %406 = stablehlo.convolution(%405, %arg50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x23x40xbf16>, tensor<2048x512x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %407 = stablehlo.broadcast_in_dim %406, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %408 = stablehlo.broadcast_in_dim %arg220, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %409 = stablehlo.multiply %407, %408 : tensor<1x2048x23x40xbf16>
-    %410 = stablehlo.broadcast_in_dim %409, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %411 = stablehlo.broadcast_in_dim %arg221, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %412 = stablehlo.add %410, %411 : tensor<1x2048x23x40xbf16>
-    %413 = stablehlo.add %412, %389 : tensor<1x2048x23x40xbf16>
-    %414 = stablehlo.maximum %413, %cst_11 : tensor<1x2048x23x40xbf16>
-    %415 = stablehlo.convolution(%414, %arg51) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2048x23x40xbf16>, tensor<512x2048x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %416 = stablehlo.broadcast_in_dim %415, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %417 = stablehlo.broadcast_in_dim %arg222, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %418 = stablehlo.multiply %416, %417 : tensor<1x512x23x40xbf16>
-    %419 = stablehlo.broadcast_in_dim %418, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %420 = stablehlo.broadcast_in_dim %arg223, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %421 = stablehlo.add %419, %420 : tensor<1x512x23x40xbf16>
-    %422 = stablehlo.maximum %421, %cst_10 : tensor<1x512x23x40xbf16>
-    %423 = stablehlo.convolution(%422, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x23x40xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x23x40xbf16>
-    %424 = stablehlo.broadcast_in_dim %423, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %425 = stablehlo.broadcast_in_dim %arg224, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %426 = stablehlo.multiply %424, %425 : tensor<1x512x23x40xbf16>
-    %427 = stablehlo.broadcast_in_dim %426, dims = [0, 1, 2, 3] : (tensor<1x512x23x40xbf16>) -> tensor<1x512x23x40xbf16>
-    %428 = stablehlo.broadcast_in_dim %arg225, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x23x40xbf16>
-    %429 = stablehlo.add %427, %428 : tensor<1x512x23x40xbf16>
-    %430 = stablehlo.maximum %429, %cst_10 : tensor<1x512x23x40xbf16>
-    %431 = stablehlo.convolution(%430, %arg53) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x23x40xbf16>, tensor<2048x512x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %432 = stablehlo.broadcast_in_dim %431, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %433 = stablehlo.broadcast_in_dim %arg226, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %434 = stablehlo.multiply %432, %433 : tensor<1x2048x23x40xbf16>
-    %435 = stablehlo.broadcast_in_dim %434, dims = [0, 1, 2, 3] : (tensor<1x2048x23x40xbf16>) -> tensor<1x2048x23x40xbf16>
-    %436 = stablehlo.broadcast_in_dim %arg227, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x23x40xbf16>
-    %437 = stablehlo.add %435, %436 : tensor<1x2048x23x40xbf16>
-    %438 = stablehlo.add %437, %414 : tensor<1x2048x23x40xbf16>
-    %439 = stablehlo.maximum %438, %cst_11 : tensor<1x2048x23x40xbf16>
-    %440 = stablehlo.convolution(%439, %arg54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2048x23x40xbf16>, tensor<256x2048x1x1xbf16>) -> tensor<1x256x23x40xbf16>
-    %441 = stablehlo.reshape %arg55 : (tensor<256xbf16>) -> tensor<256x1x1xbf16>
-    %442 = stablehlo.broadcast_in_dim %440, dims = [0, 1, 2, 3] : (tensor<1x256x23x40xbf16>) -> tensor<1x256x23x40xbf16>
-    %443 = stablehlo.broadcast_in_dim %441, dims = [1, 2, 3] : (tensor<256x1x1xbf16>) -> tensor<1x256x23x40xbf16>
-    %444 = stablehlo.add %442, %443 : tensor<1x256x23x40xbf16>
-    %445 = stablehlo.reshape %444 : (tensor<1x256x23x40xbf16>) -> tensor<1x256x920xbf16>
-    %446 = stablehlo.transpose %445, dims = [2, 0, 1] : (tensor<1x256x920xbf16>) -> tensor<920x1x256xbf16>
-    %447 = stablehlo.add %446, %arg228 : tensor<920x1x256xbf16>
-    %448 = stablehlo.reshape %447 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %449 = stablehlo.dot_general %448, %arg229, contracting_dims = [1] x [0] : (tensor<920x256xbf16>, tensor<256x256xbf16>) -> tensor<920x256xbf16>
-    %450 = stablehlo.reshape %449 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %451 = stablehlo.broadcast_in_dim %450, dims = [0, 1, 2] : (tensor<920x1x256xbf16>) -> tensor<920x1x256xbf16>
-    %452 = stablehlo.broadcast_in_dim %arg230, dims = [2] : (tensor<256xbf16>) -> tensor<920x1x256xbf16>
-    %453 = stablehlo.add %451, %452 : tensor<920x1x256xbf16>
-    %454 = stablehlo.reshape %453 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %455 = stablehlo.dot_general %448, %arg231, contracting_dims = [1] x [0] : (tensor<920x256xbf16>, tensor<256x256xbf16>) -> tensor<920x256xbf16>
-    %456 = stablehlo.reshape %455 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %457 = stablehlo.broadcast_in_dim %456, dims = [0, 1, 2] : (tensor<920x1x256xbf16>) -> tensor<920x1x256xbf16>
-    %458 = stablehlo.broadcast_in_dim %arg232, dims = [2] : (tensor<256xbf16>) -> tensor<920x1x256xbf16>
-    %459 = stablehlo.add %457, %458 : tensor<920x1x256xbf16>
-    %460 = stablehlo.reshape %459 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %461 = stablehlo.reshape %446 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %462 = stablehlo.dot_general %461, %arg233, contracting_dims = [1] x [0] : (tensor<920x256xbf16>, tensor<256x256xbf16>) -> tensor<920x256xbf16>
-    %463 = stablehlo.reshape %462 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %464 = stablehlo.broadcast_in_dim %463, dims = [0, 1, 2] : (tensor<920x1x256xbf16>) -> tensor<920x1x256xbf16>
-    %465 = stablehlo.broadcast_in_dim %arg234, dims = [2] : (tensor<256xbf16>) -> tensor<920x1x256xbf16>
-    %466 = stablehlo.add %464, %465 : tensor<920x1x256xbf16>
-    %467 = stablehlo.reshape %466 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %468 = stablehlo.reshape %454 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %469 = stablehlo.reshape %468 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %470 = stablehlo.transpose %469, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %471 = stablehlo.convert %cst_18 : (tensor<1xf64>) -> tensor<1xbf16>
-    %472 = stablehlo.reshape %471 : (tensor<1xbf16>) -> tensor<bf16>
-    %473 = stablehlo.broadcast_in_dim %470, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %474 = stablehlo.broadcast_in_dim %472, dims = [] : (tensor<bf16>) -> tensor<8x920x32xbf16>
-    %475 = stablehlo.multiply %473, %474 : tensor<8x920x32xbf16>
-    %476 = stablehlo.reshape %460 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %477 = stablehlo.reshape %476 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %478 = stablehlo.transpose %477, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %479 = stablehlo.transpose %478, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %480 = stablehlo.broadcast_in_dim %479, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %481 = stablehlo.dot_general %475, %480, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x920x920xbf16>
-    %482 = stablehlo.convert %cst_19 : (tensor<1xi64>) -> tensor<1xbf16>
-    %483 = stablehlo.reshape %482 : (tensor<1xbf16>) -> tensor<bf16>
-    %484 = stablehlo.broadcast_in_dim %481, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %485 = stablehlo.broadcast_in_dim %483, dims = [] : (tensor<bf16>) -> tensor<8x920x920xbf16>
-    %486 = stablehlo.multiply %484, %485 : tensor<8x920x920xbf16>
-    %487 = stablehlo.broadcast_in_dim %486, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %488 = stablehlo.broadcast_in_dim %arg235, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x920x920xbf16>
-    %489 = stablehlo.add %487, %488 : tensor<8x920x920xbf16>
-    %490 = stablehlo.convert %489 : (tensor<8x920x920xbf16>) -> tensor<8x920x920xf32>
-    %491 = stablehlo.reduce(%490 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %492 = stablehlo.reshape %491 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %493 = stablehlo.broadcast_in_dim %490, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %494 = stablehlo.broadcast_in_dim %492, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %495 = stablehlo.subtract %493, %494 : tensor<8x920x920xf32>
-    %496 = stablehlo.exponential %495 : tensor<8x920x920xf32>
-    %497 = stablehlo.reduce(%496 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %498 = stablehlo.reshape %497 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %499 = stablehlo.broadcast_in_dim %496, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %500 = stablehlo.broadcast_in_dim %498, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %501 = stablehlo.divide %499, %500 : tensor<8x920x920xf32>
-    %502 = stablehlo.convert %501 : (tensor<8x920x920xf32>) -> tensor<8x920x920xbf16>
-    %503 = stablehlo.reshape %467 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %504 = stablehlo.reshape %503 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %505 = stablehlo.transpose %504, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %506 = stablehlo.broadcast_in_dim %505, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %507 = stablehlo.dot_general %502, %506, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %508 = stablehlo.transpose %507, dims = [1, 0, 2] : (tensor<8x920x32xbf16>) -> tensor<920x8x32xbf16>
-    %509 = stablehlo.reshape %508 : (tensor<920x8x32xbf16>) -> tensor<920x256xbf16>
-    %510 = stablehlo.convert %509 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %511 = stablehlo.dot_general %510, %arg236, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %512 = stablehlo.convert %cst_19 : (tensor<1xi64>) -> tensor<1xf32>
-    %513 = stablehlo.reshape %512 : (tensor<1xf32>) -> tensor<f32>
-    %514 = stablehlo.broadcast_in_dim %511, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %515 = stablehlo.broadcast_in_dim %513, dims = [] : (tensor<f32>) -> tensor<920x256xf32>
-    %516 = stablehlo.multiply %514, %515 : tensor<920x256xf32>
-    %517 = stablehlo.broadcast_in_dim %516, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %518 = stablehlo.broadcast_in_dim %arg237, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %519 = stablehlo.add %517, %518 : tensor<920x256xf32>
-    %520 = stablehlo.convert %519 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %521 = stablehlo.reshape %520 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %522 = stablehlo.add %446, %521 : tensor<920x1x256xbf16>
-    %523 = stablehlo.convert %522 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %524 = stablehlo.convert %523 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %525 = stablehlo.reduce(%524 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %526 = stablehlo.reshape %525 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %527 = stablehlo.convert %cst_20 : (tensor<1xi64>) -> tensor<1xf64>
-    %528 = stablehlo.reshape %527 : (tensor<1xf64>) -> tensor<f64>
-    %529 = stablehlo.broadcast_in_dim %526, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %530 = stablehlo.broadcast_in_dim %528, dims = [] : (tensor<f64>) -> tensor<920x1x1xf64>
-    %531 = stablehlo.divide %529, %530 : tensor<920x1x1xf64>
-    %532 = stablehlo.broadcast_in_dim %524, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %533 = stablehlo.broadcast_in_dim %531, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %534 = stablehlo.subtract %532, %533 : tensor<920x1x256xf64>
-    %535 = stablehlo.multiply %534, %534 : tensor<920x1x256xf64>
-    %536 = stablehlo.reduce(%535 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %537 = stablehlo.reshape %536 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %538 = stablehlo.broadcast_in_dim %537, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %539 = stablehlo.divide %538, %530 : tensor<920x1x1xf64>
-    %540 = stablehlo.convert %539 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %541 = stablehlo.reduce(%523 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %542 = stablehlo.reshape %541 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %543 = stablehlo.convert %cst_20 : (tensor<1xi64>) -> tensor<1xf32>
-    %544 = stablehlo.reshape %543 : (tensor<1xf32>) -> tensor<f32>
-    %545 = stablehlo.broadcast_in_dim %542, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %546 = stablehlo.broadcast_in_dim %544, dims = [] : (tensor<f32>) -> tensor<920x1x1xf32>
-    %547 = stablehlo.divide %545, %546 : tensor<920x1x1xf32>
-    %548 = stablehlo.convert %cst_21 : (tensor<1xf64>) -> tensor<1xf32>
-    %549 = stablehlo.reshape %548 : (tensor<1xf32>) -> tensor<f32>
-    %550 = stablehlo.broadcast_in_dim %540, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %551 = stablehlo.broadcast_in_dim %549, dims = [] : (tensor<f32>) -> tensor<920x1x1xf32>
-    %552 = stablehlo.add %550, %551 : tensor<920x1x1xf32>
-    %553 = stablehlo.rsqrt %552 : tensor<920x1x1xf32>
-    %554 = stablehlo.broadcast_in_dim %523, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %555 = stablehlo.broadcast_in_dim %547, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %556 = stablehlo.subtract %554, %555 : tensor<920x1x256xf32>
-    %557 = stablehlo.broadcast_in_dim %556, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %558 = stablehlo.broadcast_in_dim %553, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %559 = stablehlo.multiply %557, %558 : tensor<920x1x256xf32>
-    %560 = stablehlo.convert %arg56 : (tensor<256xbf16>) -> tensor<256xf32>
-    %561 = stablehlo.broadcast_in_dim %559, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %562 = stablehlo.broadcast_in_dim %560, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %563 = stablehlo.multiply %561, %562 : tensor<920x1x256xf32>
-    %564 = stablehlo.convert %arg57 : (tensor<256xbf16>) -> tensor<256xf32>
-    %565 = stablehlo.broadcast_in_dim %563, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %566 = stablehlo.broadcast_in_dim %564, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %567 = stablehlo.add %565, %566 : tensor<920x1x256xf32>
-    %568 = stablehlo.convert %567 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %569 = stablehlo.reshape %568 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %570 = stablehlo.convert %569 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %571 = stablehlo.dot_general %570, %arg238, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x2048xf32>) -> tensor<920x2048xf32>
-    %572 = stablehlo.broadcast_in_dim %571, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %573 = stablehlo.broadcast_in_dim %513, dims = [] : (tensor<f32>) -> tensor<920x2048xf32>
-    %574 = stablehlo.multiply %572, %573 : tensor<920x2048xf32>
-    %575 = stablehlo.broadcast_in_dim %574, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %576 = stablehlo.broadcast_in_dim %arg239, dims = [1] : (tensor<2048xf32>) -> tensor<920x2048xf32>
-    %577 = stablehlo.add %575, %576 : tensor<920x2048xf32>
-    %578 = stablehlo.convert %577 : (tensor<920x2048xf32>) -> tensor<920x2048xbf16>
-    %579 = stablehlo.reshape %578 : (tensor<920x2048xbf16>) -> tensor<920x1x2048xbf16>
-    %580 = stablehlo.maximum %579, %cst_15 : tensor<920x1x2048xbf16>
-    %581 = stablehlo.reshape %580 : (tensor<920x1x2048xbf16>) -> tensor<920x2048xbf16>
-    %582 = stablehlo.convert %581 : (tensor<920x2048xbf16>) -> tensor<920x2048xf32>
-    %583 = stablehlo.dot_general %582, %arg240, contracting_dims = [1] x [0] : (tensor<920x2048xf32>, tensor<2048x256xf32>) -> tensor<920x256xf32>
-    %584 = stablehlo.broadcast_in_dim %583, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %585 = stablehlo.multiply %584, %515 : tensor<920x256xf32>
-    %586 = stablehlo.broadcast_in_dim %585, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %587 = stablehlo.broadcast_in_dim %arg241, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %588 = stablehlo.add %586, %587 : tensor<920x256xf32>
-    %589 = stablehlo.convert %588 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %590 = stablehlo.reshape %589 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %591 = stablehlo.add %568, %590 : tensor<920x1x256xbf16>
-    %592 = stablehlo.convert %591 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %593 = stablehlo.convert %592 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %594 = stablehlo.reduce(%593 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %595 = stablehlo.reshape %594 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %596 = stablehlo.broadcast_in_dim %595, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %597 = stablehlo.divide %596, %530 : tensor<920x1x1xf64>
-    %598 = stablehlo.broadcast_in_dim %593, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %599 = stablehlo.broadcast_in_dim %597, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %600 = stablehlo.subtract %598, %599 : tensor<920x1x256xf64>
-    %601 = stablehlo.multiply %600, %600 : tensor<920x1x256xf64>
-    %602 = stablehlo.reduce(%601 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %603 = stablehlo.reshape %602 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %604 = stablehlo.broadcast_in_dim %603, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %605 = stablehlo.divide %604, %530 : tensor<920x1x1xf64>
-    %606 = stablehlo.convert %605 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %607 = stablehlo.reduce(%592 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %608 = stablehlo.reshape %607 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %609 = stablehlo.broadcast_in_dim %608, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %610 = stablehlo.divide %609, %546 : tensor<920x1x1xf32>
-    %611 = stablehlo.broadcast_in_dim %606, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %612 = stablehlo.add %611, %551 : tensor<920x1x1xf32>
-    %613 = stablehlo.rsqrt %612 : tensor<920x1x1xf32>
-    %614 = stablehlo.broadcast_in_dim %592, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %615 = stablehlo.broadcast_in_dim %610, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %616 = stablehlo.subtract %614, %615 : tensor<920x1x256xf32>
-    %617 = stablehlo.broadcast_in_dim %616, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %618 = stablehlo.broadcast_in_dim %613, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %619 = stablehlo.multiply %617, %618 : tensor<920x1x256xf32>
-    %620 = stablehlo.convert %arg58 : (tensor<256xbf16>) -> tensor<256xf32>
-    %621 = stablehlo.broadcast_in_dim %619, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %622 = stablehlo.broadcast_in_dim %620, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %623 = stablehlo.multiply %621, %622 : tensor<920x1x256xf32>
-    %624 = stablehlo.convert %arg59 : (tensor<256xbf16>) -> tensor<256xf32>
-    %625 = stablehlo.broadcast_in_dim %623, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %626 = stablehlo.broadcast_in_dim %624, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %627 = stablehlo.add %625, %626 : tensor<920x1x256xf32>
-    %628 = stablehlo.convert %627 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %629 = stablehlo.add %628, %arg228 : tensor<920x1x256xbf16>
-    %630 = stablehlo.reshape %629 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %631 = stablehlo.convert %630 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %632 = stablehlo.dot_general %631, %arg242, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %633 = stablehlo.broadcast_in_dim %632, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %634 = stablehlo.multiply %633, %515 : tensor<920x256xf32>
-    %635 = stablehlo.broadcast_in_dim %634, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %636 = stablehlo.broadcast_in_dim %arg243, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %637 = stablehlo.add %635, %636 : tensor<920x256xf32>
-    %638 = stablehlo.convert %637 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %639 = stablehlo.reshape %638 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %640 = stablehlo.dot_general %631, %arg244, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %641 = stablehlo.broadcast_in_dim %640, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %642 = stablehlo.multiply %641, %515 : tensor<920x256xf32>
-    %643 = stablehlo.broadcast_in_dim %642, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %644 = stablehlo.broadcast_in_dim %arg245, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %645 = stablehlo.add %643, %644 : tensor<920x256xf32>
-    %646 = stablehlo.convert %645 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %647 = stablehlo.reshape %646 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %648 = stablehlo.reshape %628 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %649 = stablehlo.convert %648 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %650 = stablehlo.dot_general %649, %arg246, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %651 = stablehlo.broadcast_in_dim %650, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %652 = stablehlo.multiply %651, %515 : tensor<920x256xf32>
-    %653 = stablehlo.broadcast_in_dim %652, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %654 = stablehlo.broadcast_in_dim %arg247, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %655 = stablehlo.add %653, %654 : tensor<920x256xf32>
-    %656 = stablehlo.convert %655 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %657 = stablehlo.reshape %656 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %658 = stablehlo.reshape %639 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %659 = stablehlo.transpose %658, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %660 = stablehlo.reshape %647 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %661 = stablehlo.transpose %660, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %662 = stablehlo.reshape %657 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %663 = stablehlo.transpose %662, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %664 = stablehlo.broadcast_in_dim %659, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %665 = stablehlo.multiply %664, %474 : tensor<8x920x32xbf16>
-    %666 = stablehlo.transpose %661, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %667 = stablehlo.broadcast_in_dim %666, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %668 = stablehlo.dot_general %665, %667, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x920x920xbf16>
-    %669 = stablehlo.broadcast_in_dim %668, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %670 = stablehlo.multiply %669, %485 : tensor<8x920x920xbf16>
-    %671 = stablehlo.broadcast_in_dim %670, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %672 = stablehlo.broadcast_in_dim %arg248, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x920x920xbf16>
-    %673 = stablehlo.add %671, %672 : tensor<8x920x920xbf16>
-    %674 = stablehlo.convert %673 : (tensor<8x920x920xbf16>) -> tensor<8x920x920xf32>
-    %675 = stablehlo.reduce(%674 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %676 = stablehlo.reshape %675 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %677 = stablehlo.broadcast_in_dim %674, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %678 = stablehlo.broadcast_in_dim %676, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %679 = stablehlo.subtract %677, %678 : tensor<8x920x920xf32>
-    %680 = stablehlo.exponential %679 : tensor<8x920x920xf32>
-    %681 = stablehlo.reduce(%680 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %682 = stablehlo.reshape %681 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %683 = stablehlo.broadcast_in_dim %680, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %684 = stablehlo.broadcast_in_dim %682, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %685 = stablehlo.divide %683, %684 : tensor<8x920x920xf32>
-    %686 = stablehlo.convert %685 : (tensor<8x920x920xf32>) -> tensor<8x920x920xbf16>
-    %687 = stablehlo.broadcast_in_dim %663, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %688 = stablehlo.dot_general %686, %687, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %689 = stablehlo.transpose %688, dims = [1, 0, 2] : (tensor<8x920x32xbf16>) -> tensor<920x8x32xbf16>
-    %690 = stablehlo.reshape %689 : (tensor<920x8x32xbf16>) -> tensor<920x256xbf16>
-    %691 = stablehlo.convert %690 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %692 = stablehlo.dot_general %691, %arg249, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %693 = stablehlo.broadcast_in_dim %692, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %694 = stablehlo.multiply %693, %515 : tensor<920x256xf32>
-    %695 = stablehlo.broadcast_in_dim %694, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %696 = stablehlo.broadcast_in_dim %arg250, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %697 = stablehlo.add %695, %696 : tensor<920x256xf32>
-    %698 = stablehlo.convert %697 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %699 = stablehlo.reshape %698 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %700 = stablehlo.add %628, %699 : tensor<920x1x256xbf16>
-    %701 = stablehlo.convert %700 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %702 = stablehlo.convert %701 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %703 = stablehlo.reduce(%702 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %704 = stablehlo.reshape %703 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %705 = stablehlo.broadcast_in_dim %704, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %706 = stablehlo.divide %705, %530 : tensor<920x1x1xf64>
-    %707 = stablehlo.broadcast_in_dim %702, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %708 = stablehlo.broadcast_in_dim %706, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %709 = stablehlo.subtract %707, %708 : tensor<920x1x256xf64>
-    %710 = stablehlo.multiply %709, %709 : tensor<920x1x256xf64>
-    %711 = stablehlo.reduce(%710 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %712 = stablehlo.reshape %711 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %713 = stablehlo.broadcast_in_dim %712, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %714 = stablehlo.divide %713, %530 : tensor<920x1x1xf64>
-    %715 = stablehlo.convert %714 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %716 = stablehlo.reduce(%701 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %717 = stablehlo.reshape %716 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %718 = stablehlo.broadcast_in_dim %717, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %719 = stablehlo.divide %718, %546 : tensor<920x1x1xf32>
-    %720 = stablehlo.broadcast_in_dim %715, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %721 = stablehlo.add %720, %551 : tensor<920x1x1xf32>
-    %722 = stablehlo.rsqrt %721 : tensor<920x1x1xf32>
-    %723 = stablehlo.broadcast_in_dim %701, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %724 = stablehlo.broadcast_in_dim %719, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %725 = stablehlo.subtract %723, %724 : tensor<920x1x256xf32>
-    %726 = stablehlo.broadcast_in_dim %725, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %727 = stablehlo.broadcast_in_dim %722, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %728 = stablehlo.multiply %726, %727 : tensor<920x1x256xf32>
-    %729 = stablehlo.convert %arg60 : (tensor<256xbf16>) -> tensor<256xf32>
-    %730 = stablehlo.broadcast_in_dim %728, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %731 = stablehlo.broadcast_in_dim %729, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %732 = stablehlo.multiply %730, %731 : tensor<920x1x256xf32>
-    %733 = stablehlo.convert %arg61 : (tensor<256xbf16>) -> tensor<256xf32>
-    %734 = stablehlo.broadcast_in_dim %732, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %735 = stablehlo.broadcast_in_dim %733, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %736 = stablehlo.add %734, %735 : tensor<920x1x256xf32>
-    %737 = stablehlo.convert %736 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %738 = stablehlo.reshape %737 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %739 = stablehlo.convert %738 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %740 = stablehlo.dot_general %739, %arg251, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x2048xf32>) -> tensor<920x2048xf32>
-    %741 = stablehlo.broadcast_in_dim %740, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %742 = stablehlo.multiply %741, %573 : tensor<920x2048xf32>
-    %743 = stablehlo.broadcast_in_dim %742, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %744 = stablehlo.broadcast_in_dim %arg252, dims = [1] : (tensor<2048xf32>) -> tensor<920x2048xf32>
-    %745 = stablehlo.add %743, %744 : tensor<920x2048xf32>
-    %746 = stablehlo.convert %745 : (tensor<920x2048xf32>) -> tensor<920x2048xbf16>
-    %747 = stablehlo.reshape %746 : (tensor<920x2048xbf16>) -> tensor<920x1x2048xbf16>
-    %748 = stablehlo.maximum %747, %cst_15 : tensor<920x1x2048xbf16>
-    %749 = stablehlo.reshape %748 : (tensor<920x1x2048xbf16>) -> tensor<920x2048xbf16>
-    %750 = stablehlo.convert %749 : (tensor<920x2048xbf16>) -> tensor<920x2048xf32>
-    %751 = stablehlo.dot_general %750, %arg253, contracting_dims = [1] x [0] : (tensor<920x2048xf32>, tensor<2048x256xf32>) -> tensor<920x256xf32>
-    %752 = stablehlo.broadcast_in_dim %751, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %753 = stablehlo.multiply %752, %515 : tensor<920x256xf32>
-    %754 = stablehlo.broadcast_in_dim %753, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %755 = stablehlo.broadcast_in_dim %arg254, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %756 = stablehlo.add %754, %755 : tensor<920x256xf32>
-    %757 = stablehlo.convert %756 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %758 = stablehlo.reshape %757 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %759 = stablehlo.add %737, %758 : tensor<920x1x256xbf16>
-    %760 = stablehlo.convert %759 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %761 = stablehlo.convert %760 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %762 = stablehlo.reduce(%761 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %763 = stablehlo.reshape %762 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %764 = stablehlo.broadcast_in_dim %763, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %765 = stablehlo.divide %764, %530 : tensor<920x1x1xf64>
-    %766 = stablehlo.broadcast_in_dim %761, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %767 = stablehlo.broadcast_in_dim %765, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %768 = stablehlo.subtract %766, %767 : tensor<920x1x256xf64>
-    %769 = stablehlo.multiply %768, %768 : tensor<920x1x256xf64>
-    %770 = stablehlo.reduce(%769 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %771 = stablehlo.reshape %770 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %772 = stablehlo.broadcast_in_dim %771, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %773 = stablehlo.divide %772, %530 : tensor<920x1x1xf64>
-    %774 = stablehlo.convert %773 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %775 = stablehlo.reduce(%760 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %776 = stablehlo.reshape %775 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %777 = stablehlo.broadcast_in_dim %776, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %778 = stablehlo.divide %777, %546 : tensor<920x1x1xf32>
-    %779 = stablehlo.broadcast_in_dim %774, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %780 = stablehlo.add %779, %551 : tensor<920x1x1xf32>
-    %781 = stablehlo.rsqrt %780 : tensor<920x1x1xf32>
-    %782 = stablehlo.broadcast_in_dim %760, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %783 = stablehlo.broadcast_in_dim %778, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %784 = stablehlo.subtract %782, %783 : tensor<920x1x256xf32>
-    %785 = stablehlo.broadcast_in_dim %784, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %786 = stablehlo.broadcast_in_dim %781, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %787 = stablehlo.multiply %785, %786 : tensor<920x1x256xf32>
-    %788 = stablehlo.convert %arg62 : (tensor<256xbf16>) -> tensor<256xf32>
-    %789 = stablehlo.broadcast_in_dim %787, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %790 = stablehlo.broadcast_in_dim %788, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %791 = stablehlo.multiply %789, %790 : tensor<920x1x256xf32>
-    %792 = stablehlo.convert %arg63 : (tensor<256xbf16>) -> tensor<256xf32>
-    %793 = stablehlo.broadcast_in_dim %791, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %794 = stablehlo.broadcast_in_dim %792, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %795 = stablehlo.add %793, %794 : tensor<920x1x256xf32>
-    %796 = stablehlo.convert %795 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %797 = stablehlo.add %796, %arg228 : tensor<920x1x256xbf16>
-    %798 = stablehlo.reshape %797 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %799 = stablehlo.convert %798 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %800 = stablehlo.dot_general %799, %arg255, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %801 = stablehlo.broadcast_in_dim %800, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %802 = stablehlo.multiply %801, %515 : tensor<920x256xf32>
-    %803 = stablehlo.broadcast_in_dim %802, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %804 = stablehlo.broadcast_in_dim %arg256, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %805 = stablehlo.add %803, %804 : tensor<920x256xf32>
-    %806 = stablehlo.convert %805 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %807 = stablehlo.reshape %806 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %808 = stablehlo.dot_general %799, %arg257, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %809 = stablehlo.broadcast_in_dim %808, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %810 = stablehlo.multiply %809, %515 : tensor<920x256xf32>
-    %811 = stablehlo.broadcast_in_dim %810, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %812 = stablehlo.broadcast_in_dim %arg258, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %813 = stablehlo.add %811, %812 : tensor<920x256xf32>
-    %814 = stablehlo.convert %813 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %815 = stablehlo.reshape %814 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %816 = stablehlo.reshape %796 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %817 = stablehlo.convert %816 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %818 = stablehlo.dot_general %817, %arg259, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %819 = stablehlo.broadcast_in_dim %818, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %820 = stablehlo.multiply %819, %515 : tensor<920x256xf32>
-    %821 = stablehlo.broadcast_in_dim %820, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %822 = stablehlo.broadcast_in_dim %arg260, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %823 = stablehlo.add %821, %822 : tensor<920x256xf32>
-    %824 = stablehlo.convert %823 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %825 = stablehlo.reshape %824 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %826 = stablehlo.reshape %807 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %827 = stablehlo.transpose %826, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %828 = stablehlo.reshape %815 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %829 = stablehlo.transpose %828, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %830 = stablehlo.reshape %825 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %831 = stablehlo.transpose %830, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %832 = stablehlo.broadcast_in_dim %827, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %833 = stablehlo.multiply %832, %474 : tensor<8x920x32xbf16>
-    %834 = stablehlo.transpose %829, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %835 = stablehlo.broadcast_in_dim %834, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %836 = stablehlo.dot_general %833, %835, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x920x920xbf16>
-    %837 = stablehlo.broadcast_in_dim %836, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %838 = stablehlo.multiply %837, %485 : tensor<8x920x920xbf16>
-    %839 = stablehlo.broadcast_in_dim %838, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %840 = stablehlo.broadcast_in_dim %arg261, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x920x920xbf16>
-    %841 = stablehlo.add %839, %840 : tensor<8x920x920xbf16>
-    %842 = stablehlo.convert %841 : (tensor<8x920x920xbf16>) -> tensor<8x920x920xf32>
-    %843 = stablehlo.reduce(%842 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %844 = stablehlo.reshape %843 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %845 = stablehlo.broadcast_in_dim %842, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %846 = stablehlo.broadcast_in_dim %844, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %847 = stablehlo.subtract %845, %846 : tensor<8x920x920xf32>
-    %848 = stablehlo.exponential %847 : tensor<8x920x920xf32>
-    %849 = stablehlo.reduce(%848 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %850 = stablehlo.reshape %849 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %851 = stablehlo.broadcast_in_dim %848, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %852 = stablehlo.broadcast_in_dim %850, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %853 = stablehlo.divide %851, %852 : tensor<8x920x920xf32>
-    %854 = stablehlo.convert %853 : (tensor<8x920x920xf32>) -> tensor<8x920x920xbf16>
-    %855 = stablehlo.broadcast_in_dim %831, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %856 = stablehlo.dot_general %854, %855, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %857 = stablehlo.transpose %856, dims = [1, 0, 2] : (tensor<8x920x32xbf16>) -> tensor<920x8x32xbf16>
-    %858 = stablehlo.reshape %857 : (tensor<920x8x32xbf16>) -> tensor<920x256xbf16>
-    %859 = stablehlo.convert %858 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %860 = stablehlo.dot_general %859, %arg262, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %861 = stablehlo.broadcast_in_dim %860, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %862 = stablehlo.multiply %861, %515 : tensor<920x256xf32>
-    %863 = stablehlo.broadcast_in_dim %862, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %864 = stablehlo.broadcast_in_dim %arg263, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %865 = stablehlo.add %863, %864 : tensor<920x256xf32>
-    %866 = stablehlo.convert %865 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %867 = stablehlo.reshape %866 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %868 = stablehlo.add %796, %867 : tensor<920x1x256xbf16>
-    %869 = stablehlo.convert %868 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %870 = stablehlo.convert %869 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %871 = stablehlo.reduce(%870 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %872 = stablehlo.reshape %871 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %873 = stablehlo.broadcast_in_dim %872, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %874 = stablehlo.divide %873, %530 : tensor<920x1x1xf64>
-    %875 = stablehlo.broadcast_in_dim %870, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %876 = stablehlo.broadcast_in_dim %874, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %877 = stablehlo.subtract %875, %876 : tensor<920x1x256xf64>
-    %878 = stablehlo.multiply %877, %877 : tensor<920x1x256xf64>
-    %879 = stablehlo.reduce(%878 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %880 = stablehlo.reshape %879 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %881 = stablehlo.broadcast_in_dim %880, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %882 = stablehlo.divide %881, %530 : tensor<920x1x1xf64>
-    %883 = stablehlo.convert %882 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %884 = stablehlo.reduce(%869 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %885 = stablehlo.reshape %884 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %886 = stablehlo.broadcast_in_dim %885, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %887 = stablehlo.divide %886, %546 : tensor<920x1x1xf32>
-    %888 = stablehlo.broadcast_in_dim %883, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %889 = stablehlo.add %888, %551 : tensor<920x1x1xf32>
-    %890 = stablehlo.rsqrt %889 : tensor<920x1x1xf32>
-    %891 = stablehlo.broadcast_in_dim %869, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %892 = stablehlo.broadcast_in_dim %887, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %893 = stablehlo.subtract %891, %892 : tensor<920x1x256xf32>
-    %894 = stablehlo.broadcast_in_dim %893, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %895 = stablehlo.broadcast_in_dim %890, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %896 = stablehlo.multiply %894, %895 : tensor<920x1x256xf32>
-    %897 = stablehlo.convert %arg64 : (tensor<256xbf16>) -> tensor<256xf32>
-    %898 = stablehlo.broadcast_in_dim %896, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %899 = stablehlo.broadcast_in_dim %897, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %900 = stablehlo.multiply %898, %899 : tensor<920x1x256xf32>
-    %901 = stablehlo.convert %arg65 : (tensor<256xbf16>) -> tensor<256xf32>
-    %902 = stablehlo.broadcast_in_dim %900, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %903 = stablehlo.broadcast_in_dim %901, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %904 = stablehlo.add %902, %903 : tensor<920x1x256xf32>
-    %905 = stablehlo.convert %904 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %906 = stablehlo.reshape %905 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %907 = stablehlo.convert %906 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %908 = stablehlo.dot_general %907, %arg264, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x2048xf32>) -> tensor<920x2048xf32>
-    %909 = stablehlo.broadcast_in_dim %908, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %910 = stablehlo.multiply %909, %573 : tensor<920x2048xf32>
-    %911 = stablehlo.broadcast_in_dim %910, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %912 = stablehlo.broadcast_in_dim %arg265, dims = [1] : (tensor<2048xf32>) -> tensor<920x2048xf32>
-    %913 = stablehlo.add %911, %912 : tensor<920x2048xf32>
-    %914 = stablehlo.convert %913 : (tensor<920x2048xf32>) -> tensor<920x2048xbf16>
-    %915 = stablehlo.reshape %914 : (tensor<920x2048xbf16>) -> tensor<920x1x2048xbf16>
-    %916 = stablehlo.maximum %915, %cst_15 : tensor<920x1x2048xbf16>
-    %917 = stablehlo.reshape %916 : (tensor<920x1x2048xbf16>) -> tensor<920x2048xbf16>
-    %918 = stablehlo.convert %917 : (tensor<920x2048xbf16>) -> tensor<920x2048xf32>
-    %919 = stablehlo.dot_general %918, %arg266, contracting_dims = [1] x [0] : (tensor<920x2048xf32>, tensor<2048x256xf32>) -> tensor<920x256xf32>
-    %920 = stablehlo.broadcast_in_dim %919, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %921 = stablehlo.multiply %920, %515 : tensor<920x256xf32>
-    %922 = stablehlo.broadcast_in_dim %921, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %923 = stablehlo.broadcast_in_dim %arg267, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %924 = stablehlo.add %922, %923 : tensor<920x256xf32>
-    %925 = stablehlo.convert %924 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %926 = stablehlo.reshape %925 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %927 = stablehlo.add %905, %926 : tensor<920x1x256xbf16>
-    %928 = stablehlo.convert %927 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %929 = stablehlo.convert %928 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %930 = stablehlo.reduce(%929 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %931 = stablehlo.reshape %930 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %932 = stablehlo.broadcast_in_dim %931, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %933 = stablehlo.divide %932, %530 : tensor<920x1x1xf64>
-    %934 = stablehlo.broadcast_in_dim %929, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %935 = stablehlo.broadcast_in_dim %933, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %936 = stablehlo.subtract %934, %935 : tensor<920x1x256xf64>
-    %937 = stablehlo.multiply %936, %936 : tensor<920x1x256xf64>
-    %938 = stablehlo.reduce(%937 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %939 = stablehlo.reshape %938 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %940 = stablehlo.broadcast_in_dim %939, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %941 = stablehlo.divide %940, %530 : tensor<920x1x1xf64>
-    %942 = stablehlo.convert %941 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %943 = stablehlo.reduce(%928 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %944 = stablehlo.reshape %943 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %945 = stablehlo.broadcast_in_dim %944, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %946 = stablehlo.divide %945, %546 : tensor<920x1x1xf32>
-    %947 = stablehlo.broadcast_in_dim %942, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %948 = stablehlo.add %947, %551 : tensor<920x1x1xf32>
-    %949 = stablehlo.rsqrt %948 : tensor<920x1x1xf32>
-    %950 = stablehlo.broadcast_in_dim %928, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %951 = stablehlo.broadcast_in_dim %946, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %952 = stablehlo.subtract %950, %951 : tensor<920x1x256xf32>
-    %953 = stablehlo.broadcast_in_dim %952, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %954 = stablehlo.broadcast_in_dim %949, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %955 = stablehlo.multiply %953, %954 : tensor<920x1x256xf32>
-    %956 = stablehlo.convert %arg66 : (tensor<256xbf16>) -> tensor<256xf32>
-    %957 = stablehlo.broadcast_in_dim %955, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %958 = stablehlo.broadcast_in_dim %956, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %959 = stablehlo.multiply %957, %958 : tensor<920x1x256xf32>
-    %960 = stablehlo.convert %arg67 : (tensor<256xbf16>) -> tensor<256xf32>
-    %961 = stablehlo.broadcast_in_dim %959, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %962 = stablehlo.broadcast_in_dim %960, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %963 = stablehlo.add %961, %962 : tensor<920x1x256xf32>
-    %964 = stablehlo.convert %963 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %965 = stablehlo.add %964, %arg228 : tensor<920x1x256xbf16>
-    %966 = stablehlo.reshape %965 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %967 = stablehlo.convert %966 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %968 = stablehlo.dot_general %967, %arg268, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %969 = stablehlo.broadcast_in_dim %968, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %970 = stablehlo.multiply %969, %515 : tensor<920x256xf32>
-    %971 = stablehlo.broadcast_in_dim %970, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %972 = stablehlo.broadcast_in_dim %arg269, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %973 = stablehlo.add %971, %972 : tensor<920x256xf32>
-    %974 = stablehlo.convert %973 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %975 = stablehlo.reshape %974 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %976 = stablehlo.dot_general %967, %arg270, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %977 = stablehlo.broadcast_in_dim %976, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %978 = stablehlo.multiply %977, %515 : tensor<920x256xf32>
-    %979 = stablehlo.broadcast_in_dim %978, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %980 = stablehlo.broadcast_in_dim %arg271, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %981 = stablehlo.add %979, %980 : tensor<920x256xf32>
-    %982 = stablehlo.convert %981 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %983 = stablehlo.reshape %982 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %984 = stablehlo.reshape %964 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %985 = stablehlo.convert %984 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %986 = stablehlo.dot_general %985, %arg272, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %987 = stablehlo.broadcast_in_dim %986, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %988 = stablehlo.multiply %987, %515 : tensor<920x256xf32>
-    %989 = stablehlo.broadcast_in_dim %988, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %990 = stablehlo.broadcast_in_dim %arg273, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %991 = stablehlo.add %989, %990 : tensor<920x256xf32>
-    %992 = stablehlo.convert %991 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %993 = stablehlo.reshape %992 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %994 = stablehlo.reshape %975 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %995 = stablehlo.transpose %994, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %996 = stablehlo.reshape %983 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %997 = stablehlo.transpose %996, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %998 = stablehlo.reshape %993 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %999 = stablehlo.transpose %998, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1000 = stablehlo.broadcast_in_dim %995, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1001 = stablehlo.multiply %1000, %474 : tensor<8x920x32xbf16>
-    %1002 = stablehlo.transpose %997, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %1003 = stablehlo.broadcast_in_dim %1002, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %1004 = stablehlo.dot_general %1001, %1003, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x920x920xbf16>
-    %1005 = stablehlo.broadcast_in_dim %1004, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %1006 = stablehlo.multiply %1005, %485 : tensor<8x920x920xbf16>
-    %1007 = stablehlo.broadcast_in_dim %1006, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %1008 = stablehlo.broadcast_in_dim %arg274, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x920x920xbf16>
-    %1009 = stablehlo.add %1007, %1008 : tensor<8x920x920xbf16>
-    %1010 = stablehlo.convert %1009 : (tensor<8x920x920xbf16>) -> tensor<8x920x920xf32>
-    %1011 = stablehlo.reduce(%1010 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %1012 = stablehlo.reshape %1011 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %1013 = stablehlo.broadcast_in_dim %1010, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %1014 = stablehlo.broadcast_in_dim %1012, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %1015 = stablehlo.subtract %1013, %1014 : tensor<8x920x920xf32>
-    %1016 = stablehlo.exponential %1015 : tensor<8x920x920xf32>
-    %1017 = stablehlo.reduce(%1016 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %1018 = stablehlo.reshape %1017 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %1019 = stablehlo.broadcast_in_dim %1016, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %1020 = stablehlo.broadcast_in_dim %1018, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %1021 = stablehlo.divide %1019, %1020 : tensor<8x920x920xf32>
-    %1022 = stablehlo.convert %1021 : (tensor<8x920x920xf32>) -> tensor<8x920x920xbf16>
-    %1023 = stablehlo.broadcast_in_dim %999, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1024 = stablehlo.dot_general %1022, %1023, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1025 = stablehlo.transpose %1024, dims = [1, 0, 2] : (tensor<8x920x32xbf16>) -> tensor<920x8x32xbf16>
-    %1026 = stablehlo.reshape %1025 : (tensor<920x8x32xbf16>) -> tensor<920x256xbf16>
-    %1027 = stablehlo.convert %1026 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1028 = stablehlo.dot_general %1027, %arg275, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1029 = stablehlo.broadcast_in_dim %1028, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1030 = stablehlo.multiply %1029, %515 : tensor<920x256xf32>
-    %1031 = stablehlo.broadcast_in_dim %1030, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1032 = stablehlo.broadcast_in_dim %arg276, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1033 = stablehlo.add %1031, %1032 : tensor<920x256xf32>
-    %1034 = stablehlo.convert %1033 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1035 = stablehlo.reshape %1034 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1036 = stablehlo.add %964, %1035 : tensor<920x1x256xbf16>
-    %1037 = stablehlo.convert %1036 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %1038 = stablehlo.convert %1037 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %1039 = stablehlo.reduce(%1038 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1040 = stablehlo.reshape %1039 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1041 = stablehlo.broadcast_in_dim %1040, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1042 = stablehlo.divide %1041, %530 : tensor<920x1x1xf64>
-    %1043 = stablehlo.broadcast_in_dim %1038, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %1044 = stablehlo.broadcast_in_dim %1042, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %1045 = stablehlo.subtract %1043, %1044 : tensor<920x1x256xf64>
-    %1046 = stablehlo.multiply %1045, %1045 : tensor<920x1x256xf64>
-    %1047 = stablehlo.reduce(%1046 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1048 = stablehlo.reshape %1047 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1049 = stablehlo.broadcast_in_dim %1048, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1050 = stablehlo.divide %1049, %530 : tensor<920x1x1xf64>
-    %1051 = stablehlo.convert %1050 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %1052 = stablehlo.reduce(%1037 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %1053 = stablehlo.reshape %1052 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %1054 = stablehlo.broadcast_in_dim %1053, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1055 = stablehlo.divide %1054, %546 : tensor<920x1x1xf32>
-    %1056 = stablehlo.broadcast_in_dim %1051, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1057 = stablehlo.add %1056, %551 : tensor<920x1x1xf32>
-    %1058 = stablehlo.rsqrt %1057 : tensor<920x1x1xf32>
-    %1059 = stablehlo.broadcast_in_dim %1037, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1060 = stablehlo.broadcast_in_dim %1055, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1061 = stablehlo.subtract %1059, %1060 : tensor<920x1x256xf32>
-    %1062 = stablehlo.broadcast_in_dim %1061, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1063 = stablehlo.broadcast_in_dim %1058, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1064 = stablehlo.multiply %1062, %1063 : tensor<920x1x256xf32>
-    %1065 = stablehlo.convert %arg68 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1066 = stablehlo.broadcast_in_dim %1064, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1067 = stablehlo.broadcast_in_dim %1065, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1068 = stablehlo.multiply %1066, %1067 : tensor<920x1x256xf32>
-    %1069 = stablehlo.convert %arg69 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1070 = stablehlo.broadcast_in_dim %1068, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1071 = stablehlo.broadcast_in_dim %1069, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1072 = stablehlo.add %1070, %1071 : tensor<920x1x256xf32>
-    %1073 = stablehlo.convert %1072 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %1074 = stablehlo.reshape %1073 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1075 = stablehlo.convert %1074 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1076 = stablehlo.dot_general %1075, %arg277, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x2048xf32>) -> tensor<920x2048xf32>
-    %1077 = stablehlo.broadcast_in_dim %1076, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %1078 = stablehlo.multiply %1077, %573 : tensor<920x2048xf32>
-    %1079 = stablehlo.broadcast_in_dim %1078, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %1080 = stablehlo.broadcast_in_dim %arg278, dims = [1] : (tensor<2048xf32>) -> tensor<920x2048xf32>
-    %1081 = stablehlo.add %1079, %1080 : tensor<920x2048xf32>
-    %1082 = stablehlo.convert %1081 : (tensor<920x2048xf32>) -> tensor<920x2048xbf16>
-    %1083 = stablehlo.reshape %1082 : (tensor<920x2048xbf16>) -> tensor<920x1x2048xbf16>
-    %1084 = stablehlo.maximum %1083, %cst_15 : tensor<920x1x2048xbf16>
-    %1085 = stablehlo.reshape %1084 : (tensor<920x1x2048xbf16>) -> tensor<920x2048xbf16>
-    %1086 = stablehlo.convert %1085 : (tensor<920x2048xbf16>) -> tensor<920x2048xf32>
-    %1087 = stablehlo.dot_general %1086, %arg279, contracting_dims = [1] x [0] : (tensor<920x2048xf32>, tensor<2048x256xf32>) -> tensor<920x256xf32>
-    %1088 = stablehlo.broadcast_in_dim %1087, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1089 = stablehlo.multiply %1088, %515 : tensor<920x256xf32>
-    %1090 = stablehlo.broadcast_in_dim %1089, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1091 = stablehlo.broadcast_in_dim %arg280, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1092 = stablehlo.add %1090, %1091 : tensor<920x256xf32>
-    %1093 = stablehlo.convert %1092 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1094 = stablehlo.reshape %1093 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1095 = stablehlo.add %1073, %1094 : tensor<920x1x256xbf16>
-    %1096 = stablehlo.convert %1095 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %1097 = stablehlo.convert %1096 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %1098 = stablehlo.reduce(%1097 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1099 = stablehlo.reshape %1098 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1100 = stablehlo.broadcast_in_dim %1099, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1101 = stablehlo.divide %1100, %530 : tensor<920x1x1xf64>
-    %1102 = stablehlo.broadcast_in_dim %1097, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %1103 = stablehlo.broadcast_in_dim %1101, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %1104 = stablehlo.subtract %1102, %1103 : tensor<920x1x256xf64>
-    %1105 = stablehlo.multiply %1104, %1104 : tensor<920x1x256xf64>
-    %1106 = stablehlo.reduce(%1105 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1107 = stablehlo.reshape %1106 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1108 = stablehlo.broadcast_in_dim %1107, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1109 = stablehlo.divide %1108, %530 : tensor<920x1x1xf64>
-    %1110 = stablehlo.convert %1109 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %1111 = stablehlo.reduce(%1096 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %1112 = stablehlo.reshape %1111 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %1113 = stablehlo.broadcast_in_dim %1112, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1114 = stablehlo.divide %1113, %546 : tensor<920x1x1xf32>
-    %1115 = stablehlo.broadcast_in_dim %1110, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1116 = stablehlo.add %1115, %551 : tensor<920x1x1xf32>
-    %1117 = stablehlo.rsqrt %1116 : tensor<920x1x1xf32>
-    %1118 = stablehlo.broadcast_in_dim %1096, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1119 = stablehlo.broadcast_in_dim %1114, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1120 = stablehlo.subtract %1118, %1119 : tensor<920x1x256xf32>
-    %1121 = stablehlo.broadcast_in_dim %1120, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1122 = stablehlo.broadcast_in_dim %1117, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1123 = stablehlo.multiply %1121, %1122 : tensor<920x1x256xf32>
-    %1124 = stablehlo.convert %arg70 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1125 = stablehlo.broadcast_in_dim %1123, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1126 = stablehlo.broadcast_in_dim %1124, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1127 = stablehlo.multiply %1125, %1126 : tensor<920x1x256xf32>
-    %1128 = stablehlo.convert %arg71 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1129 = stablehlo.broadcast_in_dim %1127, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1130 = stablehlo.broadcast_in_dim %1128, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1131 = stablehlo.add %1129, %1130 : tensor<920x1x256xf32>
-    %1132 = stablehlo.convert %1131 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %1133 = stablehlo.add %1132, %arg228 : tensor<920x1x256xbf16>
-    %1134 = stablehlo.reshape %1133 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1135 = stablehlo.convert %1134 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1136 = stablehlo.dot_general %1135, %arg281, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1137 = stablehlo.broadcast_in_dim %1136, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1138 = stablehlo.multiply %1137, %515 : tensor<920x256xf32>
-    %1139 = stablehlo.broadcast_in_dim %1138, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1140 = stablehlo.broadcast_in_dim %arg282, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1141 = stablehlo.add %1139, %1140 : tensor<920x256xf32>
-    %1142 = stablehlo.convert %1141 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1143 = stablehlo.reshape %1142 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1144 = stablehlo.dot_general %1135, %arg283, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1145 = stablehlo.broadcast_in_dim %1144, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1146 = stablehlo.multiply %1145, %515 : tensor<920x256xf32>
-    %1147 = stablehlo.broadcast_in_dim %1146, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1148 = stablehlo.broadcast_in_dim %arg284, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1149 = stablehlo.add %1147, %1148 : tensor<920x256xf32>
-    %1150 = stablehlo.convert %1149 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1151 = stablehlo.reshape %1150 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1152 = stablehlo.reshape %1132 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1153 = stablehlo.convert %1152 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1154 = stablehlo.dot_general %1153, %arg285, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1155 = stablehlo.broadcast_in_dim %1154, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1156 = stablehlo.multiply %1155, %515 : tensor<920x256xf32>
-    %1157 = stablehlo.broadcast_in_dim %1156, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1158 = stablehlo.broadcast_in_dim %arg286, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1159 = stablehlo.add %1157, %1158 : tensor<920x256xf32>
-    %1160 = stablehlo.convert %1159 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1161 = stablehlo.reshape %1160 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1162 = stablehlo.reshape %1143 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1163 = stablehlo.transpose %1162, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1164 = stablehlo.reshape %1151 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1165 = stablehlo.transpose %1164, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1166 = stablehlo.reshape %1161 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1167 = stablehlo.transpose %1166, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1168 = stablehlo.broadcast_in_dim %1163, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1169 = stablehlo.multiply %1168, %474 : tensor<8x920x32xbf16>
-    %1170 = stablehlo.transpose %1165, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %1171 = stablehlo.broadcast_in_dim %1170, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %1172 = stablehlo.dot_general %1169, %1171, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x920x920xbf16>
-    %1173 = stablehlo.broadcast_in_dim %1172, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %1174 = stablehlo.multiply %1173, %485 : tensor<8x920x920xbf16>
-    %1175 = stablehlo.broadcast_in_dim %1174, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %1176 = stablehlo.broadcast_in_dim %arg287, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x920x920xbf16>
-    %1177 = stablehlo.add %1175, %1176 : tensor<8x920x920xbf16>
-    %1178 = stablehlo.convert %1177 : (tensor<8x920x920xbf16>) -> tensor<8x920x920xf32>
-    %1179 = stablehlo.reduce(%1178 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %1180 = stablehlo.reshape %1179 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %1181 = stablehlo.broadcast_in_dim %1178, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %1182 = stablehlo.broadcast_in_dim %1180, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %1183 = stablehlo.subtract %1181, %1182 : tensor<8x920x920xf32>
-    %1184 = stablehlo.exponential %1183 : tensor<8x920x920xf32>
-    %1185 = stablehlo.reduce(%1184 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %1186 = stablehlo.reshape %1185 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %1187 = stablehlo.broadcast_in_dim %1184, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %1188 = stablehlo.broadcast_in_dim %1186, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %1189 = stablehlo.divide %1187, %1188 : tensor<8x920x920xf32>
-    %1190 = stablehlo.convert %1189 : (tensor<8x920x920xf32>) -> tensor<8x920x920xbf16>
-    %1191 = stablehlo.broadcast_in_dim %1167, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1192 = stablehlo.dot_general %1190, %1191, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1193 = stablehlo.transpose %1192, dims = [1, 0, 2] : (tensor<8x920x32xbf16>) -> tensor<920x8x32xbf16>
-    %1194 = stablehlo.reshape %1193 : (tensor<920x8x32xbf16>) -> tensor<920x256xbf16>
-    %1195 = stablehlo.convert %1194 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1196 = stablehlo.dot_general %1195, %arg288, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1197 = stablehlo.broadcast_in_dim %1196, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1198 = stablehlo.multiply %1197, %515 : tensor<920x256xf32>
-    %1199 = stablehlo.broadcast_in_dim %1198, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1200 = stablehlo.broadcast_in_dim %arg289, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1201 = stablehlo.add %1199, %1200 : tensor<920x256xf32>
-    %1202 = stablehlo.convert %1201 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1203 = stablehlo.reshape %1202 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1204 = stablehlo.add %1132, %1203 : tensor<920x1x256xbf16>
-    %1205 = stablehlo.convert %1204 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %1206 = stablehlo.convert %1205 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %1207 = stablehlo.reduce(%1206 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1208 = stablehlo.reshape %1207 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1209 = stablehlo.broadcast_in_dim %1208, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1210 = stablehlo.divide %1209, %530 : tensor<920x1x1xf64>
-    %1211 = stablehlo.broadcast_in_dim %1206, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %1212 = stablehlo.broadcast_in_dim %1210, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %1213 = stablehlo.subtract %1211, %1212 : tensor<920x1x256xf64>
-    %1214 = stablehlo.multiply %1213, %1213 : tensor<920x1x256xf64>
-    %1215 = stablehlo.reduce(%1214 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1216 = stablehlo.reshape %1215 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1217 = stablehlo.broadcast_in_dim %1216, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1218 = stablehlo.divide %1217, %530 : tensor<920x1x1xf64>
-    %1219 = stablehlo.convert %1218 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %1220 = stablehlo.reduce(%1205 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %1221 = stablehlo.reshape %1220 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %1222 = stablehlo.broadcast_in_dim %1221, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1223 = stablehlo.divide %1222, %546 : tensor<920x1x1xf32>
-    %1224 = stablehlo.broadcast_in_dim %1219, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1225 = stablehlo.add %1224, %551 : tensor<920x1x1xf32>
-    %1226 = stablehlo.rsqrt %1225 : tensor<920x1x1xf32>
-    %1227 = stablehlo.broadcast_in_dim %1205, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1228 = stablehlo.broadcast_in_dim %1223, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1229 = stablehlo.subtract %1227, %1228 : tensor<920x1x256xf32>
-    %1230 = stablehlo.broadcast_in_dim %1229, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1231 = stablehlo.broadcast_in_dim %1226, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1232 = stablehlo.multiply %1230, %1231 : tensor<920x1x256xf32>
-    %1233 = stablehlo.convert %arg72 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1234 = stablehlo.broadcast_in_dim %1232, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1235 = stablehlo.broadcast_in_dim %1233, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1236 = stablehlo.multiply %1234, %1235 : tensor<920x1x256xf32>
-    %1237 = stablehlo.convert %arg73 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1238 = stablehlo.broadcast_in_dim %1236, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1239 = stablehlo.broadcast_in_dim %1237, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1240 = stablehlo.add %1238, %1239 : tensor<920x1x256xf32>
-    %1241 = stablehlo.convert %1240 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %1242 = stablehlo.reshape %1241 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1243 = stablehlo.convert %1242 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1244 = stablehlo.dot_general %1243, %arg290, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x2048xf32>) -> tensor<920x2048xf32>
-    %1245 = stablehlo.broadcast_in_dim %1244, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %1246 = stablehlo.multiply %1245, %573 : tensor<920x2048xf32>
-    %1247 = stablehlo.broadcast_in_dim %1246, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %1248 = stablehlo.broadcast_in_dim %arg291, dims = [1] : (tensor<2048xf32>) -> tensor<920x2048xf32>
-    %1249 = stablehlo.add %1247, %1248 : tensor<920x2048xf32>
-    %1250 = stablehlo.convert %1249 : (tensor<920x2048xf32>) -> tensor<920x2048xbf16>
-    %1251 = stablehlo.reshape %1250 : (tensor<920x2048xbf16>) -> tensor<920x1x2048xbf16>
-    %1252 = stablehlo.maximum %1251, %cst_15 : tensor<920x1x2048xbf16>
-    %1253 = stablehlo.reshape %1252 : (tensor<920x1x2048xbf16>) -> tensor<920x2048xbf16>
-    %1254 = stablehlo.convert %1253 : (tensor<920x2048xbf16>) -> tensor<920x2048xf32>
-    %1255 = stablehlo.dot_general %1254, %arg292, contracting_dims = [1] x [0] : (tensor<920x2048xf32>, tensor<2048x256xf32>) -> tensor<920x256xf32>
-    %1256 = stablehlo.broadcast_in_dim %1255, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1257 = stablehlo.multiply %1256, %515 : tensor<920x256xf32>
-    %1258 = stablehlo.broadcast_in_dim %1257, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1259 = stablehlo.broadcast_in_dim %arg293, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1260 = stablehlo.add %1258, %1259 : tensor<920x256xf32>
-    %1261 = stablehlo.convert %1260 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1262 = stablehlo.reshape %1261 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1263 = stablehlo.add %1241, %1262 : tensor<920x1x256xbf16>
-    %1264 = stablehlo.convert %1263 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %1265 = stablehlo.convert %1264 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %1266 = stablehlo.reduce(%1265 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1267 = stablehlo.reshape %1266 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1268 = stablehlo.broadcast_in_dim %1267, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1269 = stablehlo.divide %1268, %530 : tensor<920x1x1xf64>
-    %1270 = stablehlo.broadcast_in_dim %1265, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %1271 = stablehlo.broadcast_in_dim %1269, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %1272 = stablehlo.subtract %1270, %1271 : tensor<920x1x256xf64>
-    %1273 = stablehlo.multiply %1272, %1272 : tensor<920x1x256xf64>
-    %1274 = stablehlo.reduce(%1273 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1275 = stablehlo.reshape %1274 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1276 = stablehlo.broadcast_in_dim %1275, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1277 = stablehlo.divide %1276, %530 : tensor<920x1x1xf64>
-    %1278 = stablehlo.convert %1277 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %1279 = stablehlo.reduce(%1264 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %1280 = stablehlo.reshape %1279 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %1281 = stablehlo.broadcast_in_dim %1280, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1282 = stablehlo.divide %1281, %546 : tensor<920x1x1xf32>
-    %1283 = stablehlo.broadcast_in_dim %1278, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1284 = stablehlo.add %1283, %551 : tensor<920x1x1xf32>
-    %1285 = stablehlo.rsqrt %1284 : tensor<920x1x1xf32>
-    %1286 = stablehlo.broadcast_in_dim %1264, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1287 = stablehlo.broadcast_in_dim %1282, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1288 = stablehlo.subtract %1286, %1287 : tensor<920x1x256xf32>
-    %1289 = stablehlo.broadcast_in_dim %1288, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1290 = stablehlo.broadcast_in_dim %1285, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1291 = stablehlo.multiply %1289, %1290 : tensor<920x1x256xf32>
-    %1292 = stablehlo.convert %arg74 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1293 = stablehlo.broadcast_in_dim %1291, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1294 = stablehlo.broadcast_in_dim %1292, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1295 = stablehlo.multiply %1293, %1294 : tensor<920x1x256xf32>
-    %1296 = stablehlo.convert %arg75 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1297 = stablehlo.broadcast_in_dim %1295, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1298 = stablehlo.broadcast_in_dim %1296, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1299 = stablehlo.add %1297, %1298 : tensor<920x1x256xf32>
-    %1300 = stablehlo.convert %1299 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %1301 = stablehlo.add %1300, %arg228 : tensor<920x1x256xbf16>
-    %1302 = stablehlo.reshape %1301 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1303 = stablehlo.convert %1302 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1304 = stablehlo.dot_general %1303, %arg294, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1305 = stablehlo.broadcast_in_dim %1304, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1306 = stablehlo.multiply %1305, %515 : tensor<920x256xf32>
-    %1307 = stablehlo.broadcast_in_dim %1306, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1308 = stablehlo.broadcast_in_dim %arg295, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1309 = stablehlo.add %1307, %1308 : tensor<920x256xf32>
-    %1310 = stablehlo.convert %1309 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1311 = stablehlo.reshape %1310 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1312 = stablehlo.dot_general %1303, %arg296, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1313 = stablehlo.broadcast_in_dim %1312, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1314 = stablehlo.multiply %1313, %515 : tensor<920x256xf32>
-    %1315 = stablehlo.broadcast_in_dim %1314, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1316 = stablehlo.broadcast_in_dim %arg297, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1317 = stablehlo.add %1315, %1316 : tensor<920x256xf32>
-    %1318 = stablehlo.convert %1317 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1319 = stablehlo.reshape %1318 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1320 = stablehlo.reshape %1300 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1321 = stablehlo.convert %1320 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1322 = stablehlo.dot_general %1321, %arg298, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1323 = stablehlo.broadcast_in_dim %1322, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1324 = stablehlo.multiply %1323, %515 : tensor<920x256xf32>
-    %1325 = stablehlo.broadcast_in_dim %1324, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1326 = stablehlo.broadcast_in_dim %arg299, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1327 = stablehlo.add %1325, %1326 : tensor<920x256xf32>
-    %1328 = stablehlo.convert %1327 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1329 = stablehlo.reshape %1328 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1330 = stablehlo.reshape %1311 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1331 = stablehlo.transpose %1330, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1332 = stablehlo.reshape %1319 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1333 = stablehlo.transpose %1332, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1334 = stablehlo.reshape %1329 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1335 = stablehlo.transpose %1334, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1336 = stablehlo.broadcast_in_dim %1331, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1337 = stablehlo.multiply %1336, %474 : tensor<8x920x32xbf16>
-    %1338 = stablehlo.transpose %1333, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %1339 = stablehlo.broadcast_in_dim %1338, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %1340 = stablehlo.dot_general %1337, %1339, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x920x920xbf16>
-    %1341 = stablehlo.broadcast_in_dim %1340, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %1342 = stablehlo.multiply %1341, %485 : tensor<8x920x920xbf16>
-    %1343 = stablehlo.broadcast_in_dim %1342, dims = [0, 1, 2] : (tensor<8x920x920xbf16>) -> tensor<8x920x920xbf16>
-    %1344 = stablehlo.broadcast_in_dim %arg300, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x920x920xbf16>
-    %1345 = stablehlo.add %1343, %1344 : tensor<8x920x920xbf16>
-    %1346 = stablehlo.convert %1345 : (tensor<8x920x920xbf16>) -> tensor<8x920x920xf32>
-    %1347 = stablehlo.reduce(%1346 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %1348 = stablehlo.reshape %1347 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %1349 = stablehlo.broadcast_in_dim %1346, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %1350 = stablehlo.broadcast_in_dim %1348, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %1351 = stablehlo.subtract %1349, %1350 : tensor<8x920x920xf32>
-    %1352 = stablehlo.exponential %1351 : tensor<8x920x920xf32>
-    %1353 = stablehlo.reduce(%1352 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x920x920xf32>, tensor<f32>) -> tensor<8x920xf32>
-    %1354 = stablehlo.reshape %1353 : (tensor<8x920xf32>) -> tensor<8x920x1xf32>
-    %1355 = stablehlo.broadcast_in_dim %1352, dims = [0, 1, 2] : (tensor<8x920x920xf32>) -> tensor<8x920x920xf32>
-    %1356 = stablehlo.broadcast_in_dim %1354, dims = [0, 1, 2] : (tensor<8x920x1xf32>) -> tensor<8x920x920xf32>
-    %1357 = stablehlo.divide %1355, %1356 : tensor<8x920x920xf32>
-    %1358 = stablehlo.convert %1357 : (tensor<8x920x920xf32>) -> tensor<8x920x920xbf16>
-    %1359 = stablehlo.broadcast_in_dim %1335, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1360 = stablehlo.dot_general %1358, %1359, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x920x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1361 = stablehlo.transpose %1360, dims = [1, 0, 2] : (tensor<8x920x32xbf16>) -> tensor<920x8x32xbf16>
-    %1362 = stablehlo.reshape %1361 : (tensor<920x8x32xbf16>) -> tensor<920x256xbf16>
-    %1363 = stablehlo.convert %1362 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1364 = stablehlo.dot_general %1363, %arg301, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1365 = stablehlo.broadcast_in_dim %1364, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1366 = stablehlo.multiply %1365, %515 : tensor<920x256xf32>
-    %1367 = stablehlo.broadcast_in_dim %1366, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1368 = stablehlo.broadcast_in_dim %arg302, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1369 = stablehlo.add %1367, %1368 : tensor<920x256xf32>
-    %1370 = stablehlo.convert %1369 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1371 = stablehlo.reshape %1370 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1372 = stablehlo.add %1300, %1371 : tensor<920x1x256xbf16>
-    %1373 = stablehlo.convert %1372 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %1374 = stablehlo.convert %1373 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %1375 = stablehlo.reduce(%1374 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1376 = stablehlo.reshape %1375 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1377 = stablehlo.broadcast_in_dim %1376, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1378 = stablehlo.divide %1377, %530 : tensor<920x1x1xf64>
-    %1379 = stablehlo.broadcast_in_dim %1374, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %1380 = stablehlo.broadcast_in_dim %1378, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %1381 = stablehlo.subtract %1379, %1380 : tensor<920x1x256xf64>
-    %1382 = stablehlo.multiply %1381, %1381 : tensor<920x1x256xf64>
-    %1383 = stablehlo.reduce(%1382 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1384 = stablehlo.reshape %1383 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1385 = stablehlo.broadcast_in_dim %1384, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1386 = stablehlo.divide %1385, %530 : tensor<920x1x1xf64>
-    %1387 = stablehlo.convert %1386 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %1388 = stablehlo.reduce(%1373 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %1389 = stablehlo.reshape %1388 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %1390 = stablehlo.broadcast_in_dim %1389, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1391 = stablehlo.divide %1390, %546 : tensor<920x1x1xf32>
-    %1392 = stablehlo.broadcast_in_dim %1387, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1393 = stablehlo.add %1392, %551 : tensor<920x1x1xf32>
-    %1394 = stablehlo.rsqrt %1393 : tensor<920x1x1xf32>
-    %1395 = stablehlo.broadcast_in_dim %1373, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1396 = stablehlo.broadcast_in_dim %1391, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1397 = stablehlo.subtract %1395, %1396 : tensor<920x1x256xf32>
-    %1398 = stablehlo.broadcast_in_dim %1397, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1399 = stablehlo.broadcast_in_dim %1394, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1400 = stablehlo.multiply %1398, %1399 : tensor<920x1x256xf32>
-    %1401 = stablehlo.convert %arg76 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1402 = stablehlo.broadcast_in_dim %1400, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1403 = stablehlo.broadcast_in_dim %1401, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1404 = stablehlo.multiply %1402, %1403 : tensor<920x1x256xf32>
-    %1405 = stablehlo.convert %arg77 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1406 = stablehlo.broadcast_in_dim %1404, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1407 = stablehlo.broadcast_in_dim %1405, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1408 = stablehlo.add %1406, %1407 : tensor<920x1x256xf32>
-    %1409 = stablehlo.convert %1408 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %1410 = stablehlo.reshape %1409 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1411 = stablehlo.convert %1410 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1412 = stablehlo.dot_general %1411, %arg303, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x2048xf32>) -> tensor<920x2048xf32>
-    %1413 = stablehlo.broadcast_in_dim %1412, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %1414 = stablehlo.multiply %1413, %573 : tensor<920x2048xf32>
-    %1415 = stablehlo.broadcast_in_dim %1414, dims = [0, 1] : (tensor<920x2048xf32>) -> tensor<920x2048xf32>
-    %1416 = stablehlo.broadcast_in_dim %arg304, dims = [1] : (tensor<2048xf32>) -> tensor<920x2048xf32>
-    %1417 = stablehlo.add %1415, %1416 : tensor<920x2048xf32>
-    %1418 = stablehlo.convert %1417 : (tensor<920x2048xf32>) -> tensor<920x2048xbf16>
-    %1419 = stablehlo.reshape %1418 : (tensor<920x2048xbf16>) -> tensor<920x1x2048xbf16>
-    %1420 = stablehlo.maximum %1419, %cst_15 : tensor<920x1x2048xbf16>
-    %1421 = stablehlo.reshape %1420 : (tensor<920x1x2048xbf16>) -> tensor<920x2048xbf16>
-    %1422 = stablehlo.convert %1421 : (tensor<920x2048xbf16>) -> tensor<920x2048xf32>
-    %1423 = stablehlo.dot_general %1422, %arg305, contracting_dims = [1] x [0] : (tensor<920x2048xf32>, tensor<2048x256xf32>) -> tensor<920x256xf32>
-    %1424 = stablehlo.broadcast_in_dim %1423, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1425 = stablehlo.multiply %1424, %515 : tensor<920x256xf32>
-    %1426 = stablehlo.broadcast_in_dim %1425, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1427 = stablehlo.broadcast_in_dim %arg306, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1428 = stablehlo.add %1426, %1427 : tensor<920x256xf32>
-    %1429 = stablehlo.convert %1428 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1430 = stablehlo.reshape %1429 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1431 = stablehlo.add %1409, %1430 : tensor<920x1x256xbf16>
-    %1432 = stablehlo.convert %1431 : (tensor<920x1x256xbf16>) -> tensor<920x1x256xf32>
-    %1433 = stablehlo.convert %1432 : (tensor<920x1x256xf32>) -> tensor<920x1x256xf64>
-    %1434 = stablehlo.reduce(%1433 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1435 = stablehlo.reshape %1434 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1436 = stablehlo.broadcast_in_dim %1435, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1437 = stablehlo.divide %1436, %530 : tensor<920x1x1xf64>
-    %1438 = stablehlo.broadcast_in_dim %1433, dims = [0, 1, 2] : (tensor<920x1x256xf64>) -> tensor<920x1x256xf64>
-    %1439 = stablehlo.broadcast_in_dim %1437, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x256xf64>
-    %1440 = stablehlo.subtract %1438, %1439 : tensor<920x1x256xf64>
-    %1441 = stablehlo.multiply %1440, %1440 : tensor<920x1x256xf64>
-    %1442 = stablehlo.reduce(%1441 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf64>, tensor<f64>) -> tensor<920x1xf64>
-    %1443 = stablehlo.reshape %1442 : (tensor<920x1xf64>) -> tensor<920x1x1xf64>
-    %1444 = stablehlo.broadcast_in_dim %1443, dims = [0, 1, 2] : (tensor<920x1x1xf64>) -> tensor<920x1x1xf64>
-    %1445 = stablehlo.divide %1444, %530 : tensor<920x1x1xf64>
-    %1446 = stablehlo.convert %1445 : (tensor<920x1x1xf64>) -> tensor<920x1x1xf32>
-    %1447 = stablehlo.reduce(%1432 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<920x1x256xf32>, tensor<f32>) -> tensor<920x1xf32>
-    %1448 = stablehlo.reshape %1447 : (tensor<920x1xf32>) -> tensor<920x1x1xf32>
-    %1449 = stablehlo.broadcast_in_dim %1448, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1450 = stablehlo.divide %1449, %546 : tensor<920x1x1xf32>
-    %1451 = stablehlo.broadcast_in_dim %1446, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x1xf32>
-    %1452 = stablehlo.add %1451, %551 : tensor<920x1x1xf32>
-    %1453 = stablehlo.rsqrt %1452 : tensor<920x1x1xf32>
-    %1454 = stablehlo.broadcast_in_dim %1432, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1455 = stablehlo.broadcast_in_dim %1450, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1456 = stablehlo.subtract %1454, %1455 : tensor<920x1x256xf32>
-    %1457 = stablehlo.broadcast_in_dim %1456, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1458 = stablehlo.broadcast_in_dim %1453, dims = [0, 1, 2] : (tensor<920x1x1xf32>) -> tensor<920x1x256xf32>
-    %1459 = stablehlo.multiply %1457, %1458 : tensor<920x1x256xf32>
-    %1460 = stablehlo.convert %arg78 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1461 = stablehlo.broadcast_in_dim %1459, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1462 = stablehlo.broadcast_in_dim %1460, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1463 = stablehlo.multiply %1461, %1462 : tensor<920x1x256xf32>
-    %1464 = stablehlo.convert %arg79 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1465 = stablehlo.broadcast_in_dim %1463, dims = [0, 1, 2] : (tensor<920x1x256xf32>) -> tensor<920x1x256xf32>
-    %1466 = stablehlo.broadcast_in_dim %1464, dims = [2] : (tensor<256xf32>) -> tensor<920x1x256xf32>
-    %1467 = stablehlo.add %1465, %1466 : tensor<920x1x256xf32>
-    %1468 = stablehlo.convert %1467 : (tensor<920x1x256xf32>) -> tensor<920x1x256xbf16>
-    %1469 = stablehlo.add %1468, %arg228 : tensor<920x1x256xbf16>
-    %1470 = stablehlo.reshape %1469 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1471 = stablehlo.convert %1470 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1472 = stablehlo.dot_general %1471, %arg307, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1473 = stablehlo.broadcast_in_dim %1472, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1474 = stablehlo.multiply %1473, %515 : tensor<920x256xf32>
-    %1475 = stablehlo.broadcast_in_dim %1474, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1476 = stablehlo.broadcast_in_dim %arg308, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1477 = stablehlo.add %1475, %1476 : tensor<920x256xf32>
-    %1478 = stablehlo.convert %1477 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1479 = stablehlo.reshape %1478 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1480 = stablehlo.reshape %1468 : (tensor<920x1x256xbf16>) -> tensor<920x256xbf16>
-    %1481 = stablehlo.convert %1480 : (tensor<920x256xbf16>) -> tensor<920x256xf32>
-    %1482 = stablehlo.dot_general %1481, %arg309, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1483 = stablehlo.broadcast_in_dim %1482, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1484 = stablehlo.multiply %1483, %515 : tensor<920x256xf32>
-    %1485 = stablehlo.broadcast_in_dim %1484, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1486 = stablehlo.broadcast_in_dim %arg310, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1487 = stablehlo.add %1485, %1486 : tensor<920x256xf32>
-    %1488 = stablehlo.convert %1487 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1489 = stablehlo.reshape %1488 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1490 = stablehlo.reshape %1479 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1491 = stablehlo.transpose %1490, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1492 = stablehlo.reshape %1489 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1493 = stablehlo.transpose %1492, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1494 = stablehlo.transpose %1491, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %1495 = stablehlo.broadcast_in_dim %1494, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %1496 = stablehlo.dot_general %arg312, %1495, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16>
-    %1497 = stablehlo.broadcast_in_dim %1496, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %1498 = stablehlo.broadcast_in_dim %483, dims = [] : (tensor<bf16>) -> tensor<8x100x920xbf16>
-    %1499 = stablehlo.multiply %1497, %1498 : tensor<8x100x920xbf16>
-    %1500 = stablehlo.broadcast_in_dim %1499, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %1501 = stablehlo.broadcast_in_dim %arg311, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x100x920xbf16>
-    %1502 = stablehlo.add %1500, %1501 : tensor<8x100x920xbf16>
-    %1503 = stablehlo.convert %1502 : (tensor<8x100x920xbf16>) -> tensor<8x100x920xf32>
-    %1504 = stablehlo.reduce(%1503 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %1505 = stablehlo.reshape %1504 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %1506 = stablehlo.broadcast_in_dim %1503, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %1507 = stablehlo.broadcast_in_dim %1505, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %1508 = stablehlo.subtract %1506, %1507 : tensor<8x100x920xf32>
-    %1509 = stablehlo.exponential %1508 : tensor<8x100x920xf32>
-    %1510 = stablehlo.reduce(%1509 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %1511 = stablehlo.reshape %1510 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %1512 = stablehlo.broadcast_in_dim %1509, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %1513 = stablehlo.broadcast_in_dim %1511, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %1514 = stablehlo.divide %1512, %1513 : tensor<8x100x920xf32>
-    %1515 = stablehlo.convert %1514 : (tensor<8x100x920xf32>) -> tensor<8x100x920xbf16>
-    %1516 = stablehlo.broadcast_in_dim %1493, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1517 = stablehlo.dot_general %1515, %1516, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x100x32xbf16>
-    %1518 = stablehlo.transpose %1517, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %1519 = stablehlo.reshape %1518 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %1520 = stablehlo.convert %1519 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1521 = stablehlo.dot_general %1520, %arg313, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1522 = stablehlo.broadcast_in_dim %1521, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1523 = stablehlo.broadcast_in_dim %513, dims = [] : (tensor<f32>) -> tensor<100x256xf32>
-    %1524 = stablehlo.multiply %1522, %1523 : tensor<100x256xf32>
-    %1525 = stablehlo.broadcast_in_dim %1524, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1526 = stablehlo.broadcast_in_dim %arg314, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1527 = stablehlo.add %1525, %1526 : tensor<100x256xf32>
-    %1528 = stablehlo.convert %1527 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1529 = stablehlo.reshape %1528 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1530 = stablehlo.add %arg315, %1529 : tensor<100x1x256xbf16>
-    %1531 = stablehlo.convert %1530 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %1532 = stablehlo.convert %1531 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %1533 = stablehlo.reduce(%1532 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1534 = stablehlo.reshape %1533 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1535 = stablehlo.broadcast_in_dim %1534, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1536 = stablehlo.broadcast_in_dim %528, dims = [] : (tensor<f64>) -> tensor<100x1x1xf64>
-    %1537 = stablehlo.divide %1535, %1536 : tensor<100x1x1xf64>
-    %1538 = stablehlo.broadcast_in_dim %1532, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %1539 = stablehlo.broadcast_in_dim %1537, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %1540 = stablehlo.subtract %1538, %1539 : tensor<100x1x256xf64>
-    %1541 = stablehlo.multiply %1540, %1540 : tensor<100x1x256xf64>
-    %1542 = stablehlo.reduce(%1541 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1543 = stablehlo.reshape %1542 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1544 = stablehlo.broadcast_in_dim %1543, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1545 = stablehlo.divide %1544, %1536 : tensor<100x1x1xf64>
-    %1546 = stablehlo.convert %1545 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %1547 = stablehlo.reduce(%1531 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %1548 = stablehlo.reshape %1547 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %1549 = stablehlo.broadcast_in_dim %1548, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1550 = stablehlo.broadcast_in_dim %544, dims = [] : (tensor<f32>) -> tensor<100x1x1xf32>
-    %1551 = stablehlo.divide %1549, %1550 : tensor<100x1x1xf32>
-    %1552 = stablehlo.broadcast_in_dim %1546, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1553 = stablehlo.broadcast_in_dim %549, dims = [] : (tensor<f32>) -> tensor<100x1x1xf32>
-    %1554 = stablehlo.add %1552, %1553 : tensor<100x1x1xf32>
-    %1555 = stablehlo.rsqrt %1554 : tensor<100x1x1xf32>
-    %1556 = stablehlo.broadcast_in_dim %1531, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1557 = stablehlo.broadcast_in_dim %1551, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1558 = stablehlo.subtract %1556, %1557 : tensor<100x1x256xf32>
-    %1559 = stablehlo.broadcast_in_dim %1558, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1560 = stablehlo.broadcast_in_dim %1555, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1561 = stablehlo.multiply %1559, %1560 : tensor<100x1x256xf32>
-    %1562 = stablehlo.convert %arg80 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1563 = stablehlo.broadcast_in_dim %1561, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1564 = stablehlo.broadcast_in_dim %1562, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1565 = stablehlo.multiply %1563, %1564 : tensor<100x1x256xf32>
-    %1566 = stablehlo.convert %arg81 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1567 = stablehlo.broadcast_in_dim %1565, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1568 = stablehlo.broadcast_in_dim %1566, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1569 = stablehlo.add %1567, %1568 : tensor<100x1x256xf32>
-    %1570 = stablehlo.convert %1569 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %1571 = stablehlo.reshape %1570 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %1572 = stablehlo.convert %1571 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1573 = stablehlo.dot_general %1572, %arg316, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x2048xf32>) -> tensor<100x2048xf32>
-    %1574 = stablehlo.broadcast_in_dim %1573, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %1575 = stablehlo.broadcast_in_dim %513, dims = [] : (tensor<f32>) -> tensor<100x2048xf32>
-    %1576 = stablehlo.multiply %1574, %1575 : tensor<100x2048xf32>
-    %1577 = stablehlo.broadcast_in_dim %1576, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %1578 = stablehlo.broadcast_in_dim %arg317, dims = [1] : (tensor<2048xf32>) -> tensor<100x2048xf32>
-    %1579 = stablehlo.add %1577, %1578 : tensor<100x2048xf32>
-    %1580 = stablehlo.convert %1579 : (tensor<100x2048xf32>) -> tensor<100x2048xbf16>
-    %1581 = stablehlo.reshape %1580 : (tensor<100x2048xbf16>) -> tensor<100x1x2048xbf16>
-    %1582 = stablehlo.maximum %1581, %cst_16 : tensor<100x1x2048xbf16>
-    %1583 = stablehlo.reshape %1582 : (tensor<100x1x2048xbf16>) -> tensor<100x2048xbf16>
-    %1584 = stablehlo.convert %1583 : (tensor<100x2048xbf16>) -> tensor<100x2048xf32>
-    %1585 = stablehlo.dot_general %1584, %arg318, contracting_dims = [1] x [0] : (tensor<100x2048xf32>, tensor<2048x256xf32>) -> tensor<100x256xf32>
-    %1586 = stablehlo.broadcast_in_dim %1585, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1587 = stablehlo.multiply %1586, %1523 : tensor<100x256xf32>
-    %1588 = stablehlo.broadcast_in_dim %1587, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1589 = stablehlo.broadcast_in_dim %arg319, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1590 = stablehlo.add %1588, %1589 : tensor<100x256xf32>
-    %1591 = stablehlo.convert %1590 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1592 = stablehlo.reshape %1591 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1593 = stablehlo.add %1570, %1592 : tensor<100x1x256xbf16>
-    %1594 = stablehlo.convert %1593 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %1595 = stablehlo.convert %1594 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %1596 = stablehlo.reduce(%1595 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1597 = stablehlo.reshape %1596 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1598 = stablehlo.broadcast_in_dim %1597, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1599 = stablehlo.divide %1598, %1536 : tensor<100x1x1xf64>
-    %1600 = stablehlo.broadcast_in_dim %1595, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %1601 = stablehlo.broadcast_in_dim %1599, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %1602 = stablehlo.subtract %1600, %1601 : tensor<100x1x256xf64>
-    %1603 = stablehlo.multiply %1602, %1602 : tensor<100x1x256xf64>
-    %1604 = stablehlo.reduce(%1603 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1605 = stablehlo.reshape %1604 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1606 = stablehlo.broadcast_in_dim %1605, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1607 = stablehlo.divide %1606, %1536 : tensor<100x1x1xf64>
-    %1608 = stablehlo.convert %1607 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %1609 = stablehlo.reduce(%1594 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %1610 = stablehlo.reshape %1609 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %1611 = stablehlo.broadcast_in_dim %1610, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1612 = stablehlo.divide %1611, %1550 : tensor<100x1x1xf32>
-    %1613 = stablehlo.broadcast_in_dim %1608, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1614 = stablehlo.add %1613, %1553 : tensor<100x1x1xf32>
-    %1615 = stablehlo.rsqrt %1614 : tensor<100x1x1xf32>
-    %1616 = stablehlo.broadcast_in_dim %1594, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1617 = stablehlo.broadcast_in_dim %1612, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1618 = stablehlo.subtract %1616, %1617 : tensor<100x1x256xf32>
-    %1619 = stablehlo.broadcast_in_dim %1618, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1620 = stablehlo.broadcast_in_dim %1615, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1621 = stablehlo.multiply %1619, %1620 : tensor<100x1x256xf32>
-    %1622 = stablehlo.convert %arg82 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1623 = stablehlo.broadcast_in_dim %1621, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1624 = stablehlo.broadcast_in_dim %1622, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1625 = stablehlo.multiply %1623, %1624 : tensor<100x1x256xf32>
-    %1626 = stablehlo.convert %arg83 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1627 = stablehlo.broadcast_in_dim %1625, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1628 = stablehlo.broadcast_in_dim %1626, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1629 = stablehlo.add %1627, %1628 : tensor<100x1x256xf32>
-    %1630 = stablehlo.convert %1629 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %1631 = stablehlo.convert %1630 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %1632 = stablehlo.convert %1631 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %1633 = stablehlo.reduce(%1632 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1634 = stablehlo.reshape %1633 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1635 = stablehlo.broadcast_in_dim %1634, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1636 = stablehlo.divide %1635, %1536 : tensor<100x1x1xf64>
-    %1637 = stablehlo.broadcast_in_dim %1632, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %1638 = stablehlo.broadcast_in_dim %1636, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %1639 = stablehlo.subtract %1637, %1638 : tensor<100x1x256xf64>
-    %1640 = stablehlo.multiply %1639, %1639 : tensor<100x1x256xf64>
-    %1641 = stablehlo.reduce(%1640 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1642 = stablehlo.reshape %1641 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1643 = stablehlo.broadcast_in_dim %1642, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1644 = stablehlo.divide %1643, %1536 : tensor<100x1x1xf64>
-    %1645 = stablehlo.convert %1644 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %1646 = stablehlo.reduce(%1631 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %1647 = stablehlo.reshape %1646 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %1648 = stablehlo.broadcast_in_dim %1647, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1649 = stablehlo.divide %1648, %1550 : tensor<100x1x1xf32>
-    %1650 = stablehlo.broadcast_in_dim %1645, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1651 = stablehlo.add %1650, %1553 : tensor<100x1x1xf32>
-    %1652 = stablehlo.rsqrt %1651 : tensor<100x1x1xf32>
-    %1653 = stablehlo.broadcast_in_dim %1631, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1654 = stablehlo.broadcast_in_dim %1649, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1655 = stablehlo.subtract %1653, %1654 : tensor<100x1x256xf32>
-    %1656 = stablehlo.broadcast_in_dim %1655, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1657 = stablehlo.broadcast_in_dim %1652, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1658 = stablehlo.multiply %1656, %1657 : tensor<100x1x256xf32>
-    %1659 = stablehlo.convert %arg84 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1660 = stablehlo.broadcast_in_dim %1658, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1661 = stablehlo.broadcast_in_dim %1659, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1662 = stablehlo.multiply %1660, %1661 : tensor<100x1x256xf32>
-    %1663 = stablehlo.convert %arg85 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1664 = stablehlo.broadcast_in_dim %1662, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1665 = stablehlo.broadcast_in_dim %1663, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1666 = stablehlo.add %1664, %1665 : tensor<100x1x256xf32>
-    %1667 = stablehlo.convert %1666 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %1668 = stablehlo.add %1630, %arg320 : tensor<100x1x256xbf16>
-    %1669 = stablehlo.reshape %1668 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %1670 = stablehlo.convert %1669 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1671 = stablehlo.dot_general %1670, %arg321, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1672 = stablehlo.broadcast_in_dim %1671, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1673 = stablehlo.multiply %1672, %1523 : tensor<100x256xf32>
-    %1674 = stablehlo.broadcast_in_dim %1673, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1675 = stablehlo.broadcast_in_dim %arg322, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1676 = stablehlo.add %1674, %1675 : tensor<100x256xf32>
-    %1677 = stablehlo.convert %1676 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1678 = stablehlo.reshape %1677 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1679 = stablehlo.dot_general %1670, %arg323, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1680 = stablehlo.broadcast_in_dim %1679, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1681 = stablehlo.multiply %1680, %1523 : tensor<100x256xf32>
-    %1682 = stablehlo.broadcast_in_dim %1681, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1683 = stablehlo.broadcast_in_dim %arg324, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1684 = stablehlo.add %1682, %1683 : tensor<100x256xf32>
-    %1685 = stablehlo.convert %1684 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1686 = stablehlo.reshape %1685 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1687 = stablehlo.reshape %1630 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %1688 = stablehlo.convert %1687 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1689 = stablehlo.dot_general %1688, %arg325, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1690 = stablehlo.broadcast_in_dim %1689, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1691 = stablehlo.multiply %1690, %1523 : tensor<100x256xf32>
-    %1692 = stablehlo.broadcast_in_dim %1691, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1693 = stablehlo.broadcast_in_dim %arg326, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1694 = stablehlo.add %1692, %1693 : tensor<100x256xf32>
-    %1695 = stablehlo.convert %1694 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1696 = stablehlo.reshape %1695 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1697 = stablehlo.reshape %1678 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %1698 = stablehlo.transpose %1697, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %1699 = stablehlo.reshape %1686 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %1700 = stablehlo.transpose %1699, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %1701 = stablehlo.reshape %1696 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %1702 = stablehlo.transpose %1701, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %1703 = stablehlo.broadcast_in_dim %1698, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %1704 = stablehlo.broadcast_in_dim %472, dims = [] : (tensor<bf16>) -> tensor<8x100x32xbf16>
-    %1705 = stablehlo.multiply %1703, %1704 : tensor<8x100x32xbf16>
-    %1706 = stablehlo.transpose %1700, dims = [0, 2, 1] : (tensor<8x100x32xbf16>) -> tensor<8x32x100xbf16>
-    %1707 = stablehlo.broadcast_in_dim %1706, dims = [0, 1, 2] : (tensor<8x32x100xbf16>) -> tensor<8x32x100xbf16>
-    %1708 = stablehlo.dot_general %1705, %1707, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x100xbf16>) -> tensor<8x100x100xbf16>
-    %1709 = stablehlo.convert %1708 : (tensor<8x100x100xbf16>) -> tensor<8x100x100xf32>
-    %1710 = stablehlo.reduce(%1709 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %1711 = stablehlo.reshape %1710 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %1712 = stablehlo.broadcast_in_dim %1709, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %1713 = stablehlo.broadcast_in_dim %1711, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %1714 = stablehlo.subtract %1712, %1713 : tensor<8x100x100xf32>
-    %1715 = stablehlo.exponential %1714 : tensor<8x100x100xf32>
-    %1716 = stablehlo.reduce(%1715 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %1717 = stablehlo.reshape %1716 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %1718 = stablehlo.broadcast_in_dim %1715, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %1719 = stablehlo.broadcast_in_dim %1717, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %1720 = stablehlo.divide %1718, %1719 : tensor<8x100x100xf32>
-    %1721 = stablehlo.convert %1720 : (tensor<8x100x100xf32>) -> tensor<8x100x100xbf16>
-    %1722 = stablehlo.broadcast_in_dim %1702, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %1723 = stablehlo.dot_general %1721, %1722, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x100xbf16>, tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %1724 = stablehlo.transpose %1723, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %1725 = stablehlo.reshape %1724 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %1726 = stablehlo.convert %1725 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1727 = stablehlo.dot_general %1726, %arg327, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1728 = stablehlo.broadcast_in_dim %1727, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1729 = stablehlo.multiply %1728, %1523 : tensor<100x256xf32>
-    %1730 = stablehlo.broadcast_in_dim %1729, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1731 = stablehlo.broadcast_in_dim %arg328, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1732 = stablehlo.add %1730, %1731 : tensor<100x256xf32>
-    %1733 = stablehlo.convert %1732 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1734 = stablehlo.reshape %1733 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1735 = stablehlo.add %1630, %1734 : tensor<100x1x256xbf16>
-    %1736 = stablehlo.convert %1735 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %1737 = stablehlo.convert %1736 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %1738 = stablehlo.reduce(%1737 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1739 = stablehlo.reshape %1738 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1740 = stablehlo.broadcast_in_dim %1739, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1741 = stablehlo.divide %1740, %1536 : tensor<100x1x1xf64>
-    %1742 = stablehlo.broadcast_in_dim %1737, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %1743 = stablehlo.broadcast_in_dim %1741, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %1744 = stablehlo.subtract %1742, %1743 : tensor<100x1x256xf64>
-    %1745 = stablehlo.multiply %1744, %1744 : tensor<100x1x256xf64>
-    %1746 = stablehlo.reduce(%1745 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1747 = stablehlo.reshape %1746 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1748 = stablehlo.broadcast_in_dim %1747, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1749 = stablehlo.divide %1748, %1536 : tensor<100x1x1xf64>
-    %1750 = stablehlo.convert %1749 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %1751 = stablehlo.reduce(%1736 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %1752 = stablehlo.reshape %1751 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %1753 = stablehlo.broadcast_in_dim %1752, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1754 = stablehlo.divide %1753, %1550 : tensor<100x1x1xf32>
-    %1755 = stablehlo.broadcast_in_dim %1750, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1756 = stablehlo.add %1755, %1553 : tensor<100x1x1xf32>
-    %1757 = stablehlo.rsqrt %1756 : tensor<100x1x1xf32>
-    %1758 = stablehlo.broadcast_in_dim %1736, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1759 = stablehlo.broadcast_in_dim %1754, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1760 = stablehlo.subtract %1758, %1759 : tensor<100x1x256xf32>
-    %1761 = stablehlo.broadcast_in_dim %1760, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1762 = stablehlo.broadcast_in_dim %1757, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1763 = stablehlo.multiply %1761, %1762 : tensor<100x1x256xf32>
-    %1764 = stablehlo.convert %arg86 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1765 = stablehlo.broadcast_in_dim %1763, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1766 = stablehlo.broadcast_in_dim %1764, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1767 = stablehlo.multiply %1765, %1766 : tensor<100x1x256xf32>
-    %1768 = stablehlo.convert %arg87 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1769 = stablehlo.broadcast_in_dim %1767, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1770 = stablehlo.broadcast_in_dim %1768, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1771 = stablehlo.add %1769, %1770 : tensor<100x1x256xf32>
-    %1772 = stablehlo.convert %1771 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %1773 = stablehlo.add %1772, %arg320 : tensor<100x1x256xbf16>
-    %1774 = stablehlo.reshape %1773 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %1775 = stablehlo.convert %1774 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1776 = stablehlo.dot_general %1775, %arg329, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1777 = stablehlo.broadcast_in_dim %1776, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1778 = stablehlo.multiply %1777, %1523 : tensor<100x256xf32>
-    %1779 = stablehlo.broadcast_in_dim %1778, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1780 = stablehlo.broadcast_in_dim %arg330, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1781 = stablehlo.add %1779, %1780 : tensor<100x256xf32>
-    %1782 = stablehlo.convert %1781 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1783 = stablehlo.reshape %1782 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1784 = stablehlo.dot_general %1471, %arg331, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1785 = stablehlo.broadcast_in_dim %1784, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1786 = stablehlo.multiply %1785, %515 : tensor<920x256xf32>
-    %1787 = stablehlo.broadcast_in_dim %1786, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1788 = stablehlo.broadcast_in_dim %arg332, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1789 = stablehlo.add %1787, %1788 : tensor<920x256xf32>
-    %1790 = stablehlo.convert %1789 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1791 = stablehlo.reshape %1790 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1792 = stablehlo.dot_general %1481, %arg333, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %1793 = stablehlo.broadcast_in_dim %1792, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1794 = stablehlo.multiply %1793, %515 : tensor<920x256xf32>
-    %1795 = stablehlo.broadcast_in_dim %1794, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %1796 = stablehlo.broadcast_in_dim %arg334, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %1797 = stablehlo.add %1795, %1796 : tensor<920x256xf32>
-    %1798 = stablehlo.convert %1797 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %1799 = stablehlo.reshape %1798 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %1800 = stablehlo.reshape %1783 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %1801 = stablehlo.transpose %1800, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %1802 = stablehlo.reshape %1791 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1803 = stablehlo.transpose %1802, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1804 = stablehlo.reshape %1799 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %1805 = stablehlo.transpose %1804, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %1806 = stablehlo.broadcast_in_dim %1801, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %1807 = stablehlo.multiply %1806, %1704 : tensor<8x100x32xbf16>
-    %1808 = stablehlo.transpose %1803, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %1809 = stablehlo.broadcast_in_dim %1808, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %1810 = stablehlo.dot_general %1807, %1809, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16>
-    %1811 = stablehlo.broadcast_in_dim %1810, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %1812 = stablehlo.multiply %1811, %1498 : tensor<8x100x920xbf16>
-    %1813 = stablehlo.broadcast_in_dim %1812, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %1814 = stablehlo.broadcast_in_dim %arg335, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x100x920xbf16>
-    %1815 = stablehlo.add %1813, %1814 : tensor<8x100x920xbf16>
-    %1816 = stablehlo.convert %1815 : (tensor<8x100x920xbf16>) -> tensor<8x100x920xf32>
-    %1817 = stablehlo.reduce(%1816 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %1818 = stablehlo.reshape %1817 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %1819 = stablehlo.broadcast_in_dim %1816, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %1820 = stablehlo.broadcast_in_dim %1818, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %1821 = stablehlo.subtract %1819, %1820 : tensor<8x100x920xf32>
-    %1822 = stablehlo.exponential %1821 : tensor<8x100x920xf32>
-    %1823 = stablehlo.reduce(%1822 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %1824 = stablehlo.reshape %1823 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %1825 = stablehlo.broadcast_in_dim %1822, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %1826 = stablehlo.broadcast_in_dim %1824, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %1827 = stablehlo.divide %1825, %1826 : tensor<8x100x920xf32>
-    %1828 = stablehlo.convert %1827 : (tensor<8x100x920xf32>) -> tensor<8x100x920xbf16>
-    %1829 = stablehlo.broadcast_in_dim %1805, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %1830 = stablehlo.dot_general %1828, %1829, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x100x32xbf16>
-    %1831 = stablehlo.transpose %1830, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %1832 = stablehlo.reshape %1831 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %1833 = stablehlo.convert %1832 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1834 = stablehlo.dot_general %1833, %arg336, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1835 = stablehlo.broadcast_in_dim %1834, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1836 = stablehlo.multiply %1835, %1523 : tensor<100x256xf32>
-    %1837 = stablehlo.broadcast_in_dim %1836, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1838 = stablehlo.broadcast_in_dim %arg337, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1839 = stablehlo.add %1837, %1838 : tensor<100x256xf32>
-    %1840 = stablehlo.convert %1839 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1841 = stablehlo.reshape %1840 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1842 = stablehlo.add %1772, %1841 : tensor<100x1x256xbf16>
-    %1843 = stablehlo.convert %1842 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %1844 = stablehlo.convert %1843 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %1845 = stablehlo.reduce(%1844 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1846 = stablehlo.reshape %1845 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1847 = stablehlo.broadcast_in_dim %1846, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1848 = stablehlo.divide %1847, %1536 : tensor<100x1x1xf64>
-    %1849 = stablehlo.broadcast_in_dim %1844, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %1850 = stablehlo.broadcast_in_dim %1848, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %1851 = stablehlo.subtract %1849, %1850 : tensor<100x1x256xf64>
-    %1852 = stablehlo.multiply %1851, %1851 : tensor<100x1x256xf64>
-    %1853 = stablehlo.reduce(%1852 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1854 = stablehlo.reshape %1853 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1855 = stablehlo.broadcast_in_dim %1854, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1856 = stablehlo.divide %1855, %1536 : tensor<100x1x1xf64>
-    %1857 = stablehlo.convert %1856 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %1858 = stablehlo.reduce(%1843 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %1859 = stablehlo.reshape %1858 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %1860 = stablehlo.broadcast_in_dim %1859, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1861 = stablehlo.divide %1860, %1550 : tensor<100x1x1xf32>
-    %1862 = stablehlo.broadcast_in_dim %1857, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1863 = stablehlo.add %1862, %1553 : tensor<100x1x1xf32>
-    %1864 = stablehlo.rsqrt %1863 : tensor<100x1x1xf32>
-    %1865 = stablehlo.broadcast_in_dim %1843, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1866 = stablehlo.broadcast_in_dim %1861, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1867 = stablehlo.subtract %1865, %1866 : tensor<100x1x256xf32>
-    %1868 = stablehlo.broadcast_in_dim %1867, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1869 = stablehlo.broadcast_in_dim %1864, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1870 = stablehlo.multiply %1868, %1869 : tensor<100x1x256xf32>
-    %1871 = stablehlo.convert %arg88 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1872 = stablehlo.broadcast_in_dim %1870, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1873 = stablehlo.broadcast_in_dim %1871, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1874 = stablehlo.multiply %1872, %1873 : tensor<100x1x256xf32>
-    %1875 = stablehlo.convert %arg89 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1876 = stablehlo.broadcast_in_dim %1874, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1877 = stablehlo.broadcast_in_dim %1875, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1878 = stablehlo.add %1876, %1877 : tensor<100x1x256xf32>
-    %1879 = stablehlo.convert %1878 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %1880 = stablehlo.reshape %1879 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %1881 = stablehlo.convert %1880 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1882 = stablehlo.dot_general %1881, %arg338, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x2048xf32>) -> tensor<100x2048xf32>
-    %1883 = stablehlo.broadcast_in_dim %1882, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %1884 = stablehlo.multiply %1883, %1575 : tensor<100x2048xf32>
-    %1885 = stablehlo.broadcast_in_dim %1884, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %1886 = stablehlo.broadcast_in_dim %arg339, dims = [1] : (tensor<2048xf32>) -> tensor<100x2048xf32>
-    %1887 = stablehlo.add %1885, %1886 : tensor<100x2048xf32>
-    %1888 = stablehlo.convert %1887 : (tensor<100x2048xf32>) -> tensor<100x2048xbf16>
-    %1889 = stablehlo.reshape %1888 : (tensor<100x2048xbf16>) -> tensor<100x1x2048xbf16>
-    %1890 = stablehlo.maximum %1889, %cst_16 : tensor<100x1x2048xbf16>
-    %1891 = stablehlo.reshape %1890 : (tensor<100x1x2048xbf16>) -> tensor<100x2048xbf16>
-    %1892 = stablehlo.convert %1891 : (tensor<100x2048xbf16>) -> tensor<100x2048xf32>
-    %1893 = stablehlo.dot_general %1892, %arg340, contracting_dims = [1] x [0] : (tensor<100x2048xf32>, tensor<2048x256xf32>) -> tensor<100x256xf32>
-    %1894 = stablehlo.broadcast_in_dim %1893, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1895 = stablehlo.multiply %1894, %1523 : tensor<100x256xf32>
-    %1896 = stablehlo.broadcast_in_dim %1895, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1897 = stablehlo.broadcast_in_dim %arg341, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1898 = stablehlo.add %1896, %1897 : tensor<100x256xf32>
-    %1899 = stablehlo.convert %1898 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1900 = stablehlo.reshape %1899 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1901 = stablehlo.add %1879, %1900 : tensor<100x1x256xbf16>
-    %1902 = stablehlo.convert %1901 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %1903 = stablehlo.convert %1902 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %1904 = stablehlo.reduce(%1903 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1905 = stablehlo.reshape %1904 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1906 = stablehlo.broadcast_in_dim %1905, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1907 = stablehlo.divide %1906, %1536 : tensor<100x1x1xf64>
-    %1908 = stablehlo.broadcast_in_dim %1903, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %1909 = stablehlo.broadcast_in_dim %1907, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %1910 = stablehlo.subtract %1908, %1909 : tensor<100x1x256xf64>
-    %1911 = stablehlo.multiply %1910, %1910 : tensor<100x1x256xf64>
-    %1912 = stablehlo.reduce(%1911 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1913 = stablehlo.reshape %1912 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1914 = stablehlo.broadcast_in_dim %1913, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1915 = stablehlo.divide %1914, %1536 : tensor<100x1x1xf64>
-    %1916 = stablehlo.convert %1915 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %1917 = stablehlo.reduce(%1902 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %1918 = stablehlo.reshape %1917 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %1919 = stablehlo.broadcast_in_dim %1918, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1920 = stablehlo.divide %1919, %1550 : tensor<100x1x1xf32>
-    %1921 = stablehlo.broadcast_in_dim %1916, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1922 = stablehlo.add %1921, %1553 : tensor<100x1x1xf32>
-    %1923 = stablehlo.rsqrt %1922 : tensor<100x1x1xf32>
-    %1924 = stablehlo.broadcast_in_dim %1902, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1925 = stablehlo.broadcast_in_dim %1920, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1926 = stablehlo.subtract %1924, %1925 : tensor<100x1x256xf32>
-    %1927 = stablehlo.broadcast_in_dim %1926, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1928 = stablehlo.broadcast_in_dim %1923, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1929 = stablehlo.multiply %1927, %1928 : tensor<100x1x256xf32>
-    %1930 = stablehlo.convert %arg90 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1931 = stablehlo.broadcast_in_dim %1929, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1932 = stablehlo.broadcast_in_dim %1930, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1933 = stablehlo.multiply %1931, %1932 : tensor<100x1x256xf32>
-    %1934 = stablehlo.convert %arg91 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1935 = stablehlo.broadcast_in_dim %1933, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1936 = stablehlo.broadcast_in_dim %1934, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %1937 = stablehlo.add %1935, %1936 : tensor<100x1x256xf32>
-    %1938 = stablehlo.convert %1937 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %1939 = stablehlo.convert %1938 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %1940 = stablehlo.convert %1939 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %1941 = stablehlo.reduce(%1940 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1942 = stablehlo.reshape %1941 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1943 = stablehlo.broadcast_in_dim %1942, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1944 = stablehlo.divide %1943, %1536 : tensor<100x1x1xf64>
-    %1945 = stablehlo.broadcast_in_dim %1940, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %1946 = stablehlo.broadcast_in_dim %1944, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %1947 = stablehlo.subtract %1945, %1946 : tensor<100x1x256xf64>
-    %1948 = stablehlo.multiply %1947, %1947 : tensor<100x1x256xf64>
-    %1949 = stablehlo.reduce(%1948 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %1950 = stablehlo.reshape %1949 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %1951 = stablehlo.broadcast_in_dim %1950, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %1952 = stablehlo.divide %1951, %1536 : tensor<100x1x1xf64>
-    %1953 = stablehlo.convert %1952 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %1954 = stablehlo.reduce(%1939 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %1955 = stablehlo.reshape %1954 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %1956 = stablehlo.broadcast_in_dim %1955, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1957 = stablehlo.divide %1956, %1550 : tensor<100x1x1xf32>
-    %1958 = stablehlo.broadcast_in_dim %1953, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %1959 = stablehlo.add %1958, %1553 : tensor<100x1x1xf32>
-    %1960 = stablehlo.rsqrt %1959 : tensor<100x1x1xf32>
-    %1961 = stablehlo.broadcast_in_dim %1939, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1962 = stablehlo.broadcast_in_dim %1957, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1963 = stablehlo.subtract %1961, %1962 : tensor<100x1x256xf32>
-    %1964 = stablehlo.broadcast_in_dim %1963, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1965 = stablehlo.broadcast_in_dim %1960, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %1966 = stablehlo.multiply %1964, %1965 : tensor<100x1x256xf32>
-    %1967 = stablehlo.broadcast_in_dim %1966, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1968 = stablehlo.multiply %1967, %1661 : tensor<100x1x256xf32>
-    %1969 = stablehlo.broadcast_in_dim %1968, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %1970 = stablehlo.add %1969, %1665 : tensor<100x1x256xf32>
-    %1971 = stablehlo.convert %1970 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %1972 = stablehlo.add %1938, %arg320 : tensor<100x1x256xbf16>
-    %1973 = stablehlo.reshape %1972 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %1974 = stablehlo.convert %1973 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1975 = stablehlo.dot_general %1974, %arg342, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1976 = stablehlo.broadcast_in_dim %1975, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1977 = stablehlo.multiply %1976, %1523 : tensor<100x256xf32>
-    %1978 = stablehlo.broadcast_in_dim %1977, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1979 = stablehlo.broadcast_in_dim %arg343, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1980 = stablehlo.add %1978, %1979 : tensor<100x256xf32>
-    %1981 = stablehlo.convert %1980 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1982 = stablehlo.reshape %1981 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1983 = stablehlo.dot_general %1974, %arg344, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1984 = stablehlo.broadcast_in_dim %1983, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1985 = stablehlo.multiply %1984, %1523 : tensor<100x256xf32>
-    %1986 = stablehlo.broadcast_in_dim %1985, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1987 = stablehlo.broadcast_in_dim %arg345, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1988 = stablehlo.add %1986, %1987 : tensor<100x256xf32>
-    %1989 = stablehlo.convert %1988 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %1990 = stablehlo.reshape %1989 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %1991 = stablehlo.reshape %1938 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %1992 = stablehlo.convert %1991 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %1993 = stablehlo.dot_general %1992, %arg346, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %1994 = stablehlo.broadcast_in_dim %1993, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1995 = stablehlo.multiply %1994, %1523 : tensor<100x256xf32>
-    %1996 = stablehlo.broadcast_in_dim %1995, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %1997 = stablehlo.broadcast_in_dim %arg347, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %1998 = stablehlo.add %1996, %1997 : tensor<100x256xf32>
-    %1999 = stablehlo.convert %1998 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2000 = stablehlo.reshape %1999 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2001 = stablehlo.reshape %1982 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2002 = stablehlo.transpose %2001, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2003 = stablehlo.reshape %1990 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2004 = stablehlo.transpose %2003, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2005 = stablehlo.reshape %2000 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2006 = stablehlo.transpose %2005, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2007 = stablehlo.broadcast_in_dim %2002, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2008 = stablehlo.multiply %2007, %1704 : tensor<8x100x32xbf16>
-    %2009 = stablehlo.transpose %2004, dims = [0, 2, 1] : (tensor<8x100x32xbf16>) -> tensor<8x32x100xbf16>
-    %2010 = stablehlo.broadcast_in_dim %2009, dims = [0, 1, 2] : (tensor<8x32x100xbf16>) -> tensor<8x32x100xbf16>
-    %2011 = stablehlo.dot_general %2008, %2010, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x100xbf16>) -> tensor<8x100x100xbf16>
-    %2012 = stablehlo.convert %2011 : (tensor<8x100x100xbf16>) -> tensor<8x100x100xf32>
-    %2013 = stablehlo.reduce(%2012 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2014 = stablehlo.reshape %2013 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2015 = stablehlo.broadcast_in_dim %2012, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2016 = stablehlo.broadcast_in_dim %2014, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2017 = stablehlo.subtract %2015, %2016 : tensor<8x100x100xf32>
-    %2018 = stablehlo.exponential %2017 : tensor<8x100x100xf32>
-    %2019 = stablehlo.reduce(%2018 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2020 = stablehlo.reshape %2019 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2021 = stablehlo.broadcast_in_dim %2018, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2022 = stablehlo.broadcast_in_dim %2020, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2023 = stablehlo.divide %2021, %2022 : tensor<8x100x100xf32>
-    %2024 = stablehlo.convert %2023 : (tensor<8x100x100xf32>) -> tensor<8x100x100xbf16>
-    %2025 = stablehlo.broadcast_in_dim %2006, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2026 = stablehlo.dot_general %2024, %2025, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x100xbf16>, tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2027 = stablehlo.transpose %2026, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %2028 = stablehlo.reshape %2027 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %2029 = stablehlo.convert %2028 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2030 = stablehlo.dot_general %2029, %arg348, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2031 = stablehlo.broadcast_in_dim %2030, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2032 = stablehlo.multiply %2031, %1523 : tensor<100x256xf32>
-    %2033 = stablehlo.broadcast_in_dim %2032, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2034 = stablehlo.broadcast_in_dim %arg349, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2035 = stablehlo.add %2033, %2034 : tensor<100x256xf32>
-    %2036 = stablehlo.convert %2035 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2037 = stablehlo.reshape %2036 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2038 = stablehlo.add %1938, %2037 : tensor<100x1x256xbf16>
-    %2039 = stablehlo.convert %2038 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2040 = stablehlo.convert %2039 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2041 = stablehlo.reduce(%2040 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2042 = stablehlo.reshape %2041 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2043 = stablehlo.broadcast_in_dim %2042, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2044 = stablehlo.divide %2043, %1536 : tensor<100x1x1xf64>
-    %2045 = stablehlo.broadcast_in_dim %2040, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2046 = stablehlo.broadcast_in_dim %2044, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2047 = stablehlo.subtract %2045, %2046 : tensor<100x1x256xf64>
-    %2048 = stablehlo.multiply %2047, %2047 : tensor<100x1x256xf64>
-    %2049 = stablehlo.reduce(%2048 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2050 = stablehlo.reshape %2049 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2051 = stablehlo.broadcast_in_dim %2050, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2052 = stablehlo.divide %2051, %1536 : tensor<100x1x1xf64>
-    %2053 = stablehlo.convert %2052 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2054 = stablehlo.reduce(%2039 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2055 = stablehlo.reshape %2054 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2056 = stablehlo.broadcast_in_dim %2055, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2057 = stablehlo.divide %2056, %1550 : tensor<100x1x1xf32>
-    %2058 = stablehlo.broadcast_in_dim %2053, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2059 = stablehlo.add %2058, %1553 : tensor<100x1x1xf32>
-    %2060 = stablehlo.rsqrt %2059 : tensor<100x1x1xf32>
-    %2061 = stablehlo.broadcast_in_dim %2039, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2062 = stablehlo.broadcast_in_dim %2057, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2063 = stablehlo.subtract %2061, %2062 : tensor<100x1x256xf32>
-    %2064 = stablehlo.broadcast_in_dim %2063, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2065 = stablehlo.broadcast_in_dim %2060, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2066 = stablehlo.multiply %2064, %2065 : tensor<100x1x256xf32>
-    %2067 = stablehlo.convert %arg92 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2068 = stablehlo.broadcast_in_dim %2066, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2069 = stablehlo.broadcast_in_dim %2067, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2070 = stablehlo.multiply %2068, %2069 : tensor<100x1x256xf32>
-    %2071 = stablehlo.convert %arg93 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2072 = stablehlo.broadcast_in_dim %2070, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2073 = stablehlo.broadcast_in_dim %2071, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2074 = stablehlo.add %2072, %2073 : tensor<100x1x256xf32>
-    %2075 = stablehlo.convert %2074 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2076 = stablehlo.add %2075, %arg320 : tensor<100x1x256xbf16>
-    %2077 = stablehlo.reshape %2076 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2078 = stablehlo.convert %2077 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2079 = stablehlo.dot_general %2078, %arg350, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2080 = stablehlo.broadcast_in_dim %2079, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2081 = stablehlo.multiply %2080, %1523 : tensor<100x256xf32>
-    %2082 = stablehlo.broadcast_in_dim %2081, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2083 = stablehlo.broadcast_in_dim %arg351, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2084 = stablehlo.add %2082, %2083 : tensor<100x256xf32>
-    %2085 = stablehlo.convert %2084 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2086 = stablehlo.reshape %2085 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2087 = stablehlo.dot_general %1471, %arg352, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %2088 = stablehlo.broadcast_in_dim %2087, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2089 = stablehlo.multiply %2088, %515 : tensor<920x256xf32>
-    %2090 = stablehlo.broadcast_in_dim %2089, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2091 = stablehlo.broadcast_in_dim %arg353, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %2092 = stablehlo.add %2090, %2091 : tensor<920x256xf32>
-    %2093 = stablehlo.convert %2092 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %2094 = stablehlo.reshape %2093 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %2095 = stablehlo.dot_general %1481, %arg354, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %2096 = stablehlo.broadcast_in_dim %2095, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2097 = stablehlo.multiply %2096, %515 : tensor<920x256xf32>
-    %2098 = stablehlo.broadcast_in_dim %2097, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2099 = stablehlo.broadcast_in_dim %arg355, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %2100 = stablehlo.add %2098, %2099 : tensor<920x256xf32>
-    %2101 = stablehlo.convert %2100 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %2102 = stablehlo.reshape %2101 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %2103 = stablehlo.reshape %2086 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2104 = stablehlo.transpose %2103, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2105 = stablehlo.reshape %2094 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %2106 = stablehlo.transpose %2105, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %2107 = stablehlo.reshape %2102 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %2108 = stablehlo.transpose %2107, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %2109 = stablehlo.broadcast_in_dim %2104, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2110 = stablehlo.multiply %2109, %1704 : tensor<8x100x32xbf16>
-    %2111 = stablehlo.transpose %2106, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %2112 = stablehlo.broadcast_in_dim %2111, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %2113 = stablehlo.dot_general %2110, %2112, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16>
-    %2114 = stablehlo.broadcast_in_dim %2113, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %2115 = stablehlo.multiply %2114, %1498 : tensor<8x100x920xbf16>
-    %2116 = stablehlo.broadcast_in_dim %2115, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %2117 = stablehlo.broadcast_in_dim %arg356, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x100x920xbf16>
-    %2118 = stablehlo.add %2116, %2117 : tensor<8x100x920xbf16>
-    %2119 = stablehlo.convert %2118 : (tensor<8x100x920xbf16>) -> tensor<8x100x920xf32>
-    %2120 = stablehlo.reduce(%2119 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2121 = stablehlo.reshape %2120 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2122 = stablehlo.broadcast_in_dim %2119, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %2123 = stablehlo.broadcast_in_dim %2121, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %2124 = stablehlo.subtract %2122, %2123 : tensor<8x100x920xf32>
-    %2125 = stablehlo.exponential %2124 : tensor<8x100x920xf32>
-    %2126 = stablehlo.reduce(%2125 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2127 = stablehlo.reshape %2126 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2128 = stablehlo.broadcast_in_dim %2125, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %2129 = stablehlo.broadcast_in_dim %2127, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %2130 = stablehlo.divide %2128, %2129 : tensor<8x100x920xf32>
-    %2131 = stablehlo.convert %2130 : (tensor<8x100x920xf32>) -> tensor<8x100x920xbf16>
-    %2132 = stablehlo.broadcast_in_dim %2108, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %2133 = stablehlo.dot_general %2131, %2132, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x100x32xbf16>
-    %2134 = stablehlo.transpose %2133, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %2135 = stablehlo.reshape %2134 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %2136 = stablehlo.convert %2135 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2137 = stablehlo.dot_general %2136, %arg357, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2138 = stablehlo.broadcast_in_dim %2137, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2139 = stablehlo.multiply %2138, %1523 : tensor<100x256xf32>
-    %2140 = stablehlo.broadcast_in_dim %2139, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2141 = stablehlo.broadcast_in_dim %arg358, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2142 = stablehlo.add %2140, %2141 : tensor<100x256xf32>
-    %2143 = stablehlo.convert %2142 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2144 = stablehlo.reshape %2143 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2145 = stablehlo.add %2075, %2144 : tensor<100x1x256xbf16>
-    %2146 = stablehlo.convert %2145 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2147 = stablehlo.convert %2146 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2148 = stablehlo.reduce(%2147 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2149 = stablehlo.reshape %2148 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2150 = stablehlo.broadcast_in_dim %2149, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2151 = stablehlo.divide %2150, %1536 : tensor<100x1x1xf64>
-    %2152 = stablehlo.broadcast_in_dim %2147, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2153 = stablehlo.broadcast_in_dim %2151, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2154 = stablehlo.subtract %2152, %2153 : tensor<100x1x256xf64>
-    %2155 = stablehlo.multiply %2154, %2154 : tensor<100x1x256xf64>
-    %2156 = stablehlo.reduce(%2155 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2157 = stablehlo.reshape %2156 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2158 = stablehlo.broadcast_in_dim %2157, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2159 = stablehlo.divide %2158, %1536 : tensor<100x1x1xf64>
-    %2160 = stablehlo.convert %2159 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2161 = stablehlo.reduce(%2146 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2162 = stablehlo.reshape %2161 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2163 = stablehlo.broadcast_in_dim %2162, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2164 = stablehlo.divide %2163, %1550 : tensor<100x1x1xf32>
-    %2165 = stablehlo.broadcast_in_dim %2160, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2166 = stablehlo.add %2165, %1553 : tensor<100x1x1xf32>
-    %2167 = stablehlo.rsqrt %2166 : tensor<100x1x1xf32>
-    %2168 = stablehlo.broadcast_in_dim %2146, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2169 = stablehlo.broadcast_in_dim %2164, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2170 = stablehlo.subtract %2168, %2169 : tensor<100x1x256xf32>
-    %2171 = stablehlo.broadcast_in_dim %2170, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2172 = stablehlo.broadcast_in_dim %2167, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2173 = stablehlo.multiply %2171, %2172 : tensor<100x1x256xf32>
-    %2174 = stablehlo.convert %arg94 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2175 = stablehlo.broadcast_in_dim %2173, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2176 = stablehlo.broadcast_in_dim %2174, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2177 = stablehlo.multiply %2175, %2176 : tensor<100x1x256xf32>
-    %2178 = stablehlo.convert %arg95 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2179 = stablehlo.broadcast_in_dim %2177, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2180 = stablehlo.broadcast_in_dim %2178, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2181 = stablehlo.add %2179, %2180 : tensor<100x1x256xf32>
-    %2182 = stablehlo.convert %2181 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2183 = stablehlo.reshape %2182 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2184 = stablehlo.convert %2183 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2185 = stablehlo.dot_general %2184, %arg359, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x2048xf32>) -> tensor<100x2048xf32>
-    %2186 = stablehlo.broadcast_in_dim %2185, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %2187 = stablehlo.multiply %2186, %1575 : tensor<100x2048xf32>
-    %2188 = stablehlo.broadcast_in_dim %2187, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %2189 = stablehlo.broadcast_in_dim %arg360, dims = [1] : (tensor<2048xf32>) -> tensor<100x2048xf32>
-    %2190 = stablehlo.add %2188, %2189 : tensor<100x2048xf32>
-    %2191 = stablehlo.convert %2190 : (tensor<100x2048xf32>) -> tensor<100x2048xbf16>
-    %2192 = stablehlo.reshape %2191 : (tensor<100x2048xbf16>) -> tensor<100x1x2048xbf16>
-    %2193 = stablehlo.maximum %2192, %cst_16 : tensor<100x1x2048xbf16>
-    %2194 = stablehlo.reshape %2193 : (tensor<100x1x2048xbf16>) -> tensor<100x2048xbf16>
-    %2195 = stablehlo.convert %2194 : (tensor<100x2048xbf16>) -> tensor<100x2048xf32>
-    %2196 = stablehlo.dot_general %2195, %arg361, contracting_dims = [1] x [0] : (tensor<100x2048xf32>, tensor<2048x256xf32>) -> tensor<100x256xf32>
-    %2197 = stablehlo.broadcast_in_dim %2196, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2198 = stablehlo.multiply %2197, %1523 : tensor<100x256xf32>
-    %2199 = stablehlo.broadcast_in_dim %2198, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2200 = stablehlo.broadcast_in_dim %arg362, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2201 = stablehlo.add %2199, %2200 : tensor<100x256xf32>
-    %2202 = stablehlo.convert %2201 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2203 = stablehlo.reshape %2202 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2204 = stablehlo.add %2182, %2203 : tensor<100x1x256xbf16>
-    %2205 = stablehlo.convert %2204 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2206 = stablehlo.convert %2205 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2207 = stablehlo.reduce(%2206 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2208 = stablehlo.reshape %2207 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2209 = stablehlo.broadcast_in_dim %2208, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2210 = stablehlo.divide %2209, %1536 : tensor<100x1x1xf64>
-    %2211 = stablehlo.broadcast_in_dim %2206, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2212 = stablehlo.broadcast_in_dim %2210, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2213 = stablehlo.subtract %2211, %2212 : tensor<100x1x256xf64>
-    %2214 = stablehlo.multiply %2213, %2213 : tensor<100x1x256xf64>
-    %2215 = stablehlo.reduce(%2214 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2216 = stablehlo.reshape %2215 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2217 = stablehlo.broadcast_in_dim %2216, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2218 = stablehlo.divide %2217, %1536 : tensor<100x1x1xf64>
-    %2219 = stablehlo.convert %2218 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2220 = stablehlo.reduce(%2205 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2221 = stablehlo.reshape %2220 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2222 = stablehlo.broadcast_in_dim %2221, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2223 = stablehlo.divide %2222, %1550 : tensor<100x1x1xf32>
-    %2224 = stablehlo.broadcast_in_dim %2219, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2225 = stablehlo.add %2224, %1553 : tensor<100x1x1xf32>
-    %2226 = stablehlo.rsqrt %2225 : tensor<100x1x1xf32>
-    %2227 = stablehlo.broadcast_in_dim %2205, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2228 = stablehlo.broadcast_in_dim %2223, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2229 = stablehlo.subtract %2227, %2228 : tensor<100x1x256xf32>
-    %2230 = stablehlo.broadcast_in_dim %2229, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2231 = stablehlo.broadcast_in_dim %2226, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2232 = stablehlo.multiply %2230, %2231 : tensor<100x1x256xf32>
-    %2233 = stablehlo.convert %arg96 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2234 = stablehlo.broadcast_in_dim %2232, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2235 = stablehlo.broadcast_in_dim %2233, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2236 = stablehlo.multiply %2234, %2235 : tensor<100x1x256xf32>
-    %2237 = stablehlo.convert %arg97 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2238 = stablehlo.broadcast_in_dim %2236, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2239 = stablehlo.broadcast_in_dim %2237, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2240 = stablehlo.add %2238, %2239 : tensor<100x1x256xf32>
-    %2241 = stablehlo.convert %2240 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2242 = stablehlo.convert %2241 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2243 = stablehlo.convert %2242 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2244 = stablehlo.reduce(%2243 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2245 = stablehlo.reshape %2244 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2246 = stablehlo.broadcast_in_dim %2245, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2247 = stablehlo.divide %2246, %1536 : tensor<100x1x1xf64>
-    %2248 = stablehlo.broadcast_in_dim %2243, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2249 = stablehlo.broadcast_in_dim %2247, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2250 = stablehlo.subtract %2248, %2249 : tensor<100x1x256xf64>
-    %2251 = stablehlo.multiply %2250, %2250 : tensor<100x1x256xf64>
-    %2252 = stablehlo.reduce(%2251 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2253 = stablehlo.reshape %2252 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2254 = stablehlo.broadcast_in_dim %2253, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2255 = stablehlo.divide %2254, %1536 : tensor<100x1x1xf64>
-    %2256 = stablehlo.convert %2255 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2257 = stablehlo.reduce(%2242 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2258 = stablehlo.reshape %2257 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2259 = stablehlo.broadcast_in_dim %2258, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2260 = stablehlo.divide %2259, %1550 : tensor<100x1x1xf32>
-    %2261 = stablehlo.broadcast_in_dim %2256, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2262 = stablehlo.add %2261, %1553 : tensor<100x1x1xf32>
-    %2263 = stablehlo.rsqrt %2262 : tensor<100x1x1xf32>
-    %2264 = stablehlo.broadcast_in_dim %2242, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2265 = stablehlo.broadcast_in_dim %2260, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2266 = stablehlo.subtract %2264, %2265 : tensor<100x1x256xf32>
-    %2267 = stablehlo.broadcast_in_dim %2266, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2268 = stablehlo.broadcast_in_dim %2263, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2269 = stablehlo.multiply %2267, %2268 : tensor<100x1x256xf32>
-    %2270 = stablehlo.broadcast_in_dim %2269, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2271 = stablehlo.multiply %2270, %1661 : tensor<100x1x256xf32>
-    %2272 = stablehlo.broadcast_in_dim %2271, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2273 = stablehlo.add %2272, %1665 : tensor<100x1x256xf32>
-    %2274 = stablehlo.convert %2273 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2275 = stablehlo.add %2241, %arg320 : tensor<100x1x256xbf16>
-    %2276 = stablehlo.reshape %2275 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2277 = stablehlo.convert %2276 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2278 = stablehlo.dot_general %2277, %arg363, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2279 = stablehlo.broadcast_in_dim %2278, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2280 = stablehlo.multiply %2279, %1523 : tensor<100x256xf32>
-    %2281 = stablehlo.broadcast_in_dim %2280, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2282 = stablehlo.broadcast_in_dim %arg364, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2283 = stablehlo.add %2281, %2282 : tensor<100x256xf32>
-    %2284 = stablehlo.convert %2283 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2285 = stablehlo.reshape %2284 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2286 = stablehlo.dot_general %2277, %arg365, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2287 = stablehlo.broadcast_in_dim %2286, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2288 = stablehlo.multiply %2287, %1523 : tensor<100x256xf32>
-    %2289 = stablehlo.broadcast_in_dim %2288, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2290 = stablehlo.broadcast_in_dim %arg366, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2291 = stablehlo.add %2289, %2290 : tensor<100x256xf32>
-    %2292 = stablehlo.convert %2291 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2293 = stablehlo.reshape %2292 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2294 = stablehlo.reshape %2241 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2295 = stablehlo.convert %2294 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2296 = stablehlo.dot_general %2295, %arg367, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2297 = stablehlo.broadcast_in_dim %2296, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2298 = stablehlo.multiply %2297, %1523 : tensor<100x256xf32>
-    %2299 = stablehlo.broadcast_in_dim %2298, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2300 = stablehlo.broadcast_in_dim %arg368, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2301 = stablehlo.add %2299, %2300 : tensor<100x256xf32>
-    %2302 = stablehlo.convert %2301 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2303 = stablehlo.reshape %2302 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2304 = stablehlo.reshape %2285 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2305 = stablehlo.transpose %2304, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2306 = stablehlo.reshape %2293 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2307 = stablehlo.transpose %2306, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2308 = stablehlo.reshape %2303 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2309 = stablehlo.transpose %2308, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2310 = stablehlo.broadcast_in_dim %2305, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2311 = stablehlo.multiply %2310, %1704 : tensor<8x100x32xbf16>
-    %2312 = stablehlo.transpose %2307, dims = [0, 2, 1] : (tensor<8x100x32xbf16>) -> tensor<8x32x100xbf16>
-    %2313 = stablehlo.broadcast_in_dim %2312, dims = [0, 1, 2] : (tensor<8x32x100xbf16>) -> tensor<8x32x100xbf16>
-    %2314 = stablehlo.dot_general %2311, %2313, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x100xbf16>) -> tensor<8x100x100xbf16>
-    %2315 = stablehlo.convert %2314 : (tensor<8x100x100xbf16>) -> tensor<8x100x100xf32>
-    %2316 = stablehlo.reduce(%2315 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2317 = stablehlo.reshape %2316 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2318 = stablehlo.broadcast_in_dim %2315, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2319 = stablehlo.broadcast_in_dim %2317, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2320 = stablehlo.subtract %2318, %2319 : tensor<8x100x100xf32>
-    %2321 = stablehlo.exponential %2320 : tensor<8x100x100xf32>
-    %2322 = stablehlo.reduce(%2321 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2323 = stablehlo.reshape %2322 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2324 = stablehlo.broadcast_in_dim %2321, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2325 = stablehlo.broadcast_in_dim %2323, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2326 = stablehlo.divide %2324, %2325 : tensor<8x100x100xf32>
-    %2327 = stablehlo.convert %2326 : (tensor<8x100x100xf32>) -> tensor<8x100x100xbf16>
-    %2328 = stablehlo.broadcast_in_dim %2309, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2329 = stablehlo.dot_general %2327, %2328, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x100xbf16>, tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2330 = stablehlo.transpose %2329, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %2331 = stablehlo.reshape %2330 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %2332 = stablehlo.convert %2331 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2333 = stablehlo.dot_general %2332, %arg369, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2334 = stablehlo.broadcast_in_dim %2333, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2335 = stablehlo.multiply %2334, %1523 : tensor<100x256xf32>
-    %2336 = stablehlo.broadcast_in_dim %2335, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2337 = stablehlo.broadcast_in_dim %arg370, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2338 = stablehlo.add %2336, %2337 : tensor<100x256xf32>
-    %2339 = stablehlo.convert %2338 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2340 = stablehlo.reshape %2339 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2341 = stablehlo.add %2241, %2340 : tensor<100x1x256xbf16>
-    %2342 = stablehlo.convert %2341 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2343 = stablehlo.convert %2342 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2344 = stablehlo.reduce(%2343 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2345 = stablehlo.reshape %2344 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2346 = stablehlo.broadcast_in_dim %2345, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2347 = stablehlo.divide %2346, %1536 : tensor<100x1x1xf64>
-    %2348 = stablehlo.broadcast_in_dim %2343, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2349 = stablehlo.broadcast_in_dim %2347, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2350 = stablehlo.subtract %2348, %2349 : tensor<100x1x256xf64>
-    %2351 = stablehlo.multiply %2350, %2350 : tensor<100x1x256xf64>
-    %2352 = stablehlo.reduce(%2351 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2353 = stablehlo.reshape %2352 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2354 = stablehlo.broadcast_in_dim %2353, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2355 = stablehlo.divide %2354, %1536 : tensor<100x1x1xf64>
-    %2356 = stablehlo.convert %2355 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2357 = stablehlo.reduce(%2342 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2358 = stablehlo.reshape %2357 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2359 = stablehlo.broadcast_in_dim %2358, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2360 = stablehlo.divide %2359, %1550 : tensor<100x1x1xf32>
-    %2361 = stablehlo.broadcast_in_dim %2356, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2362 = stablehlo.add %2361, %1553 : tensor<100x1x1xf32>
-    %2363 = stablehlo.rsqrt %2362 : tensor<100x1x1xf32>
-    %2364 = stablehlo.broadcast_in_dim %2342, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2365 = stablehlo.broadcast_in_dim %2360, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2366 = stablehlo.subtract %2364, %2365 : tensor<100x1x256xf32>
-    %2367 = stablehlo.broadcast_in_dim %2366, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2368 = stablehlo.broadcast_in_dim %2363, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2369 = stablehlo.multiply %2367, %2368 : tensor<100x1x256xf32>
-    %2370 = stablehlo.convert %arg98 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2371 = stablehlo.broadcast_in_dim %2369, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2372 = stablehlo.broadcast_in_dim %2370, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2373 = stablehlo.multiply %2371, %2372 : tensor<100x1x256xf32>
-    %2374 = stablehlo.convert %arg99 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2375 = stablehlo.broadcast_in_dim %2373, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2376 = stablehlo.broadcast_in_dim %2374, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2377 = stablehlo.add %2375, %2376 : tensor<100x1x256xf32>
-    %2378 = stablehlo.convert %2377 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2379 = stablehlo.add %2378, %arg320 : tensor<100x1x256xbf16>
-    %2380 = stablehlo.reshape %2379 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2381 = stablehlo.convert %2380 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2382 = stablehlo.dot_general %2381, %arg371, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2383 = stablehlo.broadcast_in_dim %2382, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2384 = stablehlo.multiply %2383, %1523 : tensor<100x256xf32>
-    %2385 = stablehlo.broadcast_in_dim %2384, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2386 = stablehlo.broadcast_in_dim %arg372, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2387 = stablehlo.add %2385, %2386 : tensor<100x256xf32>
-    %2388 = stablehlo.convert %2387 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2389 = stablehlo.reshape %2388 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2390 = stablehlo.dot_general %1471, %arg373, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %2391 = stablehlo.broadcast_in_dim %2390, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2392 = stablehlo.multiply %2391, %515 : tensor<920x256xf32>
-    %2393 = stablehlo.broadcast_in_dim %2392, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2394 = stablehlo.broadcast_in_dim %arg374, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %2395 = stablehlo.add %2393, %2394 : tensor<920x256xf32>
-    %2396 = stablehlo.convert %2395 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %2397 = stablehlo.reshape %2396 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %2398 = stablehlo.dot_general %1481, %arg375, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %2399 = stablehlo.broadcast_in_dim %2398, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2400 = stablehlo.multiply %2399, %515 : tensor<920x256xf32>
-    %2401 = stablehlo.broadcast_in_dim %2400, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2402 = stablehlo.broadcast_in_dim %arg376, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %2403 = stablehlo.add %2401, %2402 : tensor<920x256xf32>
-    %2404 = stablehlo.convert %2403 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %2405 = stablehlo.reshape %2404 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %2406 = stablehlo.reshape %2389 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2407 = stablehlo.transpose %2406, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2408 = stablehlo.reshape %2397 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %2409 = stablehlo.transpose %2408, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %2410 = stablehlo.reshape %2405 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %2411 = stablehlo.transpose %2410, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %2412 = stablehlo.broadcast_in_dim %2407, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2413 = stablehlo.multiply %2412, %1704 : tensor<8x100x32xbf16>
-    %2414 = stablehlo.transpose %2409, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %2415 = stablehlo.broadcast_in_dim %2414, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %2416 = stablehlo.dot_general %2413, %2415, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16>
-    %2417 = stablehlo.broadcast_in_dim %2416, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %2418 = stablehlo.multiply %2417, %1498 : tensor<8x100x920xbf16>
-    %2419 = stablehlo.broadcast_in_dim %2418, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %2420 = stablehlo.broadcast_in_dim %arg377, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x100x920xbf16>
-    %2421 = stablehlo.add %2419, %2420 : tensor<8x100x920xbf16>
-    %2422 = stablehlo.convert %2421 : (tensor<8x100x920xbf16>) -> tensor<8x100x920xf32>
-    %2423 = stablehlo.reduce(%2422 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2424 = stablehlo.reshape %2423 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2425 = stablehlo.broadcast_in_dim %2422, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %2426 = stablehlo.broadcast_in_dim %2424, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %2427 = stablehlo.subtract %2425, %2426 : tensor<8x100x920xf32>
-    %2428 = stablehlo.exponential %2427 : tensor<8x100x920xf32>
-    %2429 = stablehlo.reduce(%2428 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2430 = stablehlo.reshape %2429 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2431 = stablehlo.broadcast_in_dim %2428, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %2432 = stablehlo.broadcast_in_dim %2430, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %2433 = stablehlo.divide %2431, %2432 : tensor<8x100x920xf32>
-    %2434 = stablehlo.convert %2433 : (tensor<8x100x920xf32>) -> tensor<8x100x920xbf16>
-    %2435 = stablehlo.broadcast_in_dim %2411, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %2436 = stablehlo.dot_general %2434, %2435, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x100x32xbf16>
-    %2437 = stablehlo.transpose %2436, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %2438 = stablehlo.reshape %2437 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %2439 = stablehlo.convert %2438 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2440 = stablehlo.dot_general %2439, %arg378, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2441 = stablehlo.broadcast_in_dim %2440, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2442 = stablehlo.multiply %2441, %1523 : tensor<100x256xf32>
-    %2443 = stablehlo.broadcast_in_dim %2442, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2444 = stablehlo.broadcast_in_dim %arg379, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2445 = stablehlo.add %2443, %2444 : tensor<100x256xf32>
-    %2446 = stablehlo.convert %2445 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2447 = stablehlo.reshape %2446 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2448 = stablehlo.add %2378, %2447 : tensor<100x1x256xbf16>
-    %2449 = stablehlo.convert %2448 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2450 = stablehlo.convert %2449 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2451 = stablehlo.reduce(%2450 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2452 = stablehlo.reshape %2451 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2453 = stablehlo.broadcast_in_dim %2452, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2454 = stablehlo.divide %2453, %1536 : tensor<100x1x1xf64>
-    %2455 = stablehlo.broadcast_in_dim %2450, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2456 = stablehlo.broadcast_in_dim %2454, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2457 = stablehlo.subtract %2455, %2456 : tensor<100x1x256xf64>
-    %2458 = stablehlo.multiply %2457, %2457 : tensor<100x1x256xf64>
-    %2459 = stablehlo.reduce(%2458 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2460 = stablehlo.reshape %2459 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2461 = stablehlo.broadcast_in_dim %2460, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2462 = stablehlo.divide %2461, %1536 : tensor<100x1x1xf64>
-    %2463 = stablehlo.convert %2462 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2464 = stablehlo.reduce(%2449 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2465 = stablehlo.reshape %2464 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2466 = stablehlo.broadcast_in_dim %2465, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2467 = stablehlo.divide %2466, %1550 : tensor<100x1x1xf32>
-    %2468 = stablehlo.broadcast_in_dim %2463, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2469 = stablehlo.add %2468, %1553 : tensor<100x1x1xf32>
-    %2470 = stablehlo.rsqrt %2469 : tensor<100x1x1xf32>
-    %2471 = stablehlo.broadcast_in_dim %2449, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2472 = stablehlo.broadcast_in_dim %2467, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2473 = stablehlo.subtract %2471, %2472 : tensor<100x1x256xf32>
-    %2474 = stablehlo.broadcast_in_dim %2473, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2475 = stablehlo.broadcast_in_dim %2470, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2476 = stablehlo.multiply %2474, %2475 : tensor<100x1x256xf32>
-    %2477 = stablehlo.convert %arg100 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2478 = stablehlo.broadcast_in_dim %2476, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2479 = stablehlo.broadcast_in_dim %2477, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2480 = stablehlo.multiply %2478, %2479 : tensor<100x1x256xf32>
-    %2481 = stablehlo.convert %arg101 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2482 = stablehlo.broadcast_in_dim %2480, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2483 = stablehlo.broadcast_in_dim %2481, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2484 = stablehlo.add %2482, %2483 : tensor<100x1x256xf32>
-    %2485 = stablehlo.convert %2484 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2486 = stablehlo.reshape %2485 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2487 = stablehlo.convert %2486 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2488 = stablehlo.dot_general %2487, %arg380, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x2048xf32>) -> tensor<100x2048xf32>
-    %2489 = stablehlo.broadcast_in_dim %2488, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %2490 = stablehlo.multiply %2489, %1575 : tensor<100x2048xf32>
-    %2491 = stablehlo.broadcast_in_dim %2490, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %2492 = stablehlo.broadcast_in_dim %arg381, dims = [1] : (tensor<2048xf32>) -> tensor<100x2048xf32>
-    %2493 = stablehlo.add %2491, %2492 : tensor<100x2048xf32>
-    %2494 = stablehlo.convert %2493 : (tensor<100x2048xf32>) -> tensor<100x2048xbf16>
-    %2495 = stablehlo.reshape %2494 : (tensor<100x2048xbf16>) -> tensor<100x1x2048xbf16>
-    %2496 = stablehlo.maximum %2495, %cst_16 : tensor<100x1x2048xbf16>
-    %2497 = stablehlo.reshape %2496 : (tensor<100x1x2048xbf16>) -> tensor<100x2048xbf16>
-    %2498 = stablehlo.convert %2497 : (tensor<100x2048xbf16>) -> tensor<100x2048xf32>
-    %2499 = stablehlo.dot_general %2498, %arg382, contracting_dims = [1] x [0] : (tensor<100x2048xf32>, tensor<2048x256xf32>) -> tensor<100x256xf32>
-    %2500 = stablehlo.broadcast_in_dim %2499, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2501 = stablehlo.multiply %2500, %1523 : tensor<100x256xf32>
-    %2502 = stablehlo.broadcast_in_dim %2501, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2503 = stablehlo.broadcast_in_dim %arg383, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2504 = stablehlo.add %2502, %2503 : tensor<100x256xf32>
-    %2505 = stablehlo.convert %2504 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2506 = stablehlo.reshape %2505 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2507 = stablehlo.add %2485, %2506 : tensor<100x1x256xbf16>
-    %2508 = stablehlo.convert %2507 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2509 = stablehlo.convert %2508 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2510 = stablehlo.reduce(%2509 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2511 = stablehlo.reshape %2510 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2512 = stablehlo.broadcast_in_dim %2511, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2513 = stablehlo.divide %2512, %1536 : tensor<100x1x1xf64>
-    %2514 = stablehlo.broadcast_in_dim %2509, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2515 = stablehlo.broadcast_in_dim %2513, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2516 = stablehlo.subtract %2514, %2515 : tensor<100x1x256xf64>
-    %2517 = stablehlo.multiply %2516, %2516 : tensor<100x1x256xf64>
-    %2518 = stablehlo.reduce(%2517 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2519 = stablehlo.reshape %2518 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2520 = stablehlo.broadcast_in_dim %2519, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2521 = stablehlo.divide %2520, %1536 : tensor<100x1x1xf64>
-    %2522 = stablehlo.convert %2521 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2523 = stablehlo.reduce(%2508 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2524 = stablehlo.reshape %2523 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2525 = stablehlo.broadcast_in_dim %2524, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2526 = stablehlo.divide %2525, %1550 : tensor<100x1x1xf32>
-    %2527 = stablehlo.broadcast_in_dim %2522, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2528 = stablehlo.add %2527, %1553 : tensor<100x1x1xf32>
-    %2529 = stablehlo.rsqrt %2528 : tensor<100x1x1xf32>
-    %2530 = stablehlo.broadcast_in_dim %2508, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2531 = stablehlo.broadcast_in_dim %2526, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2532 = stablehlo.subtract %2530, %2531 : tensor<100x1x256xf32>
-    %2533 = stablehlo.broadcast_in_dim %2532, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2534 = stablehlo.broadcast_in_dim %2529, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2535 = stablehlo.multiply %2533, %2534 : tensor<100x1x256xf32>
-    %2536 = stablehlo.convert %arg102 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2537 = stablehlo.broadcast_in_dim %2535, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2538 = stablehlo.broadcast_in_dim %2536, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2539 = stablehlo.multiply %2537, %2538 : tensor<100x1x256xf32>
-    %2540 = stablehlo.convert %arg103 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2541 = stablehlo.broadcast_in_dim %2539, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2542 = stablehlo.broadcast_in_dim %2540, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2543 = stablehlo.add %2541, %2542 : tensor<100x1x256xf32>
-    %2544 = stablehlo.convert %2543 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2545 = stablehlo.convert %2544 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2546 = stablehlo.convert %2545 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2547 = stablehlo.reduce(%2546 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2548 = stablehlo.reshape %2547 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2549 = stablehlo.broadcast_in_dim %2548, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2550 = stablehlo.divide %2549, %1536 : tensor<100x1x1xf64>
-    %2551 = stablehlo.broadcast_in_dim %2546, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2552 = stablehlo.broadcast_in_dim %2550, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2553 = stablehlo.subtract %2551, %2552 : tensor<100x1x256xf64>
-    %2554 = stablehlo.multiply %2553, %2553 : tensor<100x1x256xf64>
-    %2555 = stablehlo.reduce(%2554 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2556 = stablehlo.reshape %2555 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2557 = stablehlo.broadcast_in_dim %2556, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2558 = stablehlo.divide %2557, %1536 : tensor<100x1x1xf64>
-    %2559 = stablehlo.convert %2558 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2560 = stablehlo.reduce(%2545 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2561 = stablehlo.reshape %2560 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2562 = stablehlo.broadcast_in_dim %2561, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2563 = stablehlo.divide %2562, %1550 : tensor<100x1x1xf32>
-    %2564 = stablehlo.broadcast_in_dim %2559, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2565 = stablehlo.add %2564, %1553 : tensor<100x1x1xf32>
-    %2566 = stablehlo.rsqrt %2565 : tensor<100x1x1xf32>
-    %2567 = stablehlo.broadcast_in_dim %2545, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2568 = stablehlo.broadcast_in_dim %2563, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2569 = stablehlo.subtract %2567, %2568 : tensor<100x1x256xf32>
-    %2570 = stablehlo.broadcast_in_dim %2569, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2571 = stablehlo.broadcast_in_dim %2566, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2572 = stablehlo.multiply %2570, %2571 : tensor<100x1x256xf32>
-    %2573 = stablehlo.broadcast_in_dim %2572, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2574 = stablehlo.multiply %2573, %1661 : tensor<100x1x256xf32>
-    %2575 = stablehlo.broadcast_in_dim %2574, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2576 = stablehlo.add %2575, %1665 : tensor<100x1x256xf32>
-    %2577 = stablehlo.convert %2576 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2578 = stablehlo.add %2544, %arg320 : tensor<100x1x256xbf16>
-    %2579 = stablehlo.reshape %2578 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2580 = stablehlo.convert %2579 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2581 = stablehlo.dot_general %2580, %arg384, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2582 = stablehlo.broadcast_in_dim %2581, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2583 = stablehlo.multiply %2582, %1523 : tensor<100x256xf32>
-    %2584 = stablehlo.broadcast_in_dim %2583, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2585 = stablehlo.broadcast_in_dim %arg385, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2586 = stablehlo.add %2584, %2585 : tensor<100x256xf32>
-    %2587 = stablehlo.convert %2586 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2588 = stablehlo.reshape %2587 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2589 = stablehlo.dot_general %2580, %arg386, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2590 = stablehlo.broadcast_in_dim %2589, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2591 = stablehlo.multiply %2590, %1523 : tensor<100x256xf32>
-    %2592 = stablehlo.broadcast_in_dim %2591, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2593 = stablehlo.broadcast_in_dim %arg387, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2594 = stablehlo.add %2592, %2593 : tensor<100x256xf32>
-    %2595 = stablehlo.convert %2594 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2596 = stablehlo.reshape %2595 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2597 = stablehlo.reshape %2544 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2598 = stablehlo.convert %2597 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2599 = stablehlo.dot_general %2598, %arg388, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2600 = stablehlo.broadcast_in_dim %2599, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2601 = stablehlo.multiply %2600, %1523 : tensor<100x256xf32>
-    %2602 = stablehlo.broadcast_in_dim %2601, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2603 = stablehlo.broadcast_in_dim %arg389, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2604 = stablehlo.add %2602, %2603 : tensor<100x256xf32>
-    %2605 = stablehlo.convert %2604 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2606 = stablehlo.reshape %2605 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2607 = stablehlo.reshape %2588 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2608 = stablehlo.transpose %2607, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2609 = stablehlo.reshape %2596 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2610 = stablehlo.transpose %2609, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2611 = stablehlo.reshape %2606 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2612 = stablehlo.transpose %2611, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2613 = stablehlo.broadcast_in_dim %2608, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2614 = stablehlo.multiply %2613, %1704 : tensor<8x100x32xbf16>
-    %2615 = stablehlo.transpose %2610, dims = [0, 2, 1] : (tensor<8x100x32xbf16>) -> tensor<8x32x100xbf16>
-    %2616 = stablehlo.broadcast_in_dim %2615, dims = [0, 1, 2] : (tensor<8x32x100xbf16>) -> tensor<8x32x100xbf16>
-    %2617 = stablehlo.dot_general %2614, %2616, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x100xbf16>) -> tensor<8x100x100xbf16>
-    %2618 = stablehlo.convert %2617 : (tensor<8x100x100xbf16>) -> tensor<8x100x100xf32>
-    %2619 = stablehlo.reduce(%2618 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2620 = stablehlo.reshape %2619 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2621 = stablehlo.broadcast_in_dim %2618, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2622 = stablehlo.broadcast_in_dim %2620, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2623 = stablehlo.subtract %2621, %2622 : tensor<8x100x100xf32>
-    %2624 = stablehlo.exponential %2623 : tensor<8x100x100xf32>
-    %2625 = stablehlo.reduce(%2624 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2626 = stablehlo.reshape %2625 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2627 = stablehlo.broadcast_in_dim %2624, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2628 = stablehlo.broadcast_in_dim %2626, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2629 = stablehlo.divide %2627, %2628 : tensor<8x100x100xf32>
-    %2630 = stablehlo.convert %2629 : (tensor<8x100x100xf32>) -> tensor<8x100x100xbf16>
-    %2631 = stablehlo.broadcast_in_dim %2612, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2632 = stablehlo.dot_general %2630, %2631, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x100xbf16>, tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2633 = stablehlo.transpose %2632, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %2634 = stablehlo.reshape %2633 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %2635 = stablehlo.convert %2634 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2636 = stablehlo.dot_general %2635, %arg390, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2637 = stablehlo.broadcast_in_dim %2636, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2638 = stablehlo.multiply %2637, %1523 : tensor<100x256xf32>
-    %2639 = stablehlo.broadcast_in_dim %2638, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2640 = stablehlo.broadcast_in_dim %arg391, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2641 = stablehlo.add %2639, %2640 : tensor<100x256xf32>
-    %2642 = stablehlo.convert %2641 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2643 = stablehlo.reshape %2642 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2644 = stablehlo.add %2544, %2643 : tensor<100x1x256xbf16>
-    %2645 = stablehlo.convert %2644 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2646 = stablehlo.convert %2645 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2647 = stablehlo.reduce(%2646 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2648 = stablehlo.reshape %2647 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2649 = stablehlo.broadcast_in_dim %2648, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2650 = stablehlo.divide %2649, %1536 : tensor<100x1x1xf64>
-    %2651 = stablehlo.broadcast_in_dim %2646, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2652 = stablehlo.broadcast_in_dim %2650, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2653 = stablehlo.subtract %2651, %2652 : tensor<100x1x256xf64>
-    %2654 = stablehlo.multiply %2653, %2653 : tensor<100x1x256xf64>
-    %2655 = stablehlo.reduce(%2654 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2656 = stablehlo.reshape %2655 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2657 = stablehlo.broadcast_in_dim %2656, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2658 = stablehlo.divide %2657, %1536 : tensor<100x1x1xf64>
-    %2659 = stablehlo.convert %2658 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2660 = stablehlo.reduce(%2645 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2661 = stablehlo.reshape %2660 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2662 = stablehlo.broadcast_in_dim %2661, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2663 = stablehlo.divide %2662, %1550 : tensor<100x1x1xf32>
-    %2664 = stablehlo.broadcast_in_dim %2659, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2665 = stablehlo.add %2664, %1553 : tensor<100x1x1xf32>
-    %2666 = stablehlo.rsqrt %2665 : tensor<100x1x1xf32>
-    %2667 = stablehlo.broadcast_in_dim %2645, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2668 = stablehlo.broadcast_in_dim %2663, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2669 = stablehlo.subtract %2667, %2668 : tensor<100x1x256xf32>
-    %2670 = stablehlo.broadcast_in_dim %2669, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2671 = stablehlo.broadcast_in_dim %2666, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2672 = stablehlo.multiply %2670, %2671 : tensor<100x1x256xf32>
-    %2673 = stablehlo.convert %arg104 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2674 = stablehlo.broadcast_in_dim %2672, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2675 = stablehlo.broadcast_in_dim %2673, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2676 = stablehlo.multiply %2674, %2675 : tensor<100x1x256xf32>
-    %2677 = stablehlo.convert %arg105 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2678 = stablehlo.broadcast_in_dim %2676, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2679 = stablehlo.broadcast_in_dim %2677, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2680 = stablehlo.add %2678, %2679 : tensor<100x1x256xf32>
-    %2681 = stablehlo.convert %2680 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2682 = stablehlo.add %2681, %arg320 : tensor<100x1x256xbf16>
-    %2683 = stablehlo.reshape %2682 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2684 = stablehlo.convert %2683 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2685 = stablehlo.dot_general %2684, %arg392, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2686 = stablehlo.broadcast_in_dim %2685, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2687 = stablehlo.multiply %2686, %1523 : tensor<100x256xf32>
-    %2688 = stablehlo.broadcast_in_dim %2687, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2689 = stablehlo.broadcast_in_dim %arg393, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2690 = stablehlo.add %2688, %2689 : tensor<100x256xf32>
-    %2691 = stablehlo.convert %2690 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2692 = stablehlo.reshape %2691 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2693 = stablehlo.dot_general %1471, %arg394, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %2694 = stablehlo.broadcast_in_dim %2693, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2695 = stablehlo.multiply %2694, %515 : tensor<920x256xf32>
-    %2696 = stablehlo.broadcast_in_dim %2695, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2697 = stablehlo.broadcast_in_dim %arg395, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %2698 = stablehlo.add %2696, %2697 : tensor<920x256xf32>
-    %2699 = stablehlo.convert %2698 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %2700 = stablehlo.reshape %2699 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %2701 = stablehlo.dot_general %1481, %arg396, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %2702 = stablehlo.broadcast_in_dim %2701, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2703 = stablehlo.multiply %2702, %515 : tensor<920x256xf32>
-    %2704 = stablehlo.broadcast_in_dim %2703, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2705 = stablehlo.broadcast_in_dim %arg397, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %2706 = stablehlo.add %2704, %2705 : tensor<920x256xf32>
-    %2707 = stablehlo.convert %2706 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %2708 = stablehlo.reshape %2707 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %2709 = stablehlo.reshape %2692 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2710 = stablehlo.transpose %2709, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2711 = stablehlo.reshape %2700 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %2712 = stablehlo.transpose %2711, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %2713 = stablehlo.reshape %2708 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %2714 = stablehlo.transpose %2713, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %2715 = stablehlo.broadcast_in_dim %2710, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2716 = stablehlo.multiply %2715, %1704 : tensor<8x100x32xbf16>
-    %2717 = stablehlo.transpose %2712, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %2718 = stablehlo.broadcast_in_dim %2717, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %2719 = stablehlo.dot_general %2716, %2718, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16>
-    %2720 = stablehlo.broadcast_in_dim %2719, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %2721 = stablehlo.multiply %2720, %1498 : tensor<8x100x920xbf16>
-    %2722 = stablehlo.broadcast_in_dim %2721, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %2723 = stablehlo.broadcast_in_dim %arg398, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x100x920xbf16>
-    %2724 = stablehlo.add %2722, %2723 : tensor<8x100x920xbf16>
-    %2725 = stablehlo.convert %2724 : (tensor<8x100x920xbf16>) -> tensor<8x100x920xf32>
-    %2726 = stablehlo.reduce(%2725 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2727 = stablehlo.reshape %2726 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2728 = stablehlo.broadcast_in_dim %2725, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %2729 = stablehlo.broadcast_in_dim %2727, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %2730 = stablehlo.subtract %2728, %2729 : tensor<8x100x920xf32>
-    %2731 = stablehlo.exponential %2730 : tensor<8x100x920xf32>
-    %2732 = stablehlo.reduce(%2731 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2733 = stablehlo.reshape %2732 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2734 = stablehlo.broadcast_in_dim %2731, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %2735 = stablehlo.broadcast_in_dim %2733, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %2736 = stablehlo.divide %2734, %2735 : tensor<8x100x920xf32>
-    %2737 = stablehlo.convert %2736 : (tensor<8x100x920xf32>) -> tensor<8x100x920xbf16>
-    %2738 = stablehlo.broadcast_in_dim %2714, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %2739 = stablehlo.dot_general %2737, %2738, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x100x32xbf16>
-    %2740 = stablehlo.transpose %2739, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %2741 = stablehlo.reshape %2740 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %2742 = stablehlo.convert %2741 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2743 = stablehlo.dot_general %2742, %arg399, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2744 = stablehlo.broadcast_in_dim %2743, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2745 = stablehlo.multiply %2744, %1523 : tensor<100x256xf32>
-    %2746 = stablehlo.broadcast_in_dim %2745, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2747 = stablehlo.broadcast_in_dim %arg400, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2748 = stablehlo.add %2746, %2747 : tensor<100x256xf32>
-    %2749 = stablehlo.convert %2748 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2750 = stablehlo.reshape %2749 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2751 = stablehlo.add %2681, %2750 : tensor<100x1x256xbf16>
-    %2752 = stablehlo.convert %2751 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2753 = stablehlo.convert %2752 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2754 = stablehlo.reduce(%2753 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2755 = stablehlo.reshape %2754 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2756 = stablehlo.broadcast_in_dim %2755, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2757 = stablehlo.divide %2756, %1536 : tensor<100x1x1xf64>
-    %2758 = stablehlo.broadcast_in_dim %2753, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2759 = stablehlo.broadcast_in_dim %2757, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2760 = stablehlo.subtract %2758, %2759 : tensor<100x1x256xf64>
-    %2761 = stablehlo.multiply %2760, %2760 : tensor<100x1x256xf64>
-    %2762 = stablehlo.reduce(%2761 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2763 = stablehlo.reshape %2762 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2764 = stablehlo.broadcast_in_dim %2763, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2765 = stablehlo.divide %2764, %1536 : tensor<100x1x1xf64>
-    %2766 = stablehlo.convert %2765 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2767 = stablehlo.reduce(%2752 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2768 = stablehlo.reshape %2767 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2769 = stablehlo.broadcast_in_dim %2768, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2770 = stablehlo.divide %2769, %1550 : tensor<100x1x1xf32>
-    %2771 = stablehlo.broadcast_in_dim %2766, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2772 = stablehlo.add %2771, %1553 : tensor<100x1x1xf32>
-    %2773 = stablehlo.rsqrt %2772 : tensor<100x1x1xf32>
-    %2774 = stablehlo.broadcast_in_dim %2752, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2775 = stablehlo.broadcast_in_dim %2770, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2776 = stablehlo.subtract %2774, %2775 : tensor<100x1x256xf32>
-    %2777 = stablehlo.broadcast_in_dim %2776, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2778 = stablehlo.broadcast_in_dim %2773, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2779 = stablehlo.multiply %2777, %2778 : tensor<100x1x256xf32>
-    %2780 = stablehlo.convert %arg106 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2781 = stablehlo.broadcast_in_dim %2779, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2782 = stablehlo.broadcast_in_dim %2780, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2783 = stablehlo.multiply %2781, %2782 : tensor<100x1x256xf32>
-    %2784 = stablehlo.convert %arg107 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2785 = stablehlo.broadcast_in_dim %2783, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2786 = stablehlo.broadcast_in_dim %2784, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2787 = stablehlo.add %2785, %2786 : tensor<100x1x256xf32>
-    %2788 = stablehlo.convert %2787 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2789 = stablehlo.reshape %2788 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2790 = stablehlo.convert %2789 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2791 = stablehlo.dot_general %2790, %arg401, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x2048xf32>) -> tensor<100x2048xf32>
-    %2792 = stablehlo.broadcast_in_dim %2791, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %2793 = stablehlo.multiply %2792, %1575 : tensor<100x2048xf32>
-    %2794 = stablehlo.broadcast_in_dim %2793, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %2795 = stablehlo.broadcast_in_dim %arg402, dims = [1] : (tensor<2048xf32>) -> tensor<100x2048xf32>
-    %2796 = stablehlo.add %2794, %2795 : tensor<100x2048xf32>
-    %2797 = stablehlo.convert %2796 : (tensor<100x2048xf32>) -> tensor<100x2048xbf16>
-    %2798 = stablehlo.reshape %2797 : (tensor<100x2048xbf16>) -> tensor<100x1x2048xbf16>
-    %2799 = stablehlo.maximum %2798, %cst_16 : tensor<100x1x2048xbf16>
-    %2800 = stablehlo.reshape %2799 : (tensor<100x1x2048xbf16>) -> tensor<100x2048xbf16>
-    %2801 = stablehlo.convert %2800 : (tensor<100x2048xbf16>) -> tensor<100x2048xf32>
-    %2802 = stablehlo.dot_general %2801, %arg403, contracting_dims = [1] x [0] : (tensor<100x2048xf32>, tensor<2048x256xf32>) -> tensor<100x256xf32>
-    %2803 = stablehlo.broadcast_in_dim %2802, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2804 = stablehlo.multiply %2803, %1523 : tensor<100x256xf32>
-    %2805 = stablehlo.broadcast_in_dim %2804, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2806 = stablehlo.broadcast_in_dim %arg404, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2807 = stablehlo.add %2805, %2806 : tensor<100x256xf32>
-    %2808 = stablehlo.convert %2807 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2809 = stablehlo.reshape %2808 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2810 = stablehlo.add %2788, %2809 : tensor<100x1x256xbf16>
-    %2811 = stablehlo.convert %2810 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2812 = stablehlo.convert %2811 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2813 = stablehlo.reduce(%2812 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2814 = stablehlo.reshape %2813 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2815 = stablehlo.broadcast_in_dim %2814, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2816 = stablehlo.divide %2815, %1536 : tensor<100x1x1xf64>
-    %2817 = stablehlo.broadcast_in_dim %2812, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2818 = stablehlo.broadcast_in_dim %2816, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2819 = stablehlo.subtract %2817, %2818 : tensor<100x1x256xf64>
-    %2820 = stablehlo.multiply %2819, %2819 : tensor<100x1x256xf64>
-    %2821 = stablehlo.reduce(%2820 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2822 = stablehlo.reshape %2821 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2823 = stablehlo.broadcast_in_dim %2822, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2824 = stablehlo.divide %2823, %1536 : tensor<100x1x1xf64>
-    %2825 = stablehlo.convert %2824 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2826 = stablehlo.reduce(%2811 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2827 = stablehlo.reshape %2826 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2828 = stablehlo.broadcast_in_dim %2827, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2829 = stablehlo.divide %2828, %1550 : tensor<100x1x1xf32>
-    %2830 = stablehlo.broadcast_in_dim %2825, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2831 = stablehlo.add %2830, %1553 : tensor<100x1x1xf32>
-    %2832 = stablehlo.rsqrt %2831 : tensor<100x1x1xf32>
-    %2833 = stablehlo.broadcast_in_dim %2811, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2834 = stablehlo.broadcast_in_dim %2829, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2835 = stablehlo.subtract %2833, %2834 : tensor<100x1x256xf32>
-    %2836 = stablehlo.broadcast_in_dim %2835, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2837 = stablehlo.broadcast_in_dim %2832, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2838 = stablehlo.multiply %2836, %2837 : tensor<100x1x256xf32>
-    %2839 = stablehlo.convert %arg108 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2840 = stablehlo.broadcast_in_dim %2838, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2841 = stablehlo.broadcast_in_dim %2839, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2842 = stablehlo.multiply %2840, %2841 : tensor<100x1x256xf32>
-    %2843 = stablehlo.convert %arg109 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2844 = stablehlo.broadcast_in_dim %2842, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2845 = stablehlo.broadcast_in_dim %2843, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2846 = stablehlo.add %2844, %2845 : tensor<100x1x256xf32>
-    %2847 = stablehlo.convert %2846 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2848 = stablehlo.convert %2847 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2849 = stablehlo.convert %2848 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2850 = stablehlo.reduce(%2849 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2851 = stablehlo.reshape %2850 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2852 = stablehlo.broadcast_in_dim %2851, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2853 = stablehlo.divide %2852, %1536 : tensor<100x1x1xf64>
-    %2854 = stablehlo.broadcast_in_dim %2849, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2855 = stablehlo.broadcast_in_dim %2853, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2856 = stablehlo.subtract %2854, %2855 : tensor<100x1x256xf64>
-    %2857 = stablehlo.multiply %2856, %2856 : tensor<100x1x256xf64>
-    %2858 = stablehlo.reduce(%2857 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2859 = stablehlo.reshape %2858 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2860 = stablehlo.broadcast_in_dim %2859, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2861 = stablehlo.divide %2860, %1536 : tensor<100x1x1xf64>
-    %2862 = stablehlo.convert %2861 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2863 = stablehlo.reduce(%2848 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2864 = stablehlo.reshape %2863 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2865 = stablehlo.broadcast_in_dim %2864, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2866 = stablehlo.divide %2865, %1550 : tensor<100x1x1xf32>
-    %2867 = stablehlo.broadcast_in_dim %2862, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2868 = stablehlo.add %2867, %1553 : tensor<100x1x1xf32>
-    %2869 = stablehlo.rsqrt %2868 : tensor<100x1x1xf32>
-    %2870 = stablehlo.broadcast_in_dim %2848, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2871 = stablehlo.broadcast_in_dim %2866, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2872 = stablehlo.subtract %2870, %2871 : tensor<100x1x256xf32>
-    %2873 = stablehlo.broadcast_in_dim %2872, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2874 = stablehlo.broadcast_in_dim %2869, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2875 = stablehlo.multiply %2873, %2874 : tensor<100x1x256xf32>
-    %2876 = stablehlo.broadcast_in_dim %2875, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2877 = stablehlo.multiply %2876, %1661 : tensor<100x1x256xf32>
-    %2878 = stablehlo.broadcast_in_dim %2877, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2879 = stablehlo.add %2878, %1665 : tensor<100x1x256xf32>
-    %2880 = stablehlo.convert %2879 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2881 = stablehlo.add %2847, %arg320 : tensor<100x1x256xbf16>
-    %2882 = stablehlo.reshape %2881 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2883 = stablehlo.convert %2882 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2884 = stablehlo.dot_general %2883, %arg405, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2885 = stablehlo.broadcast_in_dim %2884, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2886 = stablehlo.multiply %2885, %1523 : tensor<100x256xf32>
-    %2887 = stablehlo.broadcast_in_dim %2886, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2888 = stablehlo.broadcast_in_dim %arg406, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2889 = stablehlo.add %2887, %2888 : tensor<100x256xf32>
-    %2890 = stablehlo.convert %2889 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2891 = stablehlo.reshape %2890 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2892 = stablehlo.dot_general %2883, %arg407, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2893 = stablehlo.broadcast_in_dim %2892, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2894 = stablehlo.multiply %2893, %1523 : tensor<100x256xf32>
-    %2895 = stablehlo.broadcast_in_dim %2894, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2896 = stablehlo.broadcast_in_dim %arg408, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2897 = stablehlo.add %2895, %2896 : tensor<100x256xf32>
-    %2898 = stablehlo.convert %2897 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2899 = stablehlo.reshape %2898 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2900 = stablehlo.reshape %2847 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2901 = stablehlo.convert %2900 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2902 = stablehlo.dot_general %2901, %arg409, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2903 = stablehlo.broadcast_in_dim %2902, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2904 = stablehlo.multiply %2903, %1523 : tensor<100x256xf32>
-    %2905 = stablehlo.broadcast_in_dim %2904, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2906 = stablehlo.broadcast_in_dim %arg410, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2907 = stablehlo.add %2905, %2906 : tensor<100x256xf32>
-    %2908 = stablehlo.convert %2907 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2909 = stablehlo.reshape %2908 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2910 = stablehlo.reshape %2891 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2911 = stablehlo.transpose %2910, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2912 = stablehlo.reshape %2899 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2913 = stablehlo.transpose %2912, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2914 = stablehlo.reshape %2909 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %2915 = stablehlo.transpose %2914, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %2916 = stablehlo.broadcast_in_dim %2911, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2917 = stablehlo.multiply %2916, %1704 : tensor<8x100x32xbf16>
-    %2918 = stablehlo.transpose %2913, dims = [0, 2, 1] : (tensor<8x100x32xbf16>) -> tensor<8x32x100xbf16>
-    %2919 = stablehlo.broadcast_in_dim %2918, dims = [0, 1, 2] : (tensor<8x32x100xbf16>) -> tensor<8x32x100xbf16>
-    %2920 = stablehlo.dot_general %2917, %2919, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x100xbf16>) -> tensor<8x100x100xbf16>
-    %2921 = stablehlo.convert %2920 : (tensor<8x100x100xbf16>) -> tensor<8x100x100xf32>
-    %2922 = stablehlo.reduce(%2921 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2923 = stablehlo.reshape %2922 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2924 = stablehlo.broadcast_in_dim %2921, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2925 = stablehlo.broadcast_in_dim %2923, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2926 = stablehlo.subtract %2924, %2925 : tensor<8x100x100xf32>
-    %2927 = stablehlo.exponential %2926 : tensor<8x100x100xf32>
-    %2928 = stablehlo.reduce(%2927 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x100xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %2929 = stablehlo.reshape %2928 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %2930 = stablehlo.broadcast_in_dim %2927, dims = [0, 1, 2] : (tensor<8x100x100xf32>) -> tensor<8x100x100xf32>
-    %2931 = stablehlo.broadcast_in_dim %2929, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x100xf32>
-    %2932 = stablehlo.divide %2930, %2931 : tensor<8x100x100xf32>
-    %2933 = stablehlo.convert %2932 : (tensor<8x100x100xf32>) -> tensor<8x100x100xbf16>
-    %2934 = stablehlo.broadcast_in_dim %2915, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2935 = stablehlo.dot_general %2933, %2934, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x100xbf16>, tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %2936 = stablehlo.transpose %2935, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %2937 = stablehlo.reshape %2936 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %2938 = stablehlo.convert %2937 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2939 = stablehlo.dot_general %2938, %arg411, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2940 = stablehlo.broadcast_in_dim %2939, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2941 = stablehlo.multiply %2940, %1523 : tensor<100x256xf32>
-    %2942 = stablehlo.broadcast_in_dim %2941, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2943 = stablehlo.broadcast_in_dim %arg412, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2944 = stablehlo.add %2942, %2943 : tensor<100x256xf32>
-    %2945 = stablehlo.convert %2944 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2946 = stablehlo.reshape %2945 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2947 = stablehlo.add %2847, %2946 : tensor<100x1x256xbf16>
-    %2948 = stablehlo.convert %2947 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %2949 = stablehlo.convert %2948 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %2950 = stablehlo.reduce(%2949 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2951 = stablehlo.reshape %2950 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2952 = stablehlo.broadcast_in_dim %2951, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2953 = stablehlo.divide %2952, %1536 : tensor<100x1x1xf64>
-    %2954 = stablehlo.broadcast_in_dim %2949, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %2955 = stablehlo.broadcast_in_dim %2953, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %2956 = stablehlo.subtract %2954, %2955 : tensor<100x1x256xf64>
-    %2957 = stablehlo.multiply %2956, %2956 : tensor<100x1x256xf64>
-    %2958 = stablehlo.reduce(%2957 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %2959 = stablehlo.reshape %2958 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %2960 = stablehlo.broadcast_in_dim %2959, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %2961 = stablehlo.divide %2960, %1536 : tensor<100x1x1xf64>
-    %2962 = stablehlo.convert %2961 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %2963 = stablehlo.reduce(%2948 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %2964 = stablehlo.reshape %2963 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %2965 = stablehlo.broadcast_in_dim %2964, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2966 = stablehlo.divide %2965, %1550 : tensor<100x1x1xf32>
-    %2967 = stablehlo.broadcast_in_dim %2962, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %2968 = stablehlo.add %2967, %1553 : tensor<100x1x1xf32>
-    %2969 = stablehlo.rsqrt %2968 : tensor<100x1x1xf32>
-    %2970 = stablehlo.broadcast_in_dim %2948, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2971 = stablehlo.broadcast_in_dim %2966, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2972 = stablehlo.subtract %2970, %2971 : tensor<100x1x256xf32>
-    %2973 = stablehlo.broadcast_in_dim %2972, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2974 = stablehlo.broadcast_in_dim %2969, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %2975 = stablehlo.multiply %2973, %2974 : tensor<100x1x256xf32>
-    %2976 = stablehlo.convert %arg110 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2977 = stablehlo.broadcast_in_dim %2975, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2978 = stablehlo.broadcast_in_dim %2976, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2979 = stablehlo.multiply %2977, %2978 : tensor<100x1x256xf32>
-    %2980 = stablehlo.convert %arg111 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2981 = stablehlo.broadcast_in_dim %2979, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %2982 = stablehlo.broadcast_in_dim %2980, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %2983 = stablehlo.add %2981, %2982 : tensor<100x1x256xf32>
-    %2984 = stablehlo.convert %2983 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %2985 = stablehlo.add %2984, %arg320 : tensor<100x1x256xbf16>
-    %2986 = stablehlo.reshape %2985 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %2987 = stablehlo.convert %2986 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %2988 = stablehlo.dot_general %2987, %arg413, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %2989 = stablehlo.broadcast_in_dim %2988, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2990 = stablehlo.multiply %2989, %1523 : tensor<100x256xf32>
-    %2991 = stablehlo.broadcast_in_dim %2990, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %2992 = stablehlo.broadcast_in_dim %arg414, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %2993 = stablehlo.add %2991, %2992 : tensor<100x256xf32>
-    %2994 = stablehlo.convert %2993 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %2995 = stablehlo.reshape %2994 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %2996 = stablehlo.dot_general %1471, %arg415, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %2997 = stablehlo.broadcast_in_dim %2996, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %2998 = stablehlo.multiply %2997, %515 : tensor<920x256xf32>
-    %2999 = stablehlo.broadcast_in_dim %2998, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %3000 = stablehlo.broadcast_in_dim %arg416, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %3001 = stablehlo.add %2999, %3000 : tensor<920x256xf32>
-    %3002 = stablehlo.convert %3001 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %3003 = stablehlo.reshape %3002 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %3004 = stablehlo.dot_general %1481, %arg417, contracting_dims = [1] x [0] : (tensor<920x256xf32>, tensor<256x256xf32>) -> tensor<920x256xf32>
-    %3005 = stablehlo.broadcast_in_dim %3004, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %3006 = stablehlo.multiply %3005, %515 : tensor<920x256xf32>
-    %3007 = stablehlo.broadcast_in_dim %3006, dims = [0, 1] : (tensor<920x256xf32>) -> tensor<920x256xf32>
-    %3008 = stablehlo.broadcast_in_dim %arg418, dims = [1] : (tensor<256xf32>) -> tensor<920x256xf32>
-    %3009 = stablehlo.add %3007, %3008 : tensor<920x256xf32>
-    %3010 = stablehlo.convert %3009 : (tensor<920x256xf32>) -> tensor<920x256xbf16>
-    %3011 = stablehlo.reshape %3010 : (tensor<920x256xbf16>) -> tensor<920x1x256xbf16>
-    %3012 = stablehlo.reshape %2995 : (tensor<100x1x256xbf16>) -> tensor<100x8x32xbf16>
-    %3013 = stablehlo.transpose %3012, dims = [1, 0, 2] : (tensor<100x8x32xbf16>) -> tensor<8x100x32xbf16>
-    %3014 = stablehlo.reshape %3003 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %3015 = stablehlo.transpose %3014, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %3016 = stablehlo.reshape %3011 : (tensor<920x1x256xbf16>) -> tensor<920x8x32xbf16>
-    %3017 = stablehlo.transpose %3016, dims = [1, 0, 2] : (tensor<920x8x32xbf16>) -> tensor<8x920x32xbf16>
-    %3018 = stablehlo.broadcast_in_dim %3013, dims = [0, 1, 2] : (tensor<8x100x32xbf16>) -> tensor<8x100x32xbf16>
-    %3019 = stablehlo.multiply %3018, %1704 : tensor<8x100x32xbf16>
-    %3020 = stablehlo.transpose %3015, dims = [0, 2, 1] : (tensor<8x920x32xbf16>) -> tensor<8x32x920xbf16>
-    %3021 = stablehlo.broadcast_in_dim %3020, dims = [0, 1, 2] : (tensor<8x32x920xbf16>) -> tensor<8x32x920xbf16>
-    %3022 = stablehlo.dot_general %3019, %3021, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x32xbf16>, tensor<8x32x920xbf16>) -> tensor<8x100x920xbf16>
-    %3023 = stablehlo.broadcast_in_dim %3022, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %3024 = stablehlo.multiply %3023, %1498 : tensor<8x100x920xbf16>
-    %3025 = stablehlo.broadcast_in_dim %3024, dims = [0, 1, 2] : (tensor<8x100x920xbf16>) -> tensor<8x100x920xbf16>
-    %3026 = stablehlo.broadcast_in_dim %arg419, dims = [0, 1, 2] : (tensor<8x1x920xbf16>) -> tensor<8x100x920xbf16>
-    %3027 = stablehlo.add %3025, %3026 : tensor<8x100x920xbf16>
-    %3028 = stablehlo.convert %3027 : (tensor<8x100x920xbf16>) -> tensor<8x100x920xf32>
-    %3029 = stablehlo.reduce(%3028 init: %cst_12) applies stablehlo.maximum across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %3030 = stablehlo.reshape %3029 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %3031 = stablehlo.broadcast_in_dim %3028, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %3032 = stablehlo.broadcast_in_dim %3030, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %3033 = stablehlo.subtract %3031, %3032 : tensor<8x100x920xf32>
-    %3034 = stablehlo.exponential %3033 : tensor<8x100x920xf32>
-    %3035 = stablehlo.reduce(%3034 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<8x100x920xf32>, tensor<f32>) -> tensor<8x100xf32>
-    %3036 = stablehlo.reshape %3035 : (tensor<8x100xf32>) -> tensor<8x100x1xf32>
-    %3037 = stablehlo.broadcast_in_dim %3034, dims = [0, 1, 2] : (tensor<8x100x920xf32>) -> tensor<8x100x920xf32>
-    %3038 = stablehlo.broadcast_in_dim %3036, dims = [0, 1, 2] : (tensor<8x100x1xf32>) -> tensor<8x100x920xf32>
-    %3039 = stablehlo.divide %3037, %3038 : tensor<8x100x920xf32>
-    %3040 = stablehlo.convert %3039 : (tensor<8x100x920xf32>) -> tensor<8x100x920xbf16>
-    %3041 = stablehlo.broadcast_in_dim %3017, dims = [0, 1, 2] : (tensor<8x920x32xbf16>) -> tensor<8x920x32xbf16>
-    %3042 = stablehlo.dot_general %3040, %3041, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x100x920xbf16>, tensor<8x920x32xbf16>) -> tensor<8x100x32xbf16>
-    %3043 = stablehlo.transpose %3042, dims = [1, 0, 2] : (tensor<8x100x32xbf16>) -> tensor<100x8x32xbf16>
-    %3044 = stablehlo.reshape %3043 : (tensor<100x8x32xbf16>) -> tensor<100x256xbf16>
-    %3045 = stablehlo.convert %3044 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %3046 = stablehlo.dot_general %3045, %arg420, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x256xf32>) -> tensor<100x256xf32>
-    %3047 = stablehlo.broadcast_in_dim %3046, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %3048 = stablehlo.multiply %3047, %1523 : tensor<100x256xf32>
-    %3049 = stablehlo.broadcast_in_dim %3048, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %3050 = stablehlo.broadcast_in_dim %arg421, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %3051 = stablehlo.add %3049, %3050 : tensor<100x256xf32>
-    %3052 = stablehlo.convert %3051 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %3053 = stablehlo.reshape %3052 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %3054 = stablehlo.add %2984, %3053 : tensor<100x1x256xbf16>
-    %3055 = stablehlo.convert %3054 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %3056 = stablehlo.convert %3055 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %3057 = stablehlo.reduce(%3056 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %3058 = stablehlo.reshape %3057 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %3059 = stablehlo.broadcast_in_dim %3058, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %3060 = stablehlo.divide %3059, %1536 : tensor<100x1x1xf64>
-    %3061 = stablehlo.broadcast_in_dim %3056, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %3062 = stablehlo.broadcast_in_dim %3060, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %3063 = stablehlo.subtract %3061, %3062 : tensor<100x1x256xf64>
-    %3064 = stablehlo.multiply %3063, %3063 : tensor<100x1x256xf64>
-    %3065 = stablehlo.reduce(%3064 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %3066 = stablehlo.reshape %3065 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %3067 = stablehlo.broadcast_in_dim %3066, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %3068 = stablehlo.divide %3067, %1536 : tensor<100x1x1xf64>
-    %3069 = stablehlo.convert %3068 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %3070 = stablehlo.reduce(%3055 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %3071 = stablehlo.reshape %3070 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %3072 = stablehlo.broadcast_in_dim %3071, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %3073 = stablehlo.divide %3072, %1550 : tensor<100x1x1xf32>
-    %3074 = stablehlo.broadcast_in_dim %3069, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %3075 = stablehlo.add %3074, %1553 : tensor<100x1x1xf32>
-    %3076 = stablehlo.rsqrt %3075 : tensor<100x1x1xf32>
-    %3077 = stablehlo.broadcast_in_dim %3055, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3078 = stablehlo.broadcast_in_dim %3073, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %3079 = stablehlo.subtract %3077, %3078 : tensor<100x1x256xf32>
-    %3080 = stablehlo.broadcast_in_dim %3079, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3081 = stablehlo.broadcast_in_dim %3076, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %3082 = stablehlo.multiply %3080, %3081 : tensor<100x1x256xf32>
-    %3083 = stablehlo.convert %arg112 : (tensor<256xbf16>) -> tensor<256xf32>
-    %3084 = stablehlo.broadcast_in_dim %3082, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3085 = stablehlo.broadcast_in_dim %3083, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %3086 = stablehlo.multiply %3084, %3085 : tensor<100x1x256xf32>
-    %3087 = stablehlo.convert %arg113 : (tensor<256xbf16>) -> tensor<256xf32>
-    %3088 = stablehlo.broadcast_in_dim %3086, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3089 = stablehlo.broadcast_in_dim %3087, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %3090 = stablehlo.add %3088, %3089 : tensor<100x1x256xf32>
-    %3091 = stablehlo.convert %3090 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %3092 = stablehlo.reshape %3091 : (tensor<100x1x256xbf16>) -> tensor<100x256xbf16>
-    %3093 = stablehlo.convert %3092 : (tensor<100x256xbf16>) -> tensor<100x256xf32>
-    %3094 = stablehlo.dot_general %3093, %arg422, contracting_dims = [1] x [0] : (tensor<100x256xf32>, tensor<256x2048xf32>) -> tensor<100x2048xf32>
-    %3095 = stablehlo.broadcast_in_dim %3094, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %3096 = stablehlo.multiply %3095, %1575 : tensor<100x2048xf32>
-    %3097 = stablehlo.broadcast_in_dim %3096, dims = [0, 1] : (tensor<100x2048xf32>) -> tensor<100x2048xf32>
-    %3098 = stablehlo.broadcast_in_dim %arg423, dims = [1] : (tensor<2048xf32>) -> tensor<100x2048xf32>
-    %3099 = stablehlo.add %3097, %3098 : tensor<100x2048xf32>
-    %3100 = stablehlo.convert %3099 : (tensor<100x2048xf32>) -> tensor<100x2048xbf16>
-    %3101 = stablehlo.reshape %3100 : (tensor<100x2048xbf16>) -> tensor<100x1x2048xbf16>
-    %3102 = stablehlo.maximum %3101, %cst_16 : tensor<100x1x2048xbf16>
-    %3103 = stablehlo.reshape %3102 : (tensor<100x1x2048xbf16>) -> tensor<100x2048xbf16>
-    %3104 = stablehlo.convert %3103 : (tensor<100x2048xbf16>) -> tensor<100x2048xf32>
-    %3105 = stablehlo.dot_general %3104, %arg424, contracting_dims = [1] x [0] : (tensor<100x2048xf32>, tensor<2048x256xf32>) -> tensor<100x256xf32>
-    %3106 = stablehlo.broadcast_in_dim %3105, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %3107 = stablehlo.multiply %3106, %1523 : tensor<100x256xf32>
-    %3108 = stablehlo.broadcast_in_dim %3107, dims = [0, 1] : (tensor<100x256xf32>) -> tensor<100x256xf32>
-    %3109 = stablehlo.broadcast_in_dim %arg425, dims = [1] : (tensor<256xf32>) -> tensor<100x256xf32>
-    %3110 = stablehlo.add %3108, %3109 : tensor<100x256xf32>
-    %3111 = stablehlo.convert %3110 : (tensor<100x256xf32>) -> tensor<100x256xbf16>
-    %3112 = stablehlo.reshape %3111 : (tensor<100x256xbf16>) -> tensor<100x1x256xbf16>
-    %3113 = stablehlo.add %3091, %3112 : tensor<100x1x256xbf16>
-    %3114 = stablehlo.convert %3113 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %3115 = stablehlo.convert %3114 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %3116 = stablehlo.reduce(%3115 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %3117 = stablehlo.reshape %3116 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %3118 = stablehlo.broadcast_in_dim %3117, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %3119 = stablehlo.divide %3118, %1536 : tensor<100x1x1xf64>
-    %3120 = stablehlo.broadcast_in_dim %3115, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %3121 = stablehlo.broadcast_in_dim %3119, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %3122 = stablehlo.subtract %3120, %3121 : tensor<100x1x256xf64>
-    %3123 = stablehlo.multiply %3122, %3122 : tensor<100x1x256xf64>
-    %3124 = stablehlo.reduce(%3123 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %3125 = stablehlo.reshape %3124 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %3126 = stablehlo.broadcast_in_dim %3125, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %3127 = stablehlo.divide %3126, %1536 : tensor<100x1x1xf64>
-    %3128 = stablehlo.convert %3127 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %3129 = stablehlo.reduce(%3114 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %3130 = stablehlo.reshape %3129 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %3131 = stablehlo.broadcast_in_dim %3130, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %3132 = stablehlo.divide %3131, %1550 : tensor<100x1x1xf32>
-    %3133 = stablehlo.broadcast_in_dim %3128, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %3134 = stablehlo.add %3133, %1553 : tensor<100x1x1xf32>
-    %3135 = stablehlo.rsqrt %3134 : tensor<100x1x1xf32>
-    %3136 = stablehlo.broadcast_in_dim %3114, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3137 = stablehlo.broadcast_in_dim %3132, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %3138 = stablehlo.subtract %3136, %3137 : tensor<100x1x256xf32>
-    %3139 = stablehlo.broadcast_in_dim %3138, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3140 = stablehlo.broadcast_in_dim %3135, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %3141 = stablehlo.multiply %3139, %3140 : tensor<100x1x256xf32>
-    %3142 = stablehlo.convert %arg114 : (tensor<256xbf16>) -> tensor<256xf32>
-    %3143 = stablehlo.broadcast_in_dim %3141, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3144 = stablehlo.broadcast_in_dim %3142, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %3145 = stablehlo.multiply %3143, %3144 : tensor<100x1x256xf32>
-    %3146 = stablehlo.convert %arg115 : (tensor<256xbf16>) -> tensor<256xf32>
-    %3147 = stablehlo.broadcast_in_dim %3145, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3148 = stablehlo.broadcast_in_dim %3146, dims = [2] : (tensor<256xf32>) -> tensor<100x1x256xf32>
-    %3149 = stablehlo.add %3147, %3148 : tensor<100x1x256xf32>
-    %3150 = stablehlo.convert %3149 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %3151 = stablehlo.convert %3150 : (tensor<100x1x256xbf16>) -> tensor<100x1x256xf32>
-    %3152 = stablehlo.convert %3151 : (tensor<100x1x256xf32>) -> tensor<100x1x256xf64>
-    %3153 = stablehlo.reduce(%3152 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %3154 = stablehlo.reshape %3153 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %3155 = stablehlo.broadcast_in_dim %3154, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %3156 = stablehlo.divide %3155, %1536 : tensor<100x1x1xf64>
-    %3157 = stablehlo.broadcast_in_dim %3152, dims = [0, 1, 2] : (tensor<100x1x256xf64>) -> tensor<100x1x256xf64>
-    %3158 = stablehlo.broadcast_in_dim %3156, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x256xf64>
-    %3159 = stablehlo.subtract %3157, %3158 : tensor<100x1x256xf64>
-    %3160 = stablehlo.multiply %3159, %3159 : tensor<100x1x256xf64>
-    %3161 = stablehlo.reduce(%3160 init: %cst_14) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf64>, tensor<f64>) -> tensor<100x1xf64>
-    %3162 = stablehlo.reshape %3161 : (tensor<100x1xf64>) -> tensor<100x1x1xf64>
-    %3163 = stablehlo.broadcast_in_dim %3162, dims = [0, 1, 2] : (tensor<100x1x1xf64>) -> tensor<100x1x1xf64>
-    %3164 = stablehlo.divide %3163, %1536 : tensor<100x1x1xf64>
-    %3165 = stablehlo.convert %3164 : (tensor<100x1x1xf64>) -> tensor<100x1x1xf32>
-    %3166 = stablehlo.reduce(%3151 init: %cst_13) applies stablehlo.add across dimensions = [2] : (tensor<100x1x256xf32>, tensor<f32>) -> tensor<100x1xf32>
-    %3167 = stablehlo.reshape %3166 : (tensor<100x1xf32>) -> tensor<100x1x1xf32>
-    %3168 = stablehlo.broadcast_in_dim %3167, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %3169 = stablehlo.divide %3168, %1550 : tensor<100x1x1xf32>
-    %3170 = stablehlo.broadcast_in_dim %3165, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x1xf32>
-    %3171 = stablehlo.add %3170, %1553 : tensor<100x1x1xf32>
-    %3172 = stablehlo.rsqrt %3171 : tensor<100x1x1xf32>
-    %3173 = stablehlo.broadcast_in_dim %3151, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3174 = stablehlo.broadcast_in_dim %3169, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %3175 = stablehlo.subtract %3173, %3174 : tensor<100x1x256xf32>
-    %3176 = stablehlo.broadcast_in_dim %3175, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3177 = stablehlo.broadcast_in_dim %3172, dims = [0, 1, 2] : (tensor<100x1x1xf32>) -> tensor<100x1x256xf32>
-    %3178 = stablehlo.multiply %3176, %3177 : tensor<100x1x256xf32>
-    %3179 = stablehlo.broadcast_in_dim %3178, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3180 = stablehlo.multiply %3179, %1661 : tensor<100x1x256xf32>
-    %3181 = stablehlo.broadcast_in_dim %3180, dims = [0, 1, 2] : (tensor<100x1x256xf32>) -> tensor<100x1x256xf32>
-    %3182 = stablehlo.add %3181, %1665 : tensor<100x1x256xf32>
-    %3183 = stablehlo.convert %3182 : (tensor<100x1x256xf32>) -> tensor<100x1x256xbf16>
-    %3184 = stablehlo.reshape %1667 : (tensor<100x1x256xbf16>) -> tensor<1x100x1x256xbf16>
-    %3185 = stablehlo.reshape %1971 : (tensor<100x1x256xbf16>) -> tensor<1x100x1x256xbf16>
-    %3186 = stablehlo.reshape %2274 : (tensor<100x1x256xbf16>) -> tensor<1x100x1x256xbf16>
-    %3187 = stablehlo.reshape %2577 : (tensor<100x1x256xbf16>) -> tensor<1x100x1x256xbf16>
-    %3188 = stablehlo.reshape %2880 : (tensor<100x1x256xbf16>) -> tensor<1x100x1x256xbf16>
-    %3189 = stablehlo.reshape %3183 : (tensor<100x1x256xbf16>) -> tensor<1x100x1x256xbf16>
-    %3190 = stablehlo.concatenate %3184, %3185, %3186, %3187, %3188, %3189, dim = 0 : (tensor<1x100x1x256xbf16>, tensor<1x100x1x256xbf16>, tensor<1x100x1x256xbf16>, tensor<1x100x1x256xbf16>, tensor<1x100x1x256xbf16>, tensor<1x100x1x256xbf16>) -> tensor<6x100x1x256xbf16>
-    %3191 = stablehlo.transpose %3190, dims = [0, 2, 1, 3] : (tensor<6x100x1x256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3192 = stablehlo.reshape %3191 : (tensor<6x1x100x256xbf16>) -> tensor<600x256xbf16>
-    %3193 = stablehlo.dot_general %3192, %arg426, contracting_dims = [1] x [0] : (tensor<600x256xbf16>, tensor<256x92xbf16>) -> tensor<600x92xbf16>
-    %3194 = stablehlo.reshape %3193 : (tensor<600x92xbf16>) -> tensor<6x1x100x92xbf16>
-    %3195 = stablehlo.broadcast_in_dim %3194, dims = [0, 1, 2, 3] : (tensor<6x1x100x92xbf16>) -> tensor<6x1x100x92xbf16>
-    %3196 = stablehlo.broadcast_in_dim %arg116, dims = [3] : (tensor<92xbf16>) -> tensor<6x1x100x92xbf16>
-    %3197 = stablehlo.add %3195, %3196 : tensor<6x1x100x92xbf16>
-    %3198 = stablehlo.reshape %3197 : (tensor<6x1x100x92xbf16>) -> tensor<600x92xbf16>
-    %3199 = stablehlo.dot_general %3192, %arg427, contracting_dims = [1] x [0] : (tensor<600x256xbf16>, tensor<256x256xbf16>) -> tensor<600x256xbf16>
-    %3200 = stablehlo.reshape %3199 : (tensor<600x256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3201 = stablehlo.broadcast_in_dim %3200, dims = [0, 1, 2, 3] : (tensor<6x1x100x256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3202 = stablehlo.broadcast_in_dim %arg117, dims = [3] : (tensor<256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3203 = stablehlo.add %3201, %3202 : tensor<6x1x100x256xbf16>
-    %3204 = stablehlo.reshape %3203 : (tensor<6x1x100x256xbf16>) -> tensor<600x256xbf16>
-    %3205 = stablehlo.reshape %3204 : (tensor<600x256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3206 = stablehlo.maximum %3205, %cst_17 : tensor<6x1x100x256xbf16>
-    %3207 = stablehlo.reshape %3206 : (tensor<6x1x100x256xbf16>) -> tensor<600x256xbf16>
-    %3208 = stablehlo.dot_general %3207, %arg428, contracting_dims = [1] x [0] : (tensor<600x256xbf16>, tensor<256x256xbf16>) -> tensor<600x256xbf16>
-    %3209 = stablehlo.reshape %3208 : (tensor<600x256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3210 = stablehlo.broadcast_in_dim %3209, dims = [0, 1, 2, 3] : (tensor<6x1x100x256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3211 = stablehlo.broadcast_in_dim %arg118, dims = [3] : (tensor<256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3212 = stablehlo.add %3210, %3211 : tensor<6x1x100x256xbf16>
-    %3213 = stablehlo.reshape %3212 : (tensor<6x1x100x256xbf16>) -> tensor<600x256xbf16>
-    %3214 = stablehlo.reshape %3213 : (tensor<600x256xbf16>) -> tensor<6x1x100x256xbf16>
-    %3215 = stablehlo.maximum %3214, %cst_17 : tensor<6x1x100x256xbf16>
-    %3216 = stablehlo.reshape %3215 : (tensor<6x1x100x256xbf16>) -> tensor<600x256xbf16>
-    %3217 = stablehlo.dot_general %3216, %arg429, contracting_dims = [1] x [0] : (tensor<600x256xbf16>, tensor<256x4xbf16>) -> tensor<600x4xbf16>
-    %3218 = stablehlo.reshape %3217 : (tensor<600x4xbf16>) -> tensor<6x1x100x4xbf16>
-    %3219 = stablehlo.broadcast_in_dim %3218, dims = [0, 1, 2, 3] : (tensor<6x1x100x4xbf16>) -> tensor<6x1x100x4xbf16>
-    %3220 = stablehlo.broadcast_in_dim %arg119, dims = [3] : (tensor<4xbf16>) -> tensor<6x1x100x4xbf16>
-    %3221 = stablehlo.add %3219, %3220 : tensor<6x1x100x4xbf16>
-    %3222 = stablehlo.reshape %3221 : (tensor<6x1x100x4xbf16>) -> tensor<600x4xbf16>
-    %3223 = stablehlo.reshape %3222 : (tensor<600x4xbf16>) -> tensor<6x1x100x4xbf16>
-    %3224 = stablehlo.logistic %3223 : tensor<6x1x100x4xbf16>
-    %3225 = stablehlo.slice %3224 [5:6, 0:1, 0:100, 0:4] : (tensor<6x1x100x4xbf16>) -> tensor<1x1x100x4xbf16>
-    %3226 = stablehlo.reshape %3225 : (tensor<1x1x100x4xbf16>) -> tensor<1x100x4xbf16>
-    %3227 = stablehlo.reshape %3198 : (tensor<600x92xbf16>) -> tensor<6x1x100x92xbf16>
-    %3228 = stablehlo.slice %3227 [5:6, 0:1, 0:100, 0:92] : (tensor<6x1x100x92xbf16>) -> tensor<1x1x100x92xbf16>
-    %3229 = stablehlo.reshape %3228 : (tensor<1x1x100x92xbf16>) -> tensor<1x100x92xbf16>
-    return %3229, %3226 : tensor<1x100x92xbf16>, tensor<1x100x4xbf16>
-  }
-}
diff --git a/mlir_tests/GLPN-KITTI.mlir b/mlir_tests/GLPN-KITTI.mlir
deleted file mode 100644
index 50ebd970..00000000
--- a/mlir_tests/GLPN-KITTI.mlir
+++ /dev/null
@@ -1,10953 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x480x640xbf16>, %arg1: tensor<64x3x7x7xbf16>, %arg2: tensor<64xbf16>, %arg3: tensor<64xbf16>, %arg4: tensor<64xbf16>, %arg5: tensor<64xbf16>, %arg6: tensor<64xbf16>, %arg7: tensor<64x64x8x8xbf16>, %arg8: tensor<64xbf16>, %arg9: tensor<64xbf16>, %arg10: tensor<64xbf16>, %arg11: tensor<64xbf16>, %arg12: tensor<64xbf16>, %arg13: tensor<256x1x3x3xbf16>, %arg14: tensor<256xbf16>, %arg15: tensor<64xbf16>, %arg16: tensor<64xbf16>, %arg17: tensor<64xbf16>, %arg18: tensor<64x64x8x8xbf16>, %arg19: tensor<64xbf16>, %arg20: tensor<64xbf16>, %arg21: tensor<64xbf16>, %arg22: tensor<64xbf16>, %arg23: tensor<64xbf16>, %arg24: tensor<256x1x3x3xbf16>, %arg25: tensor<256xbf16>, %arg26: tensor<64xbf16>, %arg27: tensor<64xbf16>, %arg28: tensor<64xbf16>, %arg29: tensor<64x64x8x8xbf16>, %arg30: tensor<64xbf16>, %arg31: tensor<64xbf16>, %arg32: tensor<64xbf16>, %arg33: tensor<64xbf16>, %arg34: tensor<64xbf16>, %arg35: tensor<256x1x3x3xbf16>, %arg36: tensor<256xbf16>, %arg37: tensor<64xbf16>, %arg38: tensor<64xbf16>, %arg39: tensor<64xbf16>, %arg40: tensor<128x64x3x3xbf16>, %arg41: tensor<128xbf16>, %arg42: tensor<128xbf16>, %arg43: tensor<128xbf16>, %arg44: tensor<128xbf16>, %arg45: tensor<128xbf16>, %arg46: tensor<128x128x4x4xbf16>, %arg47: tensor<128xbf16>, %arg48: tensor<128xbf16>, %arg49: tensor<128xbf16>, %arg50: tensor<128xbf16>, %arg51: tensor<128xbf16>, %arg52: tensor<512x1x3x3xbf16>, %arg53: tensor<512xbf16>, %arg54: tensor<128xbf16>, %arg55: tensor<128xbf16>, %arg56: tensor<128xbf16>, %arg57: tensor<128x128x4x4xbf16>, %arg58: tensor<128xbf16>, %arg59: tensor<128xbf16>, %arg60: tensor<128xbf16>, %arg61: tensor<128xbf16>, %arg62: tensor<128xbf16>, %arg63: tensor<512x1x3x3xbf16>, %arg64: tensor<512xbf16>, %arg65: tensor<128xbf16>, %arg66: tensor<128xbf16>, %arg67: tensor<128xbf16>, %arg68: tensor<128x128x4x4xbf16>, %arg69: tensor<128xbf16>, %arg70: tensor<128xbf16>, %arg71: tensor<128xbf16>, %arg72: tensor<128xbf16>, %arg73: tensor<128xbf16>, %arg74: tensor<512x1x3x3xbf16>, %arg75: tensor<512xbf16>, %arg76: tensor<128xbf16>, %arg77: tensor<128xbf16>, %arg78: tensor<128xbf16>, %arg79: tensor<128x128x4x4xbf16>, %arg80: tensor<128xbf16>, %arg81: tensor<128xbf16>, %arg82: tensor<128xbf16>, %arg83: tensor<128xbf16>, %arg84: tensor<128xbf16>, %arg85: tensor<512x1x3x3xbf16>, %arg86: tensor<512xbf16>, %arg87: tensor<128xbf16>, %arg88: tensor<128xbf16>, %arg89: tensor<128xbf16>, %arg90: tensor<128x128x4x4xbf16>, %arg91: tensor<128xbf16>, %arg92: tensor<128xbf16>, %arg93: tensor<128xbf16>, %arg94: tensor<128xbf16>, %arg95: tensor<128xbf16>, %arg96: tensor<512x1x3x3xbf16>, %arg97: tensor<512xbf16>, %arg98: tensor<128xbf16>, %arg99: tensor<128xbf16>, %arg100: tensor<128xbf16>, %arg101: tensor<128x128x4x4xbf16>, %arg102: tensor<128xbf16>, %arg103: tensor<128xbf16>, %arg104: tensor<128xbf16>, %arg105: tensor<128xbf16>, %arg106: tensor<128xbf16>, %arg107: tensor<512x1x3x3xbf16>, %arg108: tensor<512xbf16>, %arg109: tensor<128xbf16>, %arg110: tensor<128xbf16>, %arg111: tensor<128xbf16>, %arg112: tensor<128x128x4x4xbf16>, %arg113: tensor<128xbf16>, %arg114: tensor<128xbf16>, %arg115: tensor<128xbf16>, %arg116: tensor<128xbf16>, %arg117: tensor<128xbf16>, %arg118: tensor<512x1x3x3xbf16>, %arg119: tensor<512xbf16>, %arg120: tensor<128xbf16>, %arg121: tensor<128xbf16>, %arg122: tensor<128xbf16>, %arg123: tensor<128x128x4x4xbf16>, %arg124: tensor<128xbf16>, %arg125: tensor<128xbf16>, %arg126: tensor<128xbf16>, %arg127: tensor<128xbf16>, %arg128: tensor<128xbf16>, %arg129: tensor<512x1x3x3xbf16>, %arg130: tensor<512xbf16>, %arg131: tensor<128xbf16>, %arg132: tensor<128xbf16>, %arg133: tensor<128xbf16>, %arg134: tensor<320x128x3x3xbf16>, %arg135: tensor<320xbf16>, %arg136: tensor<320xbf16>, %arg137: tensor<320xbf16>, %arg138: tensor<320xbf16>, %arg139: tensor<320xbf16>, %arg140: tensor<320x320x2x2xbf16>, %arg141: tensor<320xbf16>, %arg142: tensor<320xbf16>, %arg143: tensor<320xbf16>, %arg144: tensor<320xbf16>, %arg145: tensor<320xbf16>, %arg146: tensor<1280x1x3x3xbf16>, %arg147: tensor<1280xbf16>, %arg148: tensor<320xbf16>, %arg149: tensor<320xbf16>, %arg150: tensor<320xbf16>, %arg151: tensor<320x320x2x2xbf16>, %arg152: tensor<320xbf16>, %arg153: tensor<320xbf16>, %arg154: tensor<320xbf16>, %arg155: tensor<320xbf16>, %arg156: tensor<320xbf16>, %arg157: tensor<1280x1x3x3xbf16>, %arg158: tensor<1280xbf16>, %arg159: tensor<320xbf16>, %arg160: tensor<320xbf16>, %arg161: tensor<320xbf16>, %arg162: tensor<320x320x2x2xbf16>, %arg163: tensor<320xbf16>, %arg164: tensor<320xbf16>, %arg165: tensor<320xbf16>, %arg166: tensor<320xbf16>, %arg167: tensor<320xbf16>, %arg168: tensor<1280x1x3x3xbf16>, %arg169: tensor<1280xbf16>, %arg170: tensor<320xbf16>, %arg171: tensor<320xbf16>, %arg172: tensor<320xbf16>, %arg173: tensor<320x320x2x2xbf16>, %arg174: tensor<320xbf16>, %arg175: tensor<320xbf16>, %arg176: tensor<320xbf16>, %arg177: tensor<320xbf16>, %arg178: tensor<320xbf16>, %arg179: tensor<1280x1x3x3xbf16>, %arg180: tensor<1280xbf16>, %arg181: tensor<320xbf16>, %arg182: tensor<320xbf16>, %arg183: tensor<320xbf16>, %arg184: tensor<320x320x2x2xbf16>, %arg185: tensor<320xbf16>, %arg186: tensor<320xbf16>, %arg187: tensor<320xbf16>, %arg188: tensor<320xbf16>, %arg189: tensor<320xbf16>, %arg190: tensor<1280x1x3x3xbf16>, %arg191: tensor<1280xbf16>, %arg192: tensor<320xbf16>, %arg193: tensor<320xbf16>, %arg194: tensor<320xbf16>, %arg195: tensor<320x320x2x2xbf16>, %arg196: tensor<320xbf16>, %arg197: tensor<320xbf16>, %arg198: tensor<320xbf16>, %arg199: tensor<320xbf16>, %arg200: tensor<320xbf16>, %arg201: tensor<1280x1x3x3xbf16>, %arg202: tensor<1280xbf16>, %arg203: tensor<320xbf16>, %arg204: tensor<320xbf16>, %arg205: tensor<320xbf16>, %arg206: tensor<320x320x2x2xbf16>, %arg207: tensor<320xbf16>, %arg208: tensor<320xbf16>, %arg209: tensor<320xbf16>, %arg210: tensor<320xbf16>, %arg211: tensor<320xbf16>, %arg212: tensor<1280x1x3x3xbf16>, %arg213: tensor<1280xbf16>, %arg214: tensor<320xbf16>, %arg215: tensor<320xbf16>, %arg216: tensor<320xbf16>, %arg217: tensor<320x320x2x2xbf16>, %arg218: tensor<320xbf16>, %arg219: tensor<320xbf16>, %arg220: tensor<320xbf16>, %arg221: tensor<320xbf16>, %arg222: tensor<320xbf16>, %arg223: tensor<1280x1x3x3xbf16>, %arg224: tensor<1280xbf16>, %arg225: tensor<320xbf16>, %arg226: tensor<320xbf16>, %arg227: tensor<320xbf16>, %arg228: tensor<320x320x2x2xbf16>, %arg229: tensor<320xbf16>, %arg230: tensor<320xbf16>, %arg231: tensor<320xbf16>, %arg232: tensor<320xbf16>, %arg233: tensor<320xbf16>, %arg234: tensor<1280x1x3x3xbf16>, %arg235: tensor<1280xbf16>, %arg236: tensor<320xbf16>, %arg237: tensor<320xbf16>, %arg238: tensor<320xbf16>, %arg239: tensor<320x320x2x2xbf16>, %arg240: tensor<320xbf16>, %arg241: tensor<320xbf16>, %arg242: tensor<320xbf16>, %arg243: tensor<320xbf16>, %arg244: tensor<320xbf16>, %arg245: tensor<1280x1x3x3xbf16>, %arg246: tensor<1280xbf16>, %arg247: tensor<320xbf16>, %arg248: tensor<320xbf16>, %arg249: tensor<320xbf16>, %arg250: tensor<320x320x2x2xbf16>, %arg251: tensor<320xbf16>, %arg252: tensor<320xbf16>, %arg253: tensor<320xbf16>, %arg254: tensor<320xbf16>, %arg255: tensor<320xbf16>, %arg256: tensor<1280x1x3x3xbf16>, %arg257: tensor<1280xbf16>, %arg258: tensor<320xbf16>, %arg259: tensor<320xbf16>, %arg260: tensor<320xbf16>, %arg261: tensor<320x320x2x2xbf16>, %arg262: tensor<320xbf16>, %arg263: tensor<320xbf16>, %arg264: tensor<320xbf16>, %arg265: tensor<320xbf16>, %arg266: tensor<320xbf16>, %arg267: tensor<1280x1x3x3xbf16>, %arg268: tensor<1280xbf16>, %arg269: tensor<320xbf16>, %arg270: tensor<320xbf16>, %arg271: tensor<320xbf16>, %arg272: tensor<320x320x2x2xbf16>, %arg273: tensor<320xbf16>, %arg274: tensor<320xbf16>, %arg275: tensor<320xbf16>, %arg276: tensor<320xbf16>, %arg277: tensor<320xbf16>, %arg278: tensor<1280x1x3x3xbf16>, %arg279: tensor<1280xbf16>, %arg280: tensor<320xbf16>, %arg281: tensor<320xbf16>, %arg282: tensor<320xbf16>, %arg283: tensor<320x320x2x2xbf16>, %arg284: tensor<320xbf16>, %arg285: tensor<320xbf16>, %arg286: tensor<320xbf16>, %arg287: tensor<320xbf16>, %arg288: tensor<320xbf16>, %arg289: tensor<1280x1x3x3xbf16>, %arg290: tensor<1280xbf16>, %arg291: tensor<320xbf16>, %arg292: tensor<320xbf16>, %arg293: tensor<320xbf16>, %arg294: tensor<320x320x2x2xbf16>, %arg295: tensor<320xbf16>, %arg296: tensor<320xbf16>, %arg297: tensor<320xbf16>, %arg298: tensor<320xbf16>, %arg299: tensor<320xbf16>, %arg300: tensor<1280x1x3x3xbf16>, %arg301: tensor<1280xbf16>, %arg302: tensor<320xbf16>, %arg303: tensor<320xbf16>, %arg304: tensor<320xbf16>, %arg305: tensor<320x320x2x2xbf16>, %arg306: tensor<320xbf16>, %arg307: tensor<320xbf16>, %arg308: tensor<320xbf16>, %arg309: tensor<320xbf16>, %arg310: tensor<320xbf16>, %arg311: tensor<1280x1x3x3xbf16>, %arg312: tensor<1280xbf16>, %arg313: tensor<320xbf16>, %arg314: tensor<320xbf16>, %arg315: tensor<320xbf16>, %arg316: tensor<320x320x2x2xbf16>, %arg317: tensor<320xbf16>, %arg318: tensor<320xbf16>, %arg319: tensor<320xbf16>, %arg320: tensor<320xbf16>, %arg321: tensor<320xbf16>, %arg322: tensor<1280x1x3x3xbf16>, %arg323: tensor<1280xbf16>, %arg324: tensor<320xbf16>, %arg325: tensor<320xbf16>, %arg326: tensor<320xbf16>, %arg327: tensor<320x320x2x2xbf16>, %arg328: tensor<320xbf16>, %arg329: tensor<320xbf16>, %arg330: tensor<320xbf16>, %arg331: tensor<320xbf16>, %arg332: tensor<320xbf16>, %arg333: tensor<1280x1x3x3xbf16>, %arg334: tensor<1280xbf16>, %arg335: tensor<320xbf16>, %arg336: tensor<320xbf16>, %arg337: tensor<320xbf16>, %arg338: tensor<320x320x2x2xbf16>, %arg339: tensor<320xbf16>, %arg340: tensor<320xbf16>, %arg341: tensor<320xbf16>, %arg342: tensor<320xbf16>, %arg343: tensor<320xbf16>, %arg344: tensor<1280x1x3x3xbf16>, %arg345: tensor<1280xbf16>, %arg346: tensor<320xbf16>, %arg347: tensor<320xbf16>, %arg348: tensor<320xbf16>, %arg349: tensor<320x320x2x2xbf16>, %arg350: tensor<320xbf16>, %arg351: tensor<320xbf16>, %arg352: tensor<320xbf16>, %arg353: tensor<320xbf16>, %arg354: tensor<320xbf16>, %arg355: tensor<1280x1x3x3xbf16>, %arg356: tensor<1280xbf16>, %arg357: tensor<320xbf16>, %arg358: tensor<320xbf16>, %arg359: tensor<320xbf16>, %arg360: tensor<320x320x2x2xbf16>, %arg361: tensor<320xbf16>, %arg362: tensor<320xbf16>, %arg363: tensor<320xbf16>, %arg364: tensor<320xbf16>, %arg365: tensor<320xbf16>, %arg366: tensor<1280x1x3x3xbf16>, %arg367: tensor<1280xbf16>, %arg368: tensor<320xbf16>, %arg369: tensor<320xbf16>, %arg370: tensor<320xbf16>, %arg371: tensor<320x320x2x2xbf16>, %arg372: tensor<320xbf16>, %arg373: tensor<320xbf16>, %arg374: tensor<320xbf16>, %arg375: tensor<320xbf16>, %arg376: tensor<320xbf16>, %arg377: tensor<1280x1x3x3xbf16>, %arg378: tensor<1280xbf16>, %arg379: tensor<320xbf16>, %arg380: tensor<320xbf16>, %arg381: tensor<320xbf16>, %arg382: tensor<320x320x2x2xbf16>, %arg383: tensor<320xbf16>, %arg384: tensor<320xbf16>, %arg385: tensor<320xbf16>, %arg386: tensor<320xbf16>, %arg387: tensor<320xbf16>, %arg388: tensor<1280x1x3x3xbf16>, %arg389: tensor<1280xbf16>, %arg390: tensor<320xbf16>, %arg391: tensor<320xbf16>, %arg392: tensor<320xbf16>, %arg393: tensor<320x320x2x2xbf16>, %arg394: tensor<320xbf16>, %arg395: tensor<320xbf16>, %arg396: tensor<320xbf16>, %arg397: tensor<320xbf16>, %arg398: tensor<320xbf16>, %arg399: tensor<1280x1x3x3xbf16>, %arg400: tensor<1280xbf16>, %arg401: tensor<320xbf16>, %arg402: tensor<320xbf16>, %arg403: tensor<320xbf16>, %arg404: tensor<320x320x2x2xbf16>, %arg405: tensor<320xbf16>, %arg406: tensor<320xbf16>, %arg407: tensor<320xbf16>, %arg408: tensor<320xbf16>, %arg409: tensor<320xbf16>, %arg410: tensor<1280x1x3x3xbf16>, %arg411: tensor<1280xbf16>, %arg412: tensor<320xbf16>, %arg413: tensor<320xbf16>, %arg414: tensor<320xbf16>, %arg415: tensor<320x320x2x2xbf16>, %arg416: tensor<320xbf16>, %arg417: tensor<320xbf16>, %arg418: tensor<320xbf16>, %arg419: tensor<320xbf16>, %arg420: tensor<320xbf16>, %arg421: tensor<1280x1x3x3xbf16>, %arg422: tensor<1280xbf16>, %arg423: tensor<320xbf16>, %arg424: tensor<320xbf16>, %arg425: tensor<320xbf16>, %arg426: tensor<320x320x2x2xbf16>, %arg427: tensor<320xbf16>, %arg428: tensor<320xbf16>, %arg429: tensor<320xbf16>, %arg430: tensor<320xbf16>, %arg431: tensor<320xbf16>, %arg432: tensor<1280x1x3x3xbf16>, %arg433: tensor<1280xbf16>, %arg434: tensor<320xbf16>, %arg435: tensor<320xbf16>, %arg436: tensor<320xbf16>, %arg437: tensor<512x320x3x3xbf16>, %arg438: tensor<512xbf16>, %arg439: tensor<512xbf16>, %arg440: tensor<512xbf16>, %arg441: tensor<512xbf16>, %arg442: tensor<512xbf16>, %arg443: tensor<512xbf16>, %arg444: tensor<512xbf16>, %arg445: tensor<2048x1x3x3xbf16>, %arg446: tensor<2048xbf16>, %arg447: tensor<512xbf16>, %arg448: tensor<512xbf16>, %arg449: tensor<512xbf16>, %arg450: tensor<512xbf16>, %arg451: tensor<512xbf16>, %arg452: tensor<2048x1x3x3xbf16>, %arg453: tensor<2048xbf16>, %arg454: tensor<512xbf16>, %arg455: tensor<512xbf16>, %arg456: tensor<512xbf16>, %arg457: tensor<512xbf16>, %arg458: tensor<512xbf16>, %arg459: tensor<2048x1x3x3xbf16>, %arg460: tensor<2048xbf16>, %arg461: tensor<512xbf16>, %arg462: tensor<512xbf16>, %arg463: tensor<512xbf16>, %arg464: tensor<64x512x1x1xbf16>, %arg465: tensor<64xbf16>, %arg466: tensor<64x320x1x1xbf16>, %arg467: tensor<64xbf16>, %arg468: tensor<64x128x3x3xbf16>, %arg469: tensor<64xbf16>, %arg470: tensor<32x64x3x3xbf16>, %arg471: tensor<32xbf16>, %arg472: tensor<2x32x3x3xbf16>, %arg473: tensor<2xbf16>, %arg474: tensor<64x128x1x1xbf16>, %arg475: tensor<64xbf16>, %arg476: tensor<64x128x3x3xbf16>, %arg477: tensor<64xbf16>, %arg478: tensor<32x64x3x3xbf16>, %arg479: tensor<32xbf16>, %arg480: tensor<2x32x3x3xbf16>, %arg481: tensor<2xbf16>, %arg482: tensor<64x128x3x3xbf16>, %arg483: tensor<64xbf16>, %arg484: tensor<32x64x3x3xbf16>, %arg485: tensor<32xbf16>, %arg486: tensor<2x32x3x3xbf16>, %arg487: tensor<2xbf16>, %arg488: tensor<64x64x3x3xbf16>, %arg489: tensor<64xbf16>, %arg490: tensor<1x64x3x3xbf16>, %arg491: tensor<1xbf16>, %arg492: tensor<64x64xf32>, %arg493: tensor<64xf32>, %arg494: tensor<64x64xf32>, %arg495: tensor<64xf32>, %arg496: tensor<64x64xf32>, %arg497: tensor<64xf32>, %arg498: tensor<64x64xf32>, %arg499: tensor<64xf32>, %arg500: tensor<64x256xf32>, %arg501: tensor<256xf32>, %arg502: tensor<256x64xbf16>, %arg503: tensor<64x64xf32>, %arg504: tensor<64xf32>, %arg505: tensor<64x64xf32>, %arg506: tensor<64xf32>, %arg507: tensor<64x64xf32>, %arg508: tensor<64xf32>, %arg509: tensor<64x64xf32>, %arg510: tensor<64xf32>, %arg511: tensor<64x256xf32>, %arg512: tensor<256xf32>, %arg513: tensor<256x64xbf16>, %arg514: tensor<64x64xf32>, %arg515: tensor<64xf32>, %arg516: tensor<64x64xf32>, %arg517: tensor<64xf32>, %arg518: tensor<64x64xf32>, %arg519: tensor<64xf32>, %arg520: tensor<64x64xf32>, %arg521: tensor<64xf32>, %arg522: tensor<64x256xf32>, %arg523: tensor<256xf32>, %arg524: tensor<256x64xbf16>, %arg525: tensor<128x128xf32>, %arg526: tensor<128xf32>, %arg527: tensor<128x128xf32>, %arg528: tensor<128xf32>, %arg529: tensor<128x128xf32>, %arg530: tensor<128xf32>, %arg531: tensor<128x128xf32>, %arg532: tensor<128xf32>, %arg533: tensor<128x512xf32>, %arg534: tensor<512xf32>, %arg535: tensor<512x128xbf16>, %arg536: tensor<128x128xf32>, %arg537: tensor<128xf32>, %arg538: tensor<128x128xf32>, %arg539: tensor<128xf32>, %arg540: tensor<128x128xf32>, %arg541: tensor<128xf32>, %arg542: tensor<128x128xf32>, %arg543: tensor<128xf32>, %arg544: tensor<128x512xf32>, %arg545: tensor<512xf32>, %arg546: tensor<512x128xbf16>, %arg547: tensor<128x128xf32>, %arg548: tensor<128xf32>, %arg549: tensor<128x128xf32>, %arg550: tensor<128xf32>, %arg551: tensor<128x128xf32>, %arg552: tensor<128xf32>, %arg553: tensor<128x128xf32>, %arg554: tensor<128xf32>, %arg555: tensor<128x512xf32>, %arg556: tensor<512xf32>, %arg557: tensor<512x128xbf16>, %arg558: tensor<128x128xf32>, %arg559: tensor<128xf32>, %arg560: tensor<128x128xf32>, %arg561: tensor<128xf32>, %arg562: tensor<128x128xf32>, %arg563: tensor<128xf32>, %arg564: tensor<128x128xf32>, %arg565: tensor<128xf32>, %arg566: tensor<128x512xf32>, %arg567: tensor<512xf32>, %arg568: tensor<512x128xbf16>, %arg569: tensor<128x128xf32>, %arg570: tensor<128xf32>, %arg571: tensor<128x128xf32>, %arg572: tensor<128xf32>, %arg573: tensor<128x128xf32>, %arg574: tensor<128xf32>, %arg575: tensor<128x128xf32>, %arg576: tensor<128xf32>, %arg577: tensor<128x512xf32>, %arg578: tensor<512xf32>, %arg579: tensor<512x128xbf16>, %arg580: tensor<128x128xf32>, %arg581: tensor<128xf32>, %arg582: tensor<128x128xf32>, %arg583: tensor<128xf32>, %arg584: tensor<128x128xf32>, %arg585: tensor<128xf32>, %arg586: tensor<128x128xf32>, %arg587: tensor<128xf32>, %arg588: tensor<128x512xf32>, %arg589: tensor<512xf32>, %arg590: tensor<512x128xbf16>, %arg591: tensor<128x128xf32>, %arg592: tensor<128xf32>, %arg593: tensor<128x128xf32>, %arg594: tensor<128xf32>, %arg595: tensor<128x128xf32>, %arg596: tensor<128xf32>, %arg597: tensor<128x128xf32>, %arg598: tensor<128xf32>, %arg599: tensor<128x512xf32>, %arg600: tensor<512xf32>, %arg601: tensor<512x128xbf16>, %arg602: tensor<128x128xf32>, %arg603: tensor<128xf32>, %arg604: tensor<128x128xf32>, %arg605: tensor<128xf32>, %arg606: tensor<128x128xf32>, %arg607: tensor<128xf32>, %arg608: tensor<128x128xf32>, %arg609: tensor<128xf32>, %arg610: tensor<128x512xf32>, %arg611: tensor<512xf32>, %arg612: tensor<512x128xbf16>, %arg613: tensor<320x320xf32>, %arg614: tensor<320xf32>, %arg615: tensor<320x320xf32>, %arg616: tensor<320xf32>, %arg617: tensor<320x320xf32>, %arg618: tensor<320xf32>, %arg619: tensor<320x320xf32>, %arg620: tensor<320xf32>, %arg621: tensor<320x1280xf32>, %arg622: tensor<1280xf32>, %arg623: tensor<1280x320xbf16>, %arg624: tensor<320x320xf32>, %arg625: tensor<320xf32>, %arg626: tensor<320x320xf32>, %arg627: tensor<320xf32>, %arg628: tensor<320x320xf32>, %arg629: tensor<320xf32>, %arg630: tensor<320x320xf32>, %arg631: tensor<320xf32>, %arg632: tensor<320x1280xf32>, %arg633: tensor<1280xf32>, %arg634: tensor<1280x320xbf16>, %arg635: tensor<320x320xf32>, %arg636: tensor<320xf32>, %arg637: tensor<320x320xf32>, %arg638: tensor<320xf32>, %arg639: tensor<320x320xf32>, %arg640: tensor<320xf32>, %arg641: tensor<320x320xf32>, %arg642: tensor<320xf32>, %arg643: tensor<320x1280xf32>, %arg644: tensor<1280xf32>, %arg645: tensor<1280x320xbf16>, %arg646: tensor<320x320xf32>, %arg647: tensor<320xf32>, %arg648: tensor<320x320xf32>, %arg649: tensor<320xf32>, %arg650: tensor<320x320xf32>, %arg651: tensor<320xf32>, %arg652: tensor<320x320xf32>, %arg653: tensor<320xf32>, %arg654: tensor<320x1280xf32>, %arg655: tensor<1280xf32>, %arg656: tensor<1280x320xbf16>, %arg657: tensor<320x320xf32>, %arg658: tensor<320xf32>, %arg659: tensor<320x320xf32>, %arg660: tensor<320xf32>, %arg661: tensor<320x320xf32>, %arg662: tensor<320xf32>, %arg663: tensor<320x320xf32>, %arg664: tensor<320xf32>, %arg665: tensor<320x1280xf32>, %arg666: tensor<1280xf32>, %arg667: tensor<1280x320xbf16>, %arg668: tensor<320x320xf32>, %arg669: tensor<320xf32>, %arg670: tensor<320x320xf32>, %arg671: tensor<320xf32>, %arg672: tensor<320x320xf32>, %arg673: tensor<320xf32>, %arg674: tensor<320x320xf32>, %arg675: tensor<320xf32>, %arg676: tensor<320x1280xf32>, %arg677: tensor<1280xf32>, %arg678: tensor<1280x320xbf16>, %arg679: tensor<320x320xf32>, %arg680: tensor<320xf32>, %arg681: tensor<320x320xf32>, %arg682: tensor<320xf32>, %arg683: tensor<320x320xf32>, %arg684: tensor<320xf32>, %arg685: tensor<320x320xf32>, %arg686: tensor<320xf32>, %arg687: tensor<320x1280xf32>, %arg688: tensor<1280xf32>, %arg689: tensor<1280x320xbf16>, %arg690: tensor<320x320xf32>, %arg691: tensor<320xf32>, %arg692: tensor<320x320xf32>, %arg693: tensor<320xf32>, %arg694: tensor<320x320xf32>, %arg695: tensor<320xf32>, %arg696: tensor<320x320xf32>, %arg697: tensor<320xf32>, %arg698: tensor<320x1280xf32>, %arg699: tensor<1280xf32>, %arg700: tensor<1280x320xbf16>, %arg701: tensor<320x320xf32>, %arg702: tensor<320xf32>, %arg703: tensor<320x320xf32>, %arg704: tensor<320xf32>, %arg705: tensor<320x320xf32>, %arg706: tensor<320xf32>, %arg707: tensor<320x320xf32>, %arg708: tensor<320xf32>, %arg709: tensor<320x1280xf32>, %arg710: tensor<1280xf32>, %arg711: tensor<1280x320xbf16>, %arg712: tensor<320x320xf32>, %arg713: tensor<320xf32>, %arg714: tensor<320x320xf32>, %arg715: tensor<320xf32>, %arg716: tensor<320x320xf32>, %arg717: tensor<320xf32>, %arg718: tensor<320x320xf32>, %arg719: tensor<320xf32>, %arg720: tensor<320x1280xf32>, %arg721: tensor<1280xf32>, %arg722: tensor<1280x320xbf16>, %arg723: tensor<320x320xf32>, %arg724: tensor<320xf32>, %arg725: tensor<320x320xf32>, %arg726: tensor<320xf32>, %arg727: tensor<320x320xf32>, %arg728: tensor<320xf32>, %arg729: tensor<320x320xf32>, %arg730: tensor<320xf32>, %arg731: tensor<320x1280xf32>, %arg732: tensor<1280xf32>, %arg733: tensor<1280x320xbf16>, %arg734: tensor<320x320xf32>, %arg735: tensor<320xf32>, %arg736: tensor<320x320xf32>, %arg737: tensor<320xf32>, %arg738: tensor<320x320xf32>, %arg739: tensor<320xf32>, %arg740: tensor<320x320xf32>, %arg741: tensor<320xf32>, %arg742: tensor<320x1280xf32>, %arg743: tensor<1280xf32>, %arg744: tensor<1280x320xbf16>, %arg745: tensor<320x320xf32>, %arg746: tensor<320xf32>, %arg747: tensor<320x320xf32>, %arg748: tensor<320xf32>, %arg749: tensor<320x320xf32>, %arg750: tensor<320xf32>, %arg751: tensor<320x320xf32>, %arg752: tensor<320xf32>, %arg753: tensor<320x1280xf32>, %arg754: tensor<1280xf32>, %arg755: tensor<1280x320xbf16>, %arg756: tensor<320x320xf32>, %arg757: tensor<320xf32>, %arg758: tensor<320x320xf32>, %arg759: tensor<320xf32>, %arg760: tensor<320x320xf32>, %arg761: tensor<320xf32>, %arg762: tensor<320x320xf32>, %arg763: tensor<320xf32>, %arg764: tensor<320x1280xf32>, %arg765: tensor<1280xf32>, %arg766: tensor<1280x320xbf16>, %arg767: tensor<320x320xf32>, %arg768: tensor<320xf32>, %arg769: tensor<320x320xf32>, %arg770: tensor<320xf32>, %arg771: tensor<320x320xf32>, %arg772: tensor<320xf32>, %arg773: tensor<320x320xf32>, %arg774: tensor<320xf32>, %arg775: tensor<320x1280xf32>, %arg776: tensor<1280xf32>, %arg777: tensor<1280x320xbf16>, %arg778: tensor<320x320xf32>, %arg779: tensor<320xf32>, %arg780: tensor<320x320xf32>, %arg781: tensor<320xf32>, %arg782: tensor<320x320xf32>, %arg783: tensor<320xf32>, %arg784: tensor<320x320xf32>, %arg785: tensor<320xf32>, %arg786: tensor<320x1280xf32>, %arg787: tensor<1280xf32>, %arg788: tensor<1280x320xbf16>, %arg789: tensor<320x320xf32>, %arg790: tensor<320xf32>, %arg791: tensor<320x320xf32>, %arg792: tensor<320xf32>, %arg793: tensor<320x320xf32>, %arg794: tensor<320xf32>, %arg795: tensor<320x320xf32>, %arg796: tensor<320xf32>, %arg797: tensor<320x1280xf32>, %arg798: tensor<1280xf32>, %arg799: tensor<1280x320xbf16>, %arg800: tensor<320x320xf32>, %arg801: tensor<320xf32>, %arg802: tensor<320x320xf32>, %arg803: tensor<320xf32>, %arg804: tensor<320x320xf32>, %arg805: tensor<320xf32>, %arg806: tensor<320x320xf32>, %arg807: tensor<320xf32>, %arg808: tensor<320x1280xf32>, %arg809: tensor<1280xf32>, %arg810: tensor<1280x320xbf16>, %arg811: tensor<320x320xf32>, %arg812: tensor<320xf32>, %arg813: tensor<320x320xf32>, %arg814: tensor<320xf32>, %arg815: tensor<320x320xf32>, %arg816: tensor<320xf32>, %arg817: tensor<320x320xf32>, %arg818: tensor<320xf32>, %arg819: tensor<320x1280xf32>, %arg820: tensor<1280xf32>, %arg821: tensor<1280x320xbf16>, %arg822: tensor<320x320xf32>, %arg823: tensor<320xf32>, %arg824: tensor<320x320xf32>, %arg825: tensor<320xf32>, %arg826: tensor<320x320xf32>, %arg827: tensor<320xf32>, %arg828: tensor<320x320xf32>, %arg829: tensor<320xf32>, %arg830: tensor<320x1280xf32>, %arg831: tensor<1280xf32>, %arg832: tensor<1280x320xbf16>, %arg833: tensor<320x320xf32>, %arg834: tensor<320xf32>, %arg835: tensor<320x320xf32>, %arg836: tensor<320xf32>, %arg837: tensor<320x320xf32>, %arg838: tensor<320xf32>, %arg839: tensor<320x320xf32>, %arg840: tensor<320xf32>, %arg841: tensor<320x1280xf32>, %arg842: tensor<1280xf32>, %arg843: tensor<1280x320xbf16>, %arg844: tensor<320x320xf32>, %arg845: tensor<320xf32>, %arg846: tensor<320x320xf32>, %arg847: tensor<320xf32>, %arg848: tensor<320x320xf32>, %arg849: tensor<320xf32>, %arg850: tensor<320x320xf32>, %arg851: tensor<320xf32>, %arg852: tensor<320x1280xf32>, %arg853: tensor<1280xf32>, %arg854: tensor<1280x320xbf16>, %arg855: tensor<320x320xf32>, %arg856: tensor<320xf32>, %arg857: tensor<320x320xf32>, %arg858: tensor<320xf32>, %arg859: tensor<320x320xf32>, %arg860: tensor<320xf32>, %arg861: tensor<320x320xf32>, %arg862: tensor<320xf32>, %arg863: tensor<320x1280xf32>, %arg864: tensor<1280xf32>, %arg865: tensor<1280x320xbf16>, %arg866: tensor<320x320xf32>, %arg867: tensor<320xf32>, %arg868: tensor<320x320xf32>, %arg869: tensor<320xf32>, %arg870: tensor<320x320xf32>, %arg871: tensor<320xf32>, %arg872: tensor<320x320xf32>, %arg873: tensor<320xf32>, %arg874: tensor<320x1280xf32>, %arg875: tensor<1280xf32>, %arg876: tensor<1280x320xbf16>, %arg877: tensor<320x320xf32>, %arg878: tensor<320xf32>, %arg879: tensor<320x320xf32>, %arg880: tensor<320xf32>, %arg881: tensor<320x320xf32>, %arg882: tensor<320xf32>, %arg883: tensor<320x320xf32>, %arg884: tensor<320xf32>, %arg885: tensor<320x1280xf32>, %arg886: tensor<1280xf32>, %arg887: tensor<1280x320xbf16>, %arg888: tensor<320x320xf32>, %arg889: tensor<320xf32>, %arg890: tensor<320x320xf32>, %arg891: tensor<320xf32>, %arg892: tensor<320x320xf32>, %arg893: tensor<320xf32>, %arg894: tensor<320x320xf32>, %arg895: tensor<320xf32>, %arg896: tensor<320x1280xf32>, %arg897: tensor<1280xf32>, %arg898: tensor<1280x320xbf16>, %arg899: tensor<320x320xf32>, %arg900: tensor<320xf32>, %arg901: tensor<320x320xf32>, %arg902: tensor<320xf32>, %arg903: tensor<320x320xf32>, %arg904: tensor<320xf32>, %arg905: tensor<320x320xf32>, %arg906: tensor<320xf32>, %arg907: tensor<320x1280xf32>, %arg908: tensor<1280xf32>, %arg909: tensor<1280x320xbf16>, %arg910: tensor<512x512xf32>, %arg911: tensor<512xf32>, %arg912: tensor<512x512xf32>, %arg913: tensor<512xf32>, %arg914: tensor<512x512xf32>, %arg915: tensor<512xf32>, %arg916: tensor<512x512xf32>, %arg917: tensor<512xf32>, %arg918: tensor<512x2048xf32>, %arg919: tensor<2048xf32>, %arg920: tensor<2048x512xbf16>, %arg921: tensor<512x512xf32>, %arg922: tensor<512xf32>, %arg923: tensor<512x512xf32>, %arg924: tensor<512xf32>, %arg925: tensor<512x512xf32>, %arg926: tensor<512xf32>, %arg927: tensor<512x512xf32>, %arg928: tensor<512xf32>, %arg929: tensor<512x2048xf32>, %arg930: tensor<2048xf32>, %arg931: tensor<2048x512xbf16>, %arg932: tensor<512x512xf32>, %arg933: tensor<512xf32>, %arg934: tensor<512x512xf32>, %arg935: tensor<512xf32>, %arg936: tensor<512x512xf32>, %arg937: tensor<512xf32>, %arg938: tensor<512x512xf32>, %arg939: tensor<512xf32>, %arg940: tensor<512x2048xf32>, %arg941: tensor<2048xf32>, %arg942: tensor<2048x512xbf16>, %arg943: tensor<64x15x30xbf16>, %arg944: tensor<64x20x40xbf16>, %arg945: tensor<64x1x1xf32>, %arg946: tensor<64x1x1xf32>, %arg947: tensor<64x1x1xbf16>, %arg948: tensor<64x1x1xbf16>, %arg949: tensor<32x1x1xf32>, %arg950: tensor<32x1x1xf32>, %arg951: tensor<32x1x1xbf16>, %arg952: tensor<32x1x1xbf16>, %arg953: tensor<64x30x60xbf16>, %arg954: tensor<64x40x80xbf16>, %arg955: tensor<64x1x1xf32>, %arg956: tensor<64x1x1xf32>, %arg957: tensor<64x1x1xbf16>, %arg958: tensor<64x1x1xbf16>, %arg959: tensor<32x1x1xf32>, %arg960: tensor<32x1x1xf32>, %arg961: tensor<32x1x1xbf16>, %arg962: tensor<32x1x1xbf16>, %arg963: tensor<64x60x120xbf16>, %arg964: tensor<64x80x160xbf16>, %arg965: tensor<64x1x1xf32>, %arg966: tensor<64x1x1xf32>, %arg967: tensor<64x1x1xbf16>, %arg968: tensor<64x1x1xbf16>, %arg969: tensor<32x1x1xf32>, %arg970: tensor<32x1x1xf32>, %arg971: tensor<32x1x1xbf16>, %arg972: tensor<32x1x1xbf16>, %arg973: tensor<64x120x240xbf16>, %arg974: tensor<64x160x320xbf16>, %arg975: tensor<64x240x480xbf16>, %arg976: tensor<64x320x640xbf16>) -> tensor<1x480x640xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_2 = stablehlo.constant dense<1.000000e+00> : tensor<1x19200x256xbf16>
-    %cst_3 = stablehlo.constant dense<2.000000e+00> : tensor<1x19200x256xbf16>
-    %cst_4 = stablehlo.constant dense<5.000000e-01> : tensor<1x19200x256xbf16>
-    %cst_5 = stablehlo.constant dense<-4.000000e+00> : tensor<1x19200x256xf32>
-    %cst_6 = stablehlo.constant dense<4.000000e+00> : tensor<1x19200x256xf32>
-    %cst_7 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x19200x256xf32>
-    %cst_8 = stablehlo.constant dense<2.77068146E-8> : tensor<1x19200x256xf32>
-    %cst_9 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x19200x256xf32>
-    %cst_10 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x19200x256xf32>
-    %cst_11 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x19200x256xf32>
-    %cst_12 = stablehlo.constant dense<-2.954600e-03> : tensor<1x19200x256xf32>
-    %cst_13 = stablehlo.constant dense<-0.0160960332> : tensor<1x19200x256xf32>
-    %cst_14 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x19200x256xf32>
-    %cst_15 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x19200x256xf32>
-    %cst_16 = stablehlo.constant dense<-0.00168282702> : tensor<1x19200x256xf32>
-    %cst_17 = stablehlo.constant dense<-0.00737332925> : tensor<1x19200x256xf32>
-    %cst_18 = stablehlo.constant dense<-0.0142647391> : tensor<1x19200x256xf32>
-    %cst_19 = stablehlo.constant dense<-1.000000e+00> : tensor<1x19200x256xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x19200x256xf32>
-    %cst_21 = stablehlo.constant dense<1.000000e+00> : tensor<1x4800x512xbf16>
-    %cst_22 = stablehlo.constant dense<2.000000e+00> : tensor<1x4800x512xbf16>
-    %cst_23 = stablehlo.constant dense<5.000000e-01> : tensor<1x4800x512xbf16>
-    %cst_24 = stablehlo.constant dense<-4.000000e+00> : tensor<1x4800x512xf32>
-    %cst_25 = stablehlo.constant dense<4.000000e+00> : tensor<1x4800x512xf32>
-    %cst_26 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x4800x512xf32>
-    %cst_27 = stablehlo.constant dense<2.77068146E-8> : tensor<1x4800x512xf32>
-    %cst_28 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x4800x512xf32>
-    %cst_29 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x4800x512xf32>
-    %cst_30 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x4800x512xf32>
-    %cst_31 = stablehlo.constant dense<-2.954600e-03> : tensor<1x4800x512xf32>
-    %cst_32 = stablehlo.constant dense<-0.0160960332> : tensor<1x4800x512xf32>
-    %cst_33 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x4800x512xf32>
-    %cst_34 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x4800x512xf32>
-    %cst_35 = stablehlo.constant dense<-0.00168282702> : tensor<1x4800x512xf32>
-    %cst_36 = stablehlo.constant dense<-0.00737332925> : tensor<1x4800x512xf32>
-    %cst_37 = stablehlo.constant dense<-0.0142647391> : tensor<1x4800x512xf32>
-    %cst_38 = stablehlo.constant dense<-1.000000e+00> : tensor<1x4800x512xf32>
-    %cst_39 = stablehlo.constant dense<1.000000e+00> : tensor<1x4800x512xf32>
-    %cst_40 = stablehlo.constant dense<1.000000e+00> : tensor<1x1200x1280xbf16>
-    %cst_41 = stablehlo.constant dense<2.000000e+00> : tensor<1x1200x1280xbf16>
-    %cst_42 = stablehlo.constant dense<5.000000e-01> : tensor<1x1200x1280xbf16>
-    %cst_43 = stablehlo.constant dense<-4.000000e+00> : tensor<1x1200x1280xf32>
-    %cst_44 = stablehlo.constant dense<4.000000e+00> : tensor<1x1200x1280xf32>
-    %cst_45 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x1200x1280xf32>
-    %cst_46 = stablehlo.constant dense<2.77068146E-8> : tensor<1x1200x1280xf32>
-    %cst_47 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x1200x1280xf32>
-    %cst_48 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x1200x1280xf32>
-    %cst_49 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x1200x1280xf32>
-    %cst_50 = stablehlo.constant dense<-2.954600e-03> : tensor<1x1200x1280xf32>
-    %cst_51 = stablehlo.constant dense<-0.0160960332> : tensor<1x1200x1280xf32>
-    %cst_52 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x1200x1280xf32>
-    %cst_53 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x1200x1280xf32>
-    %cst_54 = stablehlo.constant dense<-0.00168282702> : tensor<1x1200x1280xf32>
-    %cst_55 = stablehlo.constant dense<-0.00737332925> : tensor<1x1200x1280xf32>
-    %cst_56 = stablehlo.constant dense<-0.0142647391> : tensor<1x1200x1280xf32>
-    %cst_57 = stablehlo.constant dense<-1.000000e+00> : tensor<1x1200x1280xf32>
-    %cst_58 = stablehlo.constant dense<1.000000e+00> : tensor<1x1200x1280xf32>
-    %cst_59 = stablehlo.constant dense<1.000000e+00> : tensor<1x300x2048xbf16>
-    %cst_60 = stablehlo.constant dense<2.000000e+00> : tensor<1x300x2048xbf16>
-    %cst_61 = stablehlo.constant dense<5.000000e-01> : tensor<1x300x2048xbf16>
-    %cst_62 = stablehlo.constant dense<-4.000000e+00> : tensor<1x300x2048xf32>
-    %cst_63 = stablehlo.constant dense<4.000000e+00> : tensor<1x300x2048xf32>
-    %cst_64 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x300x2048xf32>
-    %cst_65 = stablehlo.constant dense<2.77068146E-8> : tensor<1x300x2048xf32>
-    %cst_66 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x300x2048xf32>
-    %cst_67 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x300x2048xf32>
-    %cst_68 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x300x2048xf32>
-    %cst_69 = stablehlo.constant dense<-2.954600e-03> : tensor<1x300x2048xf32>
-    %cst_70 = stablehlo.constant dense<-0.0160960332> : tensor<1x300x2048xf32>
-    %cst_71 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x300x2048xf32>
-    %cst_72 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x300x2048xf32>
-    %cst_73 = stablehlo.constant dense<-0.00168282702> : tensor<1x300x2048xf32>
-    %cst_74 = stablehlo.constant dense<-0.00737332925> : tensor<1x300x2048xf32>
-    %cst_75 = stablehlo.constant dense<-0.0142647391> : tensor<1x300x2048xf32>
-    %cst_76 = stablehlo.constant dense<-1.000000e+00> : tensor<1x300x2048xf32>
-    %cst_77 = stablehlo.constant dense<1.000000e+00> : tensor<1x300x2048xf32>
-    %cst_78 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x30x40xbf16>
-    %cst_79 = stablehlo.constant dense<0.000000e+00> : tensor<1x32x30x40xbf16>
-    %cst_80 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x60x80xbf16>
-    %cst_81 = stablehlo.constant dense<0.000000e+00> : tensor<1x32x60x80xbf16>
-    %cst_82 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x120x160xbf16>
-    %cst_83 = stablehlo.constant dense<0.000000e+00> : tensor<1x32x120x160xbf16>
-    %cst_84 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x480x640xbf16>
-    %cst_85 = arith.constant dense<64> : tensor<1xi64>
-    %cst_86 = arith.constant dense<1.000000e-05> : tensor<1xf64>
-    %cst_87 = arith.constant dense<1> : tensor<1xi64>
-    %cst_88 = arith.constant dense<8.000000e+00> : tensor<1xf64>
-    %cst_89 = arith.constant dense<128> : tensor<1xi64>
-    %cst_90 = arith.constant dense<320> : tensor<1xi64>
-    %cst_91 = arith.constant dense<512> : tensor<1xi64>
-    %cst_92 = arith.constant dense<10> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[3, 3], [3, 3]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x480x640xbf16>, tensor<64x3x7x7xbf16>) -> tensor<1x64x120x160xbf16>
-    %1 = stablehlo.reshape %arg2 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xbf16>) -> tensor<1x64x120x160xbf16>
-    %3 = stablehlo.broadcast_in_dim %1, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x120x160xbf16>
-    %4 = stablehlo.add %2, %3 : tensor<1x64x120x160xbf16>
-    %5 = stablehlo.reshape %4 : (tensor<1x64x120x160xbf16>) -> tensor<1x64x19200xbf16>
-    %6 = stablehlo.transpose %5, dims = [0, 2, 1] : (tensor<1x64x19200xbf16>) -> tensor<1x19200x64xbf16>
-    %7 = stablehlo.convert %6 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %8 = stablehlo.convert %7 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %9 = stablehlo.reduce(%8 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %10 = stablehlo.reshape %9 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %11 = stablehlo.convert %cst_85 : (tensor<1xi64>) -> tensor<1xf64>
-    %12 = stablehlo.reshape %11 : (tensor<1xf64>) -> tensor<f64>
-    %13 = stablehlo.broadcast_in_dim %10, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %14 = stablehlo.broadcast_in_dim %12, dims = [] : (tensor<f64>) -> tensor<1x19200x1xf64>
-    %15 = stablehlo.divide %13, %14 : tensor<1x19200x1xf64>
-    %16 = stablehlo.broadcast_in_dim %8, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %17 = stablehlo.broadcast_in_dim %15, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %18 = stablehlo.subtract %16, %17 : tensor<1x19200x64xf64>
-    %19 = stablehlo.multiply %18, %18 : tensor<1x19200x64xf64>
-    %20 = stablehlo.reduce(%19 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %21 = stablehlo.reshape %20 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %22 = stablehlo.broadcast_in_dim %21, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %23 = stablehlo.divide %22, %14 : tensor<1x19200x1xf64>
-    %24 = stablehlo.convert %23 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %25 = stablehlo.reduce(%7 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %26 = stablehlo.reshape %25 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %27 = stablehlo.convert %cst_85 : (tensor<1xi64>) -> tensor<1xf32>
-    %28 = stablehlo.reshape %27 : (tensor<1xf32>) -> tensor<f32>
-    %29 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %30 = stablehlo.broadcast_in_dim %28, dims = [] : (tensor<f32>) -> tensor<1x19200x1xf32>
-    %31 = stablehlo.divide %29, %30 : tensor<1x19200x1xf32>
-    %32 = stablehlo.convert %cst_86 : (tensor<1xf64>) -> tensor<1xf32>
-    %33 = stablehlo.reshape %32 : (tensor<1xf32>) -> tensor<f32>
-    %34 = stablehlo.broadcast_in_dim %24, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %35 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x19200x1xf32>
-    %36 = stablehlo.add %34, %35 : tensor<1x19200x1xf32>
-    %37 = stablehlo.rsqrt %36 : tensor<1x19200x1xf32>
-    %38 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %39 = stablehlo.broadcast_in_dim %31, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %40 = stablehlo.subtract %38, %39 : tensor<1x19200x64xf32>
-    %41 = stablehlo.broadcast_in_dim %40, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %42 = stablehlo.broadcast_in_dim %37, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %43 = stablehlo.multiply %41, %42 : tensor<1x19200x64xf32>
-    %44 = stablehlo.convert %arg3 : (tensor<64xbf16>) -> tensor<64xf32>
-    %45 = stablehlo.broadcast_in_dim %43, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %47 = stablehlo.multiply %45, %46 : tensor<1x19200x64xf32>
-    %48 = stablehlo.convert %arg4 : (tensor<64xbf16>) -> tensor<64xf32>
-    %49 = stablehlo.broadcast_in_dim %47, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %50 = stablehlo.broadcast_in_dim %48, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %51 = stablehlo.add %49, %50 : tensor<1x19200x64xf32>
-    %52 = stablehlo.convert %51 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %53 = stablehlo.convert %52 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %54 = stablehlo.convert %53 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %55 = stablehlo.reduce(%54 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %56 = stablehlo.reshape %55 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %57 = stablehlo.broadcast_in_dim %56, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %58 = stablehlo.divide %57, %14 : tensor<1x19200x1xf64>
-    %59 = stablehlo.broadcast_in_dim %54, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %60 = stablehlo.broadcast_in_dim %58, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %61 = stablehlo.subtract %59, %60 : tensor<1x19200x64xf64>
-    %62 = stablehlo.multiply %61, %61 : tensor<1x19200x64xf64>
-    %63 = stablehlo.reduce(%62 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %64 = stablehlo.reshape %63 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %65 = stablehlo.broadcast_in_dim %64, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %66 = stablehlo.divide %65, %14 : tensor<1x19200x1xf64>
-    %67 = stablehlo.convert %66 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %68 = stablehlo.reduce(%53 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %69 = stablehlo.reshape %68 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %70 = stablehlo.broadcast_in_dim %69, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %71 = stablehlo.divide %70, %30 : tensor<1x19200x1xf32>
-    %72 = stablehlo.broadcast_in_dim %67, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %73 = stablehlo.add %72, %35 : tensor<1x19200x1xf32>
-    %74 = stablehlo.rsqrt %73 : tensor<1x19200x1xf32>
-    %75 = stablehlo.broadcast_in_dim %53, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %76 = stablehlo.broadcast_in_dim %71, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %77 = stablehlo.subtract %75, %76 : tensor<1x19200x64xf32>
-    %78 = stablehlo.broadcast_in_dim %77, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %79 = stablehlo.broadcast_in_dim %74, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %80 = stablehlo.multiply %78, %79 : tensor<1x19200x64xf32>
-    %81 = stablehlo.convert %arg5 : (tensor<64xbf16>) -> tensor<64xf32>
-    %82 = stablehlo.broadcast_in_dim %80, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %83 = stablehlo.broadcast_in_dim %81, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %84 = stablehlo.multiply %82, %83 : tensor<1x19200x64xf32>
-    %85 = stablehlo.convert %arg6 : (tensor<64xbf16>) -> tensor<64xf32>
-    %86 = stablehlo.broadcast_in_dim %84, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %87 = stablehlo.broadcast_in_dim %85, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %88 = stablehlo.add %86, %87 : tensor<1x19200x64xf32>
-    %89 = stablehlo.convert %88 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %90 = stablehlo.reshape %89 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %91 = stablehlo.convert %90 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %92 = stablehlo.dot_general %91, %arg492, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x64xf32>) -> tensor<19200x64xf32>
-    %93 = stablehlo.convert %cst_87 : (tensor<1xi64>) -> tensor<1xf32>
-    %94 = stablehlo.reshape %93 : (tensor<1xf32>) -> tensor<f32>
-    %95 = stablehlo.broadcast_in_dim %92, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %96 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<19200x64xf32>
-    %97 = stablehlo.multiply %95, %96 : tensor<19200x64xf32>
-    %98 = stablehlo.broadcast_in_dim %97, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %99 = stablehlo.broadcast_in_dim %arg493, dims = [1] : (tensor<64xf32>) -> tensor<19200x64xf32>
-    %100 = stablehlo.add %98, %99 : tensor<19200x64xf32>
-    %101 = stablehlo.convert %100 : (tensor<19200x64xf32>) -> tensor<19200x64xbf16>
-    %102 = stablehlo.reshape %101 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %103 = stablehlo.reshape %102 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x1x64xbf16>
-    %104 = stablehlo.transpose %103, dims = [0, 2, 1, 3] : (tensor<1x19200x1x64xbf16>) -> tensor<1x1x19200x64xbf16>
-    %105 = stablehlo.transpose %89, dims = [0, 2, 1] : (tensor<1x19200x64xbf16>) -> tensor<1x64x19200xbf16>
-    %106 = stablehlo.reshape %105 : (tensor<1x64x19200xbf16>) -> tensor<1x64x120x160xbf16>
-    %107 = stablehlo.convolution(%106, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [8, 8], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x120x160xbf16>, tensor<64x64x8x8xbf16>) -> tensor<1x64x15x20xbf16>
-    %108 = stablehlo.reshape %arg8 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %109 = stablehlo.broadcast_in_dim %107, dims = [0, 1, 2, 3] : (tensor<1x64x15x20xbf16>) -> tensor<1x64x15x20xbf16>
-    %110 = stablehlo.broadcast_in_dim %108, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x15x20xbf16>
-    %111 = stablehlo.add %109, %110 : tensor<1x64x15x20xbf16>
-    %112 = stablehlo.reshape %111 : (tensor<1x64x15x20xbf16>) -> tensor<1x64x300xbf16>
-    %113 = stablehlo.transpose %112, dims = [0, 2, 1] : (tensor<1x64x300xbf16>) -> tensor<1x300x64xbf16>
-    %114 = stablehlo.convert %113 : (tensor<1x300x64xbf16>) -> tensor<1x300x64xf32>
-    %115 = stablehlo.convert %114 : (tensor<1x300x64xf32>) -> tensor<1x300x64xf64>
-    %116 = stablehlo.reduce(%115 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %117 = stablehlo.reshape %116 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %118 = stablehlo.broadcast_in_dim %117, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %119 = stablehlo.broadcast_in_dim %12, dims = [] : (tensor<f64>) -> tensor<1x300x1xf64>
-    %120 = stablehlo.divide %118, %119 : tensor<1x300x1xf64>
-    %121 = stablehlo.broadcast_in_dim %115, dims = [0, 1, 2] : (tensor<1x300x64xf64>) -> tensor<1x300x64xf64>
-    %122 = stablehlo.broadcast_in_dim %120, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x64xf64>
-    %123 = stablehlo.subtract %121, %122 : tensor<1x300x64xf64>
-    %124 = stablehlo.multiply %123, %123 : tensor<1x300x64xf64>
-    %125 = stablehlo.reduce(%124 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %126 = stablehlo.reshape %125 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %127 = stablehlo.broadcast_in_dim %126, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %128 = stablehlo.divide %127, %119 : tensor<1x300x1xf64>
-    %129 = stablehlo.convert %128 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %130 = stablehlo.reduce(%114 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %131 = stablehlo.reshape %130 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %132 = stablehlo.broadcast_in_dim %131, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %133 = stablehlo.broadcast_in_dim %28, dims = [] : (tensor<f32>) -> tensor<1x300x1xf32>
-    %134 = stablehlo.divide %132, %133 : tensor<1x300x1xf32>
-    %135 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %136 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x300x1xf32>
-    %137 = stablehlo.add %135, %136 : tensor<1x300x1xf32>
-    %138 = stablehlo.rsqrt %137 : tensor<1x300x1xf32>
-    %139 = stablehlo.broadcast_in_dim %114, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %140 = stablehlo.broadcast_in_dim %134, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x64xf32>
-    %141 = stablehlo.subtract %139, %140 : tensor<1x300x64xf32>
-    %142 = stablehlo.broadcast_in_dim %141, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %143 = stablehlo.broadcast_in_dim %138, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x64xf32>
-    %144 = stablehlo.multiply %142, %143 : tensor<1x300x64xf32>
-    %145 = stablehlo.convert %arg9 : (tensor<64xbf16>) -> tensor<64xf32>
-    %146 = stablehlo.broadcast_in_dim %144, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %147 = stablehlo.broadcast_in_dim %145, dims = [2] : (tensor<64xf32>) -> tensor<1x300x64xf32>
-    %148 = stablehlo.multiply %146, %147 : tensor<1x300x64xf32>
-    %149 = stablehlo.convert %arg10 : (tensor<64xbf16>) -> tensor<64xf32>
-    %150 = stablehlo.broadcast_in_dim %148, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %151 = stablehlo.broadcast_in_dim %149, dims = [2] : (tensor<64xf32>) -> tensor<1x300x64xf32>
-    %152 = stablehlo.add %150, %151 : tensor<1x300x64xf32>
-    %153 = stablehlo.convert %152 : (tensor<1x300x64xf32>) -> tensor<1x300x64xbf16>
-    %154 = stablehlo.reshape %153 : (tensor<1x300x64xbf16>) -> tensor<300x64xbf16>
-    %155 = stablehlo.convert %154 : (tensor<300x64xbf16>) -> tensor<300x64xf32>
-    %156 = stablehlo.dot_general %155, %arg494, contracting_dims = [1] x [0] : (tensor<300x64xf32>, tensor<64x64xf32>) -> tensor<300x64xf32>
-    %157 = stablehlo.broadcast_in_dim %156, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %158 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<300x64xf32>
-    %159 = stablehlo.multiply %157, %158 : tensor<300x64xf32>
-    %160 = stablehlo.broadcast_in_dim %159, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %161 = stablehlo.broadcast_in_dim %arg495, dims = [1] : (tensor<64xf32>) -> tensor<300x64xf32>
-    %162 = stablehlo.add %160, %161 : tensor<300x64xf32>
-    %163 = stablehlo.convert %162 : (tensor<300x64xf32>) -> tensor<300x64xbf16>
-    %164 = stablehlo.reshape %163 : (tensor<300x64xbf16>) -> tensor<1x300x64xbf16>
-    %165 = stablehlo.reshape %164 : (tensor<1x300x64xbf16>) -> tensor<1x300x1x64xbf16>
-    %166 = stablehlo.transpose %165, dims = [0, 2, 1, 3] : (tensor<1x300x1x64xbf16>) -> tensor<1x1x300x64xbf16>
-    %167 = stablehlo.dot_general %155, %arg496, contracting_dims = [1] x [0] : (tensor<300x64xf32>, tensor<64x64xf32>) -> tensor<300x64xf32>
-    %168 = stablehlo.broadcast_in_dim %167, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %169 = stablehlo.multiply %168, %158 : tensor<300x64xf32>
-    %170 = stablehlo.broadcast_in_dim %169, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %171 = stablehlo.broadcast_in_dim %arg497, dims = [1] : (tensor<64xf32>) -> tensor<300x64xf32>
-    %172 = stablehlo.add %170, %171 : tensor<300x64xf32>
-    %173 = stablehlo.convert %172 : (tensor<300x64xf32>) -> tensor<300x64xbf16>
-    %174 = stablehlo.reshape %173 : (tensor<300x64xbf16>) -> tensor<1x300x64xbf16>
-    %175 = stablehlo.reshape %174 : (tensor<1x300x64xbf16>) -> tensor<1x300x1x64xbf16>
-    %176 = stablehlo.transpose %175, dims = [0, 2, 1, 3] : (tensor<1x300x1x64xbf16>) -> tensor<1x1x300x64xbf16>
-    %177 = stablehlo.transpose %166, dims = [0, 1, 3, 2] : (tensor<1x1x300x64xbf16>) -> tensor<1x1x64x300xbf16>
-    %178 = stablehlo.reshape %104 : (tensor<1x1x19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %179 = stablehlo.reshape %177 : (tensor<1x1x64x300xbf16>) -> tensor<1x64x300xbf16>
-    %180 = stablehlo.broadcast_in_dim %179, dims = [0, 1, 2] : (tensor<1x64x300xbf16>) -> tensor<1x64x300xbf16>
-    %181 = stablehlo.dot_general %178, %180, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x19200x64xbf16>, tensor<1x64x300xbf16>) -> tensor<1x19200x300xbf16>
-    %182 = stablehlo.reshape %181 : (tensor<1x19200x300xbf16>) -> tensor<1x1x19200x300xbf16>
-    %183 = stablehlo.convert %cst_88 : (tensor<1xf64>) -> tensor<1xbf16>
-    %184 = stablehlo.reshape %183 : (tensor<1xbf16>) -> tensor<bf16>
-    %185 = stablehlo.broadcast_in_dim %182, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xbf16>) -> tensor<1x1x19200x300xbf16>
-    %186 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x1x19200x300xbf16>
-    %187 = stablehlo.divide %185, %186 : tensor<1x1x19200x300xbf16>
-    %188 = stablehlo.convert %187 : (tensor<1x1x19200x300xbf16>) -> tensor<1x1x19200x300xf32>
-    %189 = stablehlo.reduce(%188 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x1x19200x300xf32>, tensor<f32>) -> tensor<1x1x19200xf32>
-    %190 = stablehlo.reshape %189 : (tensor<1x1x19200xf32>) -> tensor<1x1x19200x1xf32>
-    %191 = stablehlo.broadcast_in_dim %188, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xf32>
-    %192 = stablehlo.broadcast_in_dim %190, dims = [0, 1, 2, 3] : (tensor<1x1x19200x1xf32>) -> tensor<1x1x19200x300xf32>
-    %193 = stablehlo.subtract %191, %192 : tensor<1x1x19200x300xf32>
-    %194 = stablehlo.exponential %193 : tensor<1x1x19200x300xf32>
-    %195 = stablehlo.reduce(%194 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x1x19200x300xf32>, tensor<f32>) -> tensor<1x1x19200xf32>
-    %196 = stablehlo.reshape %195 : (tensor<1x1x19200xf32>) -> tensor<1x1x19200x1xf32>
-    %197 = stablehlo.broadcast_in_dim %194, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xf32>
-    %198 = stablehlo.broadcast_in_dim %196, dims = [0, 1, 2, 3] : (tensor<1x1x19200x1xf32>) -> tensor<1x1x19200x300xf32>
-    %199 = stablehlo.divide %197, %198 : tensor<1x1x19200x300xf32>
-    %200 = stablehlo.convert %199 : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xbf16>
-    %201 = stablehlo.reshape %200 : (tensor<1x1x19200x300xbf16>) -> tensor<1x19200x300xbf16>
-    %202 = stablehlo.reshape %176 : (tensor<1x1x300x64xbf16>) -> tensor<1x300x64xbf16>
-    %203 = stablehlo.broadcast_in_dim %202, dims = [0, 1, 2] : (tensor<1x300x64xbf16>) -> tensor<1x300x64xbf16>
-    %204 = stablehlo.dot_general %201, %203, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x19200x300xbf16>, tensor<1x300x64xbf16>) -> tensor<1x19200x64xbf16>
-    %205 = stablehlo.reshape %204 : (tensor<1x19200x64xbf16>) -> tensor<1x1x19200x64xbf16>
-    %206 = stablehlo.transpose %205, dims = [0, 2, 1, 3] : (tensor<1x1x19200x64xbf16>) -> tensor<1x19200x1x64xbf16>
-    %207 = stablehlo.reshape %206 : (tensor<1x19200x1x64xbf16>) -> tensor<1x19200x64xbf16>
-    %208 = stablehlo.reshape %207 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %209 = stablehlo.convert %208 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %210 = stablehlo.dot_general %209, %arg498, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x64xf32>) -> tensor<19200x64xf32>
-    %211 = stablehlo.broadcast_in_dim %210, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %212 = stablehlo.multiply %211, %96 : tensor<19200x64xf32>
-    %213 = stablehlo.broadcast_in_dim %212, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %214 = stablehlo.broadcast_in_dim %arg499, dims = [1] : (tensor<64xf32>) -> tensor<19200x64xf32>
-    %215 = stablehlo.add %213, %214 : tensor<19200x64xf32>
-    %216 = stablehlo.convert %215 : (tensor<19200x64xf32>) -> tensor<19200x64xbf16>
-    %217 = stablehlo.reshape %216 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %218 = stablehlo.add %217, %52 : tensor<1x19200x64xbf16>
-    %219 = stablehlo.convert %218 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %220 = stablehlo.convert %219 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %221 = stablehlo.reduce(%220 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %222 = stablehlo.reshape %221 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %223 = stablehlo.broadcast_in_dim %222, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %224 = stablehlo.divide %223, %14 : tensor<1x19200x1xf64>
-    %225 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %226 = stablehlo.broadcast_in_dim %224, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %227 = stablehlo.subtract %225, %226 : tensor<1x19200x64xf64>
-    %228 = stablehlo.multiply %227, %227 : tensor<1x19200x64xf64>
-    %229 = stablehlo.reduce(%228 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %230 = stablehlo.reshape %229 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %231 = stablehlo.broadcast_in_dim %230, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %232 = stablehlo.divide %231, %14 : tensor<1x19200x1xf64>
-    %233 = stablehlo.convert %232 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %234 = stablehlo.reduce(%219 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %235 = stablehlo.reshape %234 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %236 = stablehlo.broadcast_in_dim %235, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %237 = stablehlo.divide %236, %30 : tensor<1x19200x1xf32>
-    %238 = stablehlo.broadcast_in_dim %233, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %239 = stablehlo.add %238, %35 : tensor<1x19200x1xf32>
-    %240 = stablehlo.rsqrt %239 : tensor<1x19200x1xf32>
-    %241 = stablehlo.broadcast_in_dim %219, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %242 = stablehlo.broadcast_in_dim %237, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %243 = stablehlo.subtract %241, %242 : tensor<1x19200x64xf32>
-    %244 = stablehlo.broadcast_in_dim %243, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %245 = stablehlo.broadcast_in_dim %240, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %246 = stablehlo.multiply %244, %245 : tensor<1x19200x64xf32>
-    %247 = stablehlo.convert %arg11 : (tensor<64xbf16>) -> tensor<64xf32>
-    %248 = stablehlo.broadcast_in_dim %246, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %249 = stablehlo.broadcast_in_dim %247, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %250 = stablehlo.multiply %248, %249 : tensor<1x19200x64xf32>
-    %251 = stablehlo.convert %arg12 : (tensor<64xbf16>) -> tensor<64xf32>
-    %252 = stablehlo.broadcast_in_dim %250, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %253 = stablehlo.broadcast_in_dim %251, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %254 = stablehlo.add %252, %253 : tensor<1x19200x64xf32>
-    %255 = stablehlo.convert %254 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %256 = stablehlo.reshape %255 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %257 = stablehlo.convert %256 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %258 = stablehlo.dot_general %257, %arg500, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x256xf32>) -> tensor<19200x256xf32>
-    %259 = stablehlo.broadcast_in_dim %258, dims = [0, 1] : (tensor<19200x256xf32>) -> tensor<19200x256xf32>
-    %260 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<19200x256xf32>
-    %261 = stablehlo.multiply %259, %260 : tensor<19200x256xf32>
-    %262 = stablehlo.broadcast_in_dim %261, dims = [0, 1] : (tensor<19200x256xf32>) -> tensor<19200x256xf32>
-    %263 = stablehlo.broadcast_in_dim %arg501, dims = [1] : (tensor<256xf32>) -> tensor<19200x256xf32>
-    %264 = stablehlo.add %262, %263 : tensor<19200x256xf32>
-    %265 = stablehlo.convert %264 : (tensor<19200x256xf32>) -> tensor<19200x256xbf16>
-    %266 = stablehlo.reshape %265 : (tensor<19200x256xbf16>) -> tensor<1x19200x256xbf16>
-    %267 = stablehlo.transpose %266, dims = [0, 2, 1] : (tensor<1x19200x256xbf16>) -> tensor<1x256x19200xbf16>
-    %268 = stablehlo.reshape %267 : (tensor<1x256x19200xbf16>) -> tensor<1x256x120x160xbf16>
-    %269 = stablehlo.convolution(%268, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x120x160xbf16>, tensor<256x1x3x3xbf16>) -> tensor<1x256x120x160xbf16>
-    %270 = stablehlo.reshape %arg14 : (tensor<256xbf16>) -> tensor<256x1x1xbf16>
-    %271 = stablehlo.broadcast_in_dim %269, dims = [0, 1, 2, 3] : (tensor<1x256x120x160xbf16>) -> tensor<1x256x120x160xbf16>
-    %272 = stablehlo.broadcast_in_dim %270, dims = [1, 2, 3] : (tensor<256x1x1xbf16>) -> tensor<1x256x120x160xbf16>
-    %273 = stablehlo.add %271, %272 : tensor<1x256x120x160xbf16>
-    %274 = stablehlo.reshape %273 : (tensor<1x256x120x160xbf16>) -> tensor<1x256x19200xbf16>
-    %275 = stablehlo.transpose %274, dims = [0, 2, 1] : (tensor<1x256x19200xbf16>) -> tensor<1x19200x256xbf16>
-    %276 = stablehlo.multiply %275, %cst_4 : tensor<1x19200x256xbf16>
-    %277 = stablehlo.rsqrt %cst_3 : tensor<1x19200x256xbf16>
-    %278 = stablehlo.multiply %275, %277 : tensor<1x19200x256xbf16>
-    %279 = stablehlo.convert %278 : (tensor<1x19200x256xbf16>) -> tensor<1x19200x256xf32>
-    %280 = stablehlo.clamp %cst_5, %279, %cst_6 : tensor<1x19200x256xf32>
-    %281 = stablehlo.multiply %280, %280 : tensor<1x19200x256xf32>
-    %282 = stablehlo.multiply %cst_7, %281 : tensor<1x19200x256xf32>
-    %283 = stablehlo.add %282, %cst_8 : tensor<1x19200x256xf32>
-    %284 = stablehlo.multiply %283, %281 : tensor<1x19200x256xf32>
-    %285 = stablehlo.add %284, %cst_9 : tensor<1x19200x256xf32>
-    %286 = stablehlo.multiply %285, %281 : tensor<1x19200x256xf32>
-    %287 = stablehlo.add %286, %cst_10 : tensor<1x19200x256xf32>
-    %288 = stablehlo.multiply %287, %281 : tensor<1x19200x256xf32>
-    %289 = stablehlo.add %288, %cst_11 : tensor<1x19200x256xf32>
-    %290 = stablehlo.multiply %289, %281 : tensor<1x19200x256xf32>
-    %291 = stablehlo.add %290, %cst_12 : tensor<1x19200x256xf32>
-    %292 = stablehlo.multiply %291, %281 : tensor<1x19200x256xf32>
-    %293 = stablehlo.add %292, %cst_13 : tensor<1x19200x256xf32>
-    %294 = stablehlo.multiply %cst_14, %281 : tensor<1x19200x256xf32>
-    %295 = stablehlo.add %294, %cst_15 : tensor<1x19200x256xf32>
-    %296 = stablehlo.multiply %295, %281 : tensor<1x19200x256xf32>
-    %297 = stablehlo.add %296, %cst_16 : tensor<1x19200x256xf32>
-    %298 = stablehlo.multiply %297, %281 : tensor<1x19200x256xf32>
-    %299 = stablehlo.add %298, %cst_17 : tensor<1x19200x256xf32>
-    %300 = stablehlo.multiply %299, %281 : tensor<1x19200x256xf32>
-    %301 = stablehlo.add %300, %cst_18 : tensor<1x19200x256xf32>
-    %302 = stablehlo.multiply %280, %293 : tensor<1x19200x256xf32>
-    %303 = stablehlo.divide %302, %301 : tensor<1x19200x256xf32>
-    %304 = stablehlo.clamp %cst_19, %303, %cst_20 : tensor<1x19200x256xf32>
-    %305 = stablehlo.convert %304 : (tensor<1x19200x256xf32>) -> tensor<1x19200x256xbf16>
-    %306 = stablehlo.add %305, %cst_2 : tensor<1x19200x256xbf16>
-    %307 = stablehlo.multiply %306, %276 : tensor<1x19200x256xbf16>
-    %308 = stablehlo.reshape %307 : (tensor<1x19200x256xbf16>) -> tensor<19200x256xbf16>
-    %309 = stablehlo.dot_general %308, %arg502, contracting_dims = [1] x [0] : (tensor<19200x256xbf16>, tensor<256x64xbf16>) -> tensor<19200x64xbf16>
-    %310 = stablehlo.reshape %309 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %311 = stablehlo.broadcast_in_dim %310, dims = [0, 1, 2] : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %312 = stablehlo.broadcast_in_dim %arg15, dims = [2] : (tensor<64xbf16>) -> tensor<1x19200x64xbf16>
-    %313 = stablehlo.add %311, %312 : tensor<1x19200x64xbf16>
-    %314 = stablehlo.reshape %313 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %315 = stablehlo.reshape %314 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %316 = stablehlo.add %315, %218 : tensor<1x19200x64xbf16>
-    %317 = stablehlo.convert %316 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %318 = stablehlo.convert %317 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %319 = stablehlo.reduce(%318 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %320 = stablehlo.reshape %319 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %321 = stablehlo.broadcast_in_dim %320, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %322 = stablehlo.divide %321, %14 : tensor<1x19200x1xf64>
-    %323 = stablehlo.broadcast_in_dim %318, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %324 = stablehlo.broadcast_in_dim %322, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %325 = stablehlo.subtract %323, %324 : tensor<1x19200x64xf64>
-    %326 = stablehlo.multiply %325, %325 : tensor<1x19200x64xf64>
-    %327 = stablehlo.reduce(%326 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %328 = stablehlo.reshape %327 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %329 = stablehlo.broadcast_in_dim %328, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %330 = stablehlo.divide %329, %14 : tensor<1x19200x1xf64>
-    %331 = stablehlo.convert %330 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %332 = stablehlo.reduce(%317 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %333 = stablehlo.reshape %332 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %334 = stablehlo.broadcast_in_dim %333, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %335 = stablehlo.divide %334, %30 : tensor<1x19200x1xf32>
-    %336 = stablehlo.broadcast_in_dim %331, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %337 = stablehlo.add %336, %35 : tensor<1x19200x1xf32>
-    %338 = stablehlo.rsqrt %337 : tensor<1x19200x1xf32>
-    %339 = stablehlo.broadcast_in_dim %317, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %340 = stablehlo.broadcast_in_dim %335, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %341 = stablehlo.subtract %339, %340 : tensor<1x19200x64xf32>
-    %342 = stablehlo.broadcast_in_dim %341, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %343 = stablehlo.broadcast_in_dim %338, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %344 = stablehlo.multiply %342, %343 : tensor<1x19200x64xf32>
-    %345 = stablehlo.convert %arg16 : (tensor<64xbf16>) -> tensor<64xf32>
-    %346 = stablehlo.broadcast_in_dim %344, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %347 = stablehlo.broadcast_in_dim %345, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %348 = stablehlo.multiply %346, %347 : tensor<1x19200x64xf32>
-    %349 = stablehlo.convert %arg17 : (tensor<64xbf16>) -> tensor<64xf32>
-    %350 = stablehlo.broadcast_in_dim %348, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %351 = stablehlo.broadcast_in_dim %349, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %352 = stablehlo.add %350, %351 : tensor<1x19200x64xf32>
-    %353 = stablehlo.convert %352 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %354 = stablehlo.reshape %353 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %355 = stablehlo.convert %354 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %356 = stablehlo.dot_general %355, %arg503, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x64xf32>) -> tensor<19200x64xf32>
-    %357 = stablehlo.broadcast_in_dim %356, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %358 = stablehlo.multiply %357, %96 : tensor<19200x64xf32>
-    %359 = stablehlo.broadcast_in_dim %358, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %360 = stablehlo.broadcast_in_dim %arg504, dims = [1] : (tensor<64xf32>) -> tensor<19200x64xf32>
-    %361 = stablehlo.add %359, %360 : tensor<19200x64xf32>
-    %362 = stablehlo.convert %361 : (tensor<19200x64xf32>) -> tensor<19200x64xbf16>
-    %363 = stablehlo.reshape %362 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %364 = stablehlo.reshape %363 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x1x64xbf16>
-    %365 = stablehlo.transpose %364, dims = [0, 2, 1, 3] : (tensor<1x19200x1x64xbf16>) -> tensor<1x1x19200x64xbf16>
-    %366 = stablehlo.transpose %353, dims = [0, 2, 1] : (tensor<1x19200x64xbf16>) -> tensor<1x64x19200xbf16>
-    %367 = stablehlo.reshape %366 : (tensor<1x64x19200xbf16>) -> tensor<1x64x120x160xbf16>
-    %368 = stablehlo.convolution(%367, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [8, 8], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x120x160xbf16>, tensor<64x64x8x8xbf16>) -> tensor<1x64x15x20xbf16>
-    %369 = stablehlo.reshape %arg19 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %370 = stablehlo.broadcast_in_dim %368, dims = [0, 1, 2, 3] : (tensor<1x64x15x20xbf16>) -> tensor<1x64x15x20xbf16>
-    %371 = stablehlo.broadcast_in_dim %369, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x15x20xbf16>
-    %372 = stablehlo.add %370, %371 : tensor<1x64x15x20xbf16>
-    %373 = stablehlo.reshape %372 : (tensor<1x64x15x20xbf16>) -> tensor<1x64x300xbf16>
-    %374 = stablehlo.transpose %373, dims = [0, 2, 1] : (tensor<1x64x300xbf16>) -> tensor<1x300x64xbf16>
-    %375 = stablehlo.convert %374 : (tensor<1x300x64xbf16>) -> tensor<1x300x64xf32>
-    %376 = stablehlo.convert %375 : (tensor<1x300x64xf32>) -> tensor<1x300x64xf64>
-    %377 = stablehlo.reduce(%376 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %378 = stablehlo.reshape %377 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %379 = stablehlo.broadcast_in_dim %378, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %380 = stablehlo.divide %379, %119 : tensor<1x300x1xf64>
-    %381 = stablehlo.broadcast_in_dim %376, dims = [0, 1, 2] : (tensor<1x300x64xf64>) -> tensor<1x300x64xf64>
-    %382 = stablehlo.broadcast_in_dim %380, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x64xf64>
-    %383 = stablehlo.subtract %381, %382 : tensor<1x300x64xf64>
-    %384 = stablehlo.multiply %383, %383 : tensor<1x300x64xf64>
-    %385 = stablehlo.reduce(%384 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %386 = stablehlo.reshape %385 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %387 = stablehlo.broadcast_in_dim %386, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %388 = stablehlo.divide %387, %119 : tensor<1x300x1xf64>
-    %389 = stablehlo.convert %388 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %390 = stablehlo.reduce(%375 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %391 = stablehlo.reshape %390 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %392 = stablehlo.broadcast_in_dim %391, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %393 = stablehlo.divide %392, %133 : tensor<1x300x1xf32>
-    %394 = stablehlo.broadcast_in_dim %389, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %395 = stablehlo.add %394, %136 : tensor<1x300x1xf32>
-    %396 = stablehlo.rsqrt %395 : tensor<1x300x1xf32>
-    %397 = stablehlo.broadcast_in_dim %375, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %398 = stablehlo.broadcast_in_dim %393, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x64xf32>
-    %399 = stablehlo.subtract %397, %398 : tensor<1x300x64xf32>
-    %400 = stablehlo.broadcast_in_dim %399, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %401 = stablehlo.broadcast_in_dim %396, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x64xf32>
-    %402 = stablehlo.multiply %400, %401 : tensor<1x300x64xf32>
-    %403 = stablehlo.convert %arg20 : (tensor<64xbf16>) -> tensor<64xf32>
-    %404 = stablehlo.broadcast_in_dim %402, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %405 = stablehlo.broadcast_in_dim %403, dims = [2] : (tensor<64xf32>) -> tensor<1x300x64xf32>
-    %406 = stablehlo.multiply %404, %405 : tensor<1x300x64xf32>
-    %407 = stablehlo.convert %arg21 : (tensor<64xbf16>) -> tensor<64xf32>
-    %408 = stablehlo.broadcast_in_dim %406, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %409 = stablehlo.broadcast_in_dim %407, dims = [2] : (tensor<64xf32>) -> tensor<1x300x64xf32>
-    %410 = stablehlo.add %408, %409 : tensor<1x300x64xf32>
-    %411 = stablehlo.convert %410 : (tensor<1x300x64xf32>) -> tensor<1x300x64xbf16>
-    %412 = stablehlo.reshape %411 : (tensor<1x300x64xbf16>) -> tensor<300x64xbf16>
-    %413 = stablehlo.convert %412 : (tensor<300x64xbf16>) -> tensor<300x64xf32>
-    %414 = stablehlo.dot_general %413, %arg505, contracting_dims = [1] x [0] : (tensor<300x64xf32>, tensor<64x64xf32>) -> tensor<300x64xf32>
-    %415 = stablehlo.broadcast_in_dim %414, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %416 = stablehlo.multiply %415, %158 : tensor<300x64xf32>
-    %417 = stablehlo.broadcast_in_dim %416, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %418 = stablehlo.broadcast_in_dim %arg506, dims = [1] : (tensor<64xf32>) -> tensor<300x64xf32>
-    %419 = stablehlo.add %417, %418 : tensor<300x64xf32>
-    %420 = stablehlo.convert %419 : (tensor<300x64xf32>) -> tensor<300x64xbf16>
-    %421 = stablehlo.reshape %420 : (tensor<300x64xbf16>) -> tensor<1x300x64xbf16>
-    %422 = stablehlo.reshape %421 : (tensor<1x300x64xbf16>) -> tensor<1x300x1x64xbf16>
-    %423 = stablehlo.transpose %422, dims = [0, 2, 1, 3] : (tensor<1x300x1x64xbf16>) -> tensor<1x1x300x64xbf16>
-    %424 = stablehlo.dot_general %413, %arg507, contracting_dims = [1] x [0] : (tensor<300x64xf32>, tensor<64x64xf32>) -> tensor<300x64xf32>
-    %425 = stablehlo.broadcast_in_dim %424, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %426 = stablehlo.multiply %425, %158 : tensor<300x64xf32>
-    %427 = stablehlo.broadcast_in_dim %426, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %428 = stablehlo.broadcast_in_dim %arg508, dims = [1] : (tensor<64xf32>) -> tensor<300x64xf32>
-    %429 = stablehlo.add %427, %428 : tensor<300x64xf32>
-    %430 = stablehlo.convert %429 : (tensor<300x64xf32>) -> tensor<300x64xbf16>
-    %431 = stablehlo.reshape %430 : (tensor<300x64xbf16>) -> tensor<1x300x64xbf16>
-    %432 = stablehlo.reshape %431 : (tensor<1x300x64xbf16>) -> tensor<1x300x1x64xbf16>
-    %433 = stablehlo.transpose %432, dims = [0, 2, 1, 3] : (tensor<1x300x1x64xbf16>) -> tensor<1x1x300x64xbf16>
-    %434 = stablehlo.transpose %423, dims = [0, 1, 3, 2] : (tensor<1x1x300x64xbf16>) -> tensor<1x1x64x300xbf16>
-    %435 = stablehlo.reshape %365 : (tensor<1x1x19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %436 = stablehlo.reshape %434 : (tensor<1x1x64x300xbf16>) -> tensor<1x64x300xbf16>
-    %437 = stablehlo.broadcast_in_dim %436, dims = [0, 1, 2] : (tensor<1x64x300xbf16>) -> tensor<1x64x300xbf16>
-    %438 = stablehlo.dot_general %435, %437, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x19200x64xbf16>, tensor<1x64x300xbf16>) -> tensor<1x19200x300xbf16>
-    %439 = stablehlo.reshape %438 : (tensor<1x19200x300xbf16>) -> tensor<1x1x19200x300xbf16>
-    %440 = stablehlo.broadcast_in_dim %439, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xbf16>) -> tensor<1x1x19200x300xbf16>
-    %441 = stablehlo.divide %440, %186 : tensor<1x1x19200x300xbf16>
-    %442 = stablehlo.convert %441 : (tensor<1x1x19200x300xbf16>) -> tensor<1x1x19200x300xf32>
-    %443 = stablehlo.reduce(%442 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x1x19200x300xf32>, tensor<f32>) -> tensor<1x1x19200xf32>
-    %444 = stablehlo.reshape %443 : (tensor<1x1x19200xf32>) -> tensor<1x1x19200x1xf32>
-    %445 = stablehlo.broadcast_in_dim %442, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xf32>
-    %446 = stablehlo.broadcast_in_dim %444, dims = [0, 1, 2, 3] : (tensor<1x1x19200x1xf32>) -> tensor<1x1x19200x300xf32>
-    %447 = stablehlo.subtract %445, %446 : tensor<1x1x19200x300xf32>
-    %448 = stablehlo.exponential %447 : tensor<1x1x19200x300xf32>
-    %449 = stablehlo.reduce(%448 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x1x19200x300xf32>, tensor<f32>) -> tensor<1x1x19200xf32>
-    %450 = stablehlo.reshape %449 : (tensor<1x1x19200xf32>) -> tensor<1x1x19200x1xf32>
-    %451 = stablehlo.broadcast_in_dim %448, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xf32>
-    %452 = stablehlo.broadcast_in_dim %450, dims = [0, 1, 2, 3] : (tensor<1x1x19200x1xf32>) -> tensor<1x1x19200x300xf32>
-    %453 = stablehlo.divide %451, %452 : tensor<1x1x19200x300xf32>
-    %454 = stablehlo.convert %453 : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xbf16>
-    %455 = stablehlo.reshape %454 : (tensor<1x1x19200x300xbf16>) -> tensor<1x19200x300xbf16>
-    %456 = stablehlo.reshape %433 : (tensor<1x1x300x64xbf16>) -> tensor<1x300x64xbf16>
-    %457 = stablehlo.broadcast_in_dim %456, dims = [0, 1, 2] : (tensor<1x300x64xbf16>) -> tensor<1x300x64xbf16>
-    %458 = stablehlo.dot_general %455, %457, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x19200x300xbf16>, tensor<1x300x64xbf16>) -> tensor<1x19200x64xbf16>
-    %459 = stablehlo.reshape %458 : (tensor<1x19200x64xbf16>) -> tensor<1x1x19200x64xbf16>
-    %460 = stablehlo.transpose %459, dims = [0, 2, 1, 3] : (tensor<1x1x19200x64xbf16>) -> tensor<1x19200x1x64xbf16>
-    %461 = stablehlo.reshape %460 : (tensor<1x19200x1x64xbf16>) -> tensor<1x19200x64xbf16>
-    %462 = stablehlo.reshape %461 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %463 = stablehlo.convert %462 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %464 = stablehlo.dot_general %463, %arg509, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x64xf32>) -> tensor<19200x64xf32>
-    %465 = stablehlo.broadcast_in_dim %464, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %466 = stablehlo.multiply %465, %96 : tensor<19200x64xf32>
-    %467 = stablehlo.broadcast_in_dim %466, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %468 = stablehlo.broadcast_in_dim %arg510, dims = [1] : (tensor<64xf32>) -> tensor<19200x64xf32>
-    %469 = stablehlo.add %467, %468 : tensor<19200x64xf32>
-    %470 = stablehlo.convert %469 : (tensor<19200x64xf32>) -> tensor<19200x64xbf16>
-    %471 = stablehlo.reshape %470 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %472 = stablehlo.add %471, %316 : tensor<1x19200x64xbf16>
-    %473 = stablehlo.convert %472 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %474 = stablehlo.convert %473 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %475 = stablehlo.reduce(%474 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %476 = stablehlo.reshape %475 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %477 = stablehlo.broadcast_in_dim %476, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %478 = stablehlo.divide %477, %14 : tensor<1x19200x1xf64>
-    %479 = stablehlo.broadcast_in_dim %474, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %480 = stablehlo.broadcast_in_dim %478, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %481 = stablehlo.subtract %479, %480 : tensor<1x19200x64xf64>
-    %482 = stablehlo.multiply %481, %481 : tensor<1x19200x64xf64>
-    %483 = stablehlo.reduce(%482 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %484 = stablehlo.reshape %483 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %485 = stablehlo.broadcast_in_dim %484, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %486 = stablehlo.divide %485, %14 : tensor<1x19200x1xf64>
-    %487 = stablehlo.convert %486 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %488 = stablehlo.reduce(%473 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %489 = stablehlo.reshape %488 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %490 = stablehlo.broadcast_in_dim %489, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %491 = stablehlo.divide %490, %30 : tensor<1x19200x1xf32>
-    %492 = stablehlo.broadcast_in_dim %487, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %493 = stablehlo.add %492, %35 : tensor<1x19200x1xf32>
-    %494 = stablehlo.rsqrt %493 : tensor<1x19200x1xf32>
-    %495 = stablehlo.broadcast_in_dim %473, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %496 = stablehlo.broadcast_in_dim %491, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %497 = stablehlo.subtract %495, %496 : tensor<1x19200x64xf32>
-    %498 = stablehlo.broadcast_in_dim %497, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %499 = stablehlo.broadcast_in_dim %494, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %500 = stablehlo.multiply %498, %499 : tensor<1x19200x64xf32>
-    %501 = stablehlo.convert %arg22 : (tensor<64xbf16>) -> tensor<64xf32>
-    %502 = stablehlo.broadcast_in_dim %500, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %503 = stablehlo.broadcast_in_dim %501, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %504 = stablehlo.multiply %502, %503 : tensor<1x19200x64xf32>
-    %505 = stablehlo.convert %arg23 : (tensor<64xbf16>) -> tensor<64xf32>
-    %506 = stablehlo.broadcast_in_dim %504, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %507 = stablehlo.broadcast_in_dim %505, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %508 = stablehlo.add %506, %507 : tensor<1x19200x64xf32>
-    %509 = stablehlo.convert %508 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %510 = stablehlo.reshape %509 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %511 = stablehlo.convert %510 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %512 = stablehlo.dot_general %511, %arg511, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x256xf32>) -> tensor<19200x256xf32>
-    %513 = stablehlo.broadcast_in_dim %512, dims = [0, 1] : (tensor<19200x256xf32>) -> tensor<19200x256xf32>
-    %514 = stablehlo.multiply %513, %260 : tensor<19200x256xf32>
-    %515 = stablehlo.broadcast_in_dim %514, dims = [0, 1] : (tensor<19200x256xf32>) -> tensor<19200x256xf32>
-    %516 = stablehlo.broadcast_in_dim %arg512, dims = [1] : (tensor<256xf32>) -> tensor<19200x256xf32>
-    %517 = stablehlo.add %515, %516 : tensor<19200x256xf32>
-    %518 = stablehlo.convert %517 : (tensor<19200x256xf32>) -> tensor<19200x256xbf16>
-    %519 = stablehlo.reshape %518 : (tensor<19200x256xbf16>) -> tensor<1x19200x256xbf16>
-    %520 = stablehlo.transpose %519, dims = [0, 2, 1] : (tensor<1x19200x256xbf16>) -> tensor<1x256x19200xbf16>
-    %521 = stablehlo.reshape %520 : (tensor<1x256x19200xbf16>) -> tensor<1x256x120x160xbf16>
-    %522 = stablehlo.convolution(%521, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x120x160xbf16>, tensor<256x1x3x3xbf16>) -> tensor<1x256x120x160xbf16>
-    %523 = stablehlo.reshape %arg25 : (tensor<256xbf16>) -> tensor<256x1x1xbf16>
-    %524 = stablehlo.broadcast_in_dim %522, dims = [0, 1, 2, 3] : (tensor<1x256x120x160xbf16>) -> tensor<1x256x120x160xbf16>
-    %525 = stablehlo.broadcast_in_dim %523, dims = [1, 2, 3] : (tensor<256x1x1xbf16>) -> tensor<1x256x120x160xbf16>
-    %526 = stablehlo.add %524, %525 : tensor<1x256x120x160xbf16>
-    %527 = stablehlo.reshape %526 : (tensor<1x256x120x160xbf16>) -> tensor<1x256x19200xbf16>
-    %528 = stablehlo.transpose %527, dims = [0, 2, 1] : (tensor<1x256x19200xbf16>) -> tensor<1x19200x256xbf16>
-    %529 = stablehlo.multiply %528, %cst_4 : tensor<1x19200x256xbf16>
-    %530 = stablehlo.multiply %528, %277 : tensor<1x19200x256xbf16>
-    %531 = stablehlo.convert %530 : (tensor<1x19200x256xbf16>) -> tensor<1x19200x256xf32>
-    %532 = stablehlo.clamp %cst_5, %531, %cst_6 : tensor<1x19200x256xf32>
-    %533 = stablehlo.multiply %532, %532 : tensor<1x19200x256xf32>
-    %534 = stablehlo.multiply %cst_7, %533 : tensor<1x19200x256xf32>
-    %535 = stablehlo.add %534, %cst_8 : tensor<1x19200x256xf32>
-    %536 = stablehlo.multiply %535, %533 : tensor<1x19200x256xf32>
-    %537 = stablehlo.add %536, %cst_9 : tensor<1x19200x256xf32>
-    %538 = stablehlo.multiply %537, %533 : tensor<1x19200x256xf32>
-    %539 = stablehlo.add %538, %cst_10 : tensor<1x19200x256xf32>
-    %540 = stablehlo.multiply %539, %533 : tensor<1x19200x256xf32>
-    %541 = stablehlo.add %540, %cst_11 : tensor<1x19200x256xf32>
-    %542 = stablehlo.multiply %541, %533 : tensor<1x19200x256xf32>
-    %543 = stablehlo.add %542, %cst_12 : tensor<1x19200x256xf32>
-    %544 = stablehlo.multiply %543, %533 : tensor<1x19200x256xf32>
-    %545 = stablehlo.add %544, %cst_13 : tensor<1x19200x256xf32>
-    %546 = stablehlo.multiply %cst_14, %533 : tensor<1x19200x256xf32>
-    %547 = stablehlo.add %546, %cst_15 : tensor<1x19200x256xf32>
-    %548 = stablehlo.multiply %547, %533 : tensor<1x19200x256xf32>
-    %549 = stablehlo.add %548, %cst_16 : tensor<1x19200x256xf32>
-    %550 = stablehlo.multiply %549, %533 : tensor<1x19200x256xf32>
-    %551 = stablehlo.add %550, %cst_17 : tensor<1x19200x256xf32>
-    %552 = stablehlo.multiply %551, %533 : tensor<1x19200x256xf32>
-    %553 = stablehlo.add %552, %cst_18 : tensor<1x19200x256xf32>
-    %554 = stablehlo.multiply %532, %545 : tensor<1x19200x256xf32>
-    %555 = stablehlo.divide %554, %553 : tensor<1x19200x256xf32>
-    %556 = stablehlo.clamp %cst_19, %555, %cst_20 : tensor<1x19200x256xf32>
-    %557 = stablehlo.convert %556 : (tensor<1x19200x256xf32>) -> tensor<1x19200x256xbf16>
-    %558 = stablehlo.add %557, %cst_2 : tensor<1x19200x256xbf16>
-    %559 = stablehlo.multiply %558, %529 : tensor<1x19200x256xbf16>
-    %560 = stablehlo.reshape %559 : (tensor<1x19200x256xbf16>) -> tensor<19200x256xbf16>
-    %561 = stablehlo.dot_general %560, %arg513, contracting_dims = [1] x [0] : (tensor<19200x256xbf16>, tensor<256x64xbf16>) -> tensor<19200x64xbf16>
-    %562 = stablehlo.reshape %561 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %563 = stablehlo.broadcast_in_dim %562, dims = [0, 1, 2] : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %564 = stablehlo.broadcast_in_dim %arg26, dims = [2] : (tensor<64xbf16>) -> tensor<1x19200x64xbf16>
-    %565 = stablehlo.add %563, %564 : tensor<1x19200x64xbf16>
-    %566 = stablehlo.reshape %565 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %567 = stablehlo.reshape %566 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %568 = stablehlo.add %567, %472 : tensor<1x19200x64xbf16>
-    %569 = stablehlo.convert %568 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %570 = stablehlo.convert %569 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %571 = stablehlo.reduce(%570 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %572 = stablehlo.reshape %571 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %573 = stablehlo.broadcast_in_dim %572, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %574 = stablehlo.divide %573, %14 : tensor<1x19200x1xf64>
-    %575 = stablehlo.broadcast_in_dim %570, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %576 = stablehlo.broadcast_in_dim %574, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %577 = stablehlo.subtract %575, %576 : tensor<1x19200x64xf64>
-    %578 = stablehlo.multiply %577, %577 : tensor<1x19200x64xf64>
-    %579 = stablehlo.reduce(%578 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %580 = stablehlo.reshape %579 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %581 = stablehlo.broadcast_in_dim %580, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %582 = stablehlo.divide %581, %14 : tensor<1x19200x1xf64>
-    %583 = stablehlo.convert %582 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %584 = stablehlo.reduce(%569 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %585 = stablehlo.reshape %584 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %586 = stablehlo.broadcast_in_dim %585, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %587 = stablehlo.divide %586, %30 : tensor<1x19200x1xf32>
-    %588 = stablehlo.broadcast_in_dim %583, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %589 = stablehlo.add %588, %35 : tensor<1x19200x1xf32>
-    %590 = stablehlo.rsqrt %589 : tensor<1x19200x1xf32>
-    %591 = stablehlo.broadcast_in_dim %569, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %592 = stablehlo.broadcast_in_dim %587, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %593 = stablehlo.subtract %591, %592 : tensor<1x19200x64xf32>
-    %594 = stablehlo.broadcast_in_dim %593, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %595 = stablehlo.broadcast_in_dim %590, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %596 = stablehlo.multiply %594, %595 : tensor<1x19200x64xf32>
-    %597 = stablehlo.convert %arg27 : (tensor<64xbf16>) -> tensor<64xf32>
-    %598 = stablehlo.broadcast_in_dim %596, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %599 = stablehlo.broadcast_in_dim %597, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %600 = stablehlo.multiply %598, %599 : tensor<1x19200x64xf32>
-    %601 = stablehlo.convert %arg28 : (tensor<64xbf16>) -> tensor<64xf32>
-    %602 = stablehlo.broadcast_in_dim %600, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %603 = stablehlo.broadcast_in_dim %601, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %604 = stablehlo.add %602, %603 : tensor<1x19200x64xf32>
-    %605 = stablehlo.convert %604 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %606 = stablehlo.reshape %605 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %607 = stablehlo.convert %606 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %608 = stablehlo.dot_general %607, %arg514, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x64xf32>) -> tensor<19200x64xf32>
-    %609 = stablehlo.broadcast_in_dim %608, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %610 = stablehlo.multiply %609, %96 : tensor<19200x64xf32>
-    %611 = stablehlo.broadcast_in_dim %610, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %612 = stablehlo.broadcast_in_dim %arg515, dims = [1] : (tensor<64xf32>) -> tensor<19200x64xf32>
-    %613 = stablehlo.add %611, %612 : tensor<19200x64xf32>
-    %614 = stablehlo.convert %613 : (tensor<19200x64xf32>) -> tensor<19200x64xbf16>
-    %615 = stablehlo.reshape %614 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %616 = stablehlo.reshape %615 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x1x64xbf16>
-    %617 = stablehlo.transpose %616, dims = [0, 2, 1, 3] : (tensor<1x19200x1x64xbf16>) -> tensor<1x1x19200x64xbf16>
-    %618 = stablehlo.transpose %605, dims = [0, 2, 1] : (tensor<1x19200x64xbf16>) -> tensor<1x64x19200xbf16>
-    %619 = stablehlo.reshape %618 : (tensor<1x64x19200xbf16>) -> tensor<1x64x120x160xbf16>
-    %620 = stablehlo.convolution(%619, %arg29) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [8, 8], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x120x160xbf16>, tensor<64x64x8x8xbf16>) -> tensor<1x64x15x20xbf16>
-    %621 = stablehlo.reshape %arg30 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %622 = stablehlo.broadcast_in_dim %620, dims = [0, 1, 2, 3] : (tensor<1x64x15x20xbf16>) -> tensor<1x64x15x20xbf16>
-    %623 = stablehlo.broadcast_in_dim %621, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x15x20xbf16>
-    %624 = stablehlo.add %622, %623 : tensor<1x64x15x20xbf16>
-    %625 = stablehlo.reshape %624 : (tensor<1x64x15x20xbf16>) -> tensor<1x64x300xbf16>
-    %626 = stablehlo.transpose %625, dims = [0, 2, 1] : (tensor<1x64x300xbf16>) -> tensor<1x300x64xbf16>
-    %627 = stablehlo.convert %626 : (tensor<1x300x64xbf16>) -> tensor<1x300x64xf32>
-    %628 = stablehlo.convert %627 : (tensor<1x300x64xf32>) -> tensor<1x300x64xf64>
-    %629 = stablehlo.reduce(%628 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %630 = stablehlo.reshape %629 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %631 = stablehlo.broadcast_in_dim %630, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %632 = stablehlo.divide %631, %119 : tensor<1x300x1xf64>
-    %633 = stablehlo.broadcast_in_dim %628, dims = [0, 1, 2] : (tensor<1x300x64xf64>) -> tensor<1x300x64xf64>
-    %634 = stablehlo.broadcast_in_dim %632, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x64xf64>
-    %635 = stablehlo.subtract %633, %634 : tensor<1x300x64xf64>
-    %636 = stablehlo.multiply %635, %635 : tensor<1x300x64xf64>
-    %637 = stablehlo.reduce(%636 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %638 = stablehlo.reshape %637 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %639 = stablehlo.broadcast_in_dim %638, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %640 = stablehlo.divide %639, %119 : tensor<1x300x1xf64>
-    %641 = stablehlo.convert %640 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %642 = stablehlo.reduce(%627 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x64xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %643 = stablehlo.reshape %642 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %644 = stablehlo.broadcast_in_dim %643, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %645 = stablehlo.divide %644, %133 : tensor<1x300x1xf32>
-    %646 = stablehlo.broadcast_in_dim %641, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %647 = stablehlo.add %646, %136 : tensor<1x300x1xf32>
-    %648 = stablehlo.rsqrt %647 : tensor<1x300x1xf32>
-    %649 = stablehlo.broadcast_in_dim %627, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %650 = stablehlo.broadcast_in_dim %645, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x64xf32>
-    %651 = stablehlo.subtract %649, %650 : tensor<1x300x64xf32>
-    %652 = stablehlo.broadcast_in_dim %651, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %653 = stablehlo.broadcast_in_dim %648, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x64xf32>
-    %654 = stablehlo.multiply %652, %653 : tensor<1x300x64xf32>
-    %655 = stablehlo.convert %arg31 : (tensor<64xbf16>) -> tensor<64xf32>
-    %656 = stablehlo.broadcast_in_dim %654, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %657 = stablehlo.broadcast_in_dim %655, dims = [2] : (tensor<64xf32>) -> tensor<1x300x64xf32>
-    %658 = stablehlo.multiply %656, %657 : tensor<1x300x64xf32>
-    %659 = stablehlo.convert %arg32 : (tensor<64xbf16>) -> tensor<64xf32>
-    %660 = stablehlo.broadcast_in_dim %658, dims = [0, 1, 2] : (tensor<1x300x64xf32>) -> tensor<1x300x64xf32>
-    %661 = stablehlo.broadcast_in_dim %659, dims = [2] : (tensor<64xf32>) -> tensor<1x300x64xf32>
-    %662 = stablehlo.add %660, %661 : tensor<1x300x64xf32>
-    %663 = stablehlo.convert %662 : (tensor<1x300x64xf32>) -> tensor<1x300x64xbf16>
-    %664 = stablehlo.reshape %663 : (tensor<1x300x64xbf16>) -> tensor<300x64xbf16>
-    %665 = stablehlo.convert %664 : (tensor<300x64xbf16>) -> tensor<300x64xf32>
-    %666 = stablehlo.dot_general %665, %arg516, contracting_dims = [1] x [0] : (tensor<300x64xf32>, tensor<64x64xf32>) -> tensor<300x64xf32>
-    %667 = stablehlo.broadcast_in_dim %666, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %668 = stablehlo.multiply %667, %158 : tensor<300x64xf32>
-    %669 = stablehlo.broadcast_in_dim %668, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %670 = stablehlo.broadcast_in_dim %arg517, dims = [1] : (tensor<64xf32>) -> tensor<300x64xf32>
-    %671 = stablehlo.add %669, %670 : tensor<300x64xf32>
-    %672 = stablehlo.convert %671 : (tensor<300x64xf32>) -> tensor<300x64xbf16>
-    %673 = stablehlo.reshape %672 : (tensor<300x64xbf16>) -> tensor<1x300x64xbf16>
-    %674 = stablehlo.reshape %673 : (tensor<1x300x64xbf16>) -> tensor<1x300x1x64xbf16>
-    %675 = stablehlo.transpose %674, dims = [0, 2, 1, 3] : (tensor<1x300x1x64xbf16>) -> tensor<1x1x300x64xbf16>
-    %676 = stablehlo.dot_general %665, %arg518, contracting_dims = [1] x [0] : (tensor<300x64xf32>, tensor<64x64xf32>) -> tensor<300x64xf32>
-    %677 = stablehlo.broadcast_in_dim %676, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %678 = stablehlo.multiply %677, %158 : tensor<300x64xf32>
-    %679 = stablehlo.broadcast_in_dim %678, dims = [0, 1] : (tensor<300x64xf32>) -> tensor<300x64xf32>
-    %680 = stablehlo.broadcast_in_dim %arg519, dims = [1] : (tensor<64xf32>) -> tensor<300x64xf32>
-    %681 = stablehlo.add %679, %680 : tensor<300x64xf32>
-    %682 = stablehlo.convert %681 : (tensor<300x64xf32>) -> tensor<300x64xbf16>
-    %683 = stablehlo.reshape %682 : (tensor<300x64xbf16>) -> tensor<1x300x64xbf16>
-    %684 = stablehlo.reshape %683 : (tensor<1x300x64xbf16>) -> tensor<1x300x1x64xbf16>
-    %685 = stablehlo.transpose %684, dims = [0, 2, 1, 3] : (tensor<1x300x1x64xbf16>) -> tensor<1x1x300x64xbf16>
-    %686 = stablehlo.transpose %675, dims = [0, 1, 3, 2] : (tensor<1x1x300x64xbf16>) -> tensor<1x1x64x300xbf16>
-    %687 = stablehlo.reshape %617 : (tensor<1x1x19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %688 = stablehlo.reshape %686 : (tensor<1x1x64x300xbf16>) -> tensor<1x64x300xbf16>
-    %689 = stablehlo.broadcast_in_dim %688, dims = [0, 1, 2] : (tensor<1x64x300xbf16>) -> tensor<1x64x300xbf16>
-    %690 = stablehlo.dot_general %687, %689, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x19200x64xbf16>, tensor<1x64x300xbf16>) -> tensor<1x19200x300xbf16>
-    %691 = stablehlo.reshape %690 : (tensor<1x19200x300xbf16>) -> tensor<1x1x19200x300xbf16>
-    %692 = stablehlo.broadcast_in_dim %691, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xbf16>) -> tensor<1x1x19200x300xbf16>
-    %693 = stablehlo.divide %692, %186 : tensor<1x1x19200x300xbf16>
-    %694 = stablehlo.convert %693 : (tensor<1x1x19200x300xbf16>) -> tensor<1x1x19200x300xf32>
-    %695 = stablehlo.reduce(%694 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x1x19200x300xf32>, tensor<f32>) -> tensor<1x1x19200xf32>
-    %696 = stablehlo.reshape %695 : (tensor<1x1x19200xf32>) -> tensor<1x1x19200x1xf32>
-    %697 = stablehlo.broadcast_in_dim %694, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xf32>
-    %698 = stablehlo.broadcast_in_dim %696, dims = [0, 1, 2, 3] : (tensor<1x1x19200x1xf32>) -> tensor<1x1x19200x300xf32>
-    %699 = stablehlo.subtract %697, %698 : tensor<1x1x19200x300xf32>
-    %700 = stablehlo.exponential %699 : tensor<1x1x19200x300xf32>
-    %701 = stablehlo.reduce(%700 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x1x19200x300xf32>, tensor<f32>) -> tensor<1x1x19200xf32>
-    %702 = stablehlo.reshape %701 : (tensor<1x1x19200xf32>) -> tensor<1x1x19200x1xf32>
-    %703 = stablehlo.broadcast_in_dim %700, dims = [0, 1, 2, 3] : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xf32>
-    %704 = stablehlo.broadcast_in_dim %702, dims = [0, 1, 2, 3] : (tensor<1x1x19200x1xf32>) -> tensor<1x1x19200x300xf32>
-    %705 = stablehlo.divide %703, %704 : tensor<1x1x19200x300xf32>
-    %706 = stablehlo.convert %705 : (tensor<1x1x19200x300xf32>) -> tensor<1x1x19200x300xbf16>
-    %707 = stablehlo.reshape %706 : (tensor<1x1x19200x300xbf16>) -> tensor<1x19200x300xbf16>
-    %708 = stablehlo.reshape %685 : (tensor<1x1x300x64xbf16>) -> tensor<1x300x64xbf16>
-    %709 = stablehlo.broadcast_in_dim %708, dims = [0, 1, 2] : (tensor<1x300x64xbf16>) -> tensor<1x300x64xbf16>
-    %710 = stablehlo.dot_general %707, %709, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x19200x300xbf16>, tensor<1x300x64xbf16>) -> tensor<1x19200x64xbf16>
-    %711 = stablehlo.reshape %710 : (tensor<1x19200x64xbf16>) -> tensor<1x1x19200x64xbf16>
-    %712 = stablehlo.transpose %711, dims = [0, 2, 1, 3] : (tensor<1x1x19200x64xbf16>) -> tensor<1x19200x1x64xbf16>
-    %713 = stablehlo.reshape %712 : (tensor<1x19200x1x64xbf16>) -> tensor<1x19200x64xbf16>
-    %714 = stablehlo.reshape %713 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %715 = stablehlo.convert %714 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %716 = stablehlo.dot_general %715, %arg520, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x64xf32>) -> tensor<19200x64xf32>
-    %717 = stablehlo.broadcast_in_dim %716, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %718 = stablehlo.multiply %717, %96 : tensor<19200x64xf32>
-    %719 = stablehlo.broadcast_in_dim %718, dims = [0, 1] : (tensor<19200x64xf32>) -> tensor<19200x64xf32>
-    %720 = stablehlo.broadcast_in_dim %arg521, dims = [1] : (tensor<64xf32>) -> tensor<19200x64xf32>
-    %721 = stablehlo.add %719, %720 : tensor<19200x64xf32>
-    %722 = stablehlo.convert %721 : (tensor<19200x64xf32>) -> tensor<19200x64xbf16>
-    %723 = stablehlo.reshape %722 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %724 = stablehlo.add %723, %568 : tensor<1x19200x64xbf16>
-    %725 = stablehlo.convert %724 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %726 = stablehlo.convert %725 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %727 = stablehlo.reduce(%726 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %728 = stablehlo.reshape %727 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %729 = stablehlo.broadcast_in_dim %728, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %730 = stablehlo.divide %729, %14 : tensor<1x19200x1xf64>
-    %731 = stablehlo.broadcast_in_dim %726, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %732 = stablehlo.broadcast_in_dim %730, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %733 = stablehlo.subtract %731, %732 : tensor<1x19200x64xf64>
-    %734 = stablehlo.multiply %733, %733 : tensor<1x19200x64xf64>
-    %735 = stablehlo.reduce(%734 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %736 = stablehlo.reshape %735 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %737 = stablehlo.broadcast_in_dim %736, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %738 = stablehlo.divide %737, %14 : tensor<1x19200x1xf64>
-    %739 = stablehlo.convert %738 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %740 = stablehlo.reduce(%725 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %741 = stablehlo.reshape %740 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %742 = stablehlo.broadcast_in_dim %741, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %743 = stablehlo.divide %742, %30 : tensor<1x19200x1xf32>
-    %744 = stablehlo.broadcast_in_dim %739, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %745 = stablehlo.add %744, %35 : tensor<1x19200x1xf32>
-    %746 = stablehlo.rsqrt %745 : tensor<1x19200x1xf32>
-    %747 = stablehlo.broadcast_in_dim %725, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %748 = stablehlo.broadcast_in_dim %743, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %749 = stablehlo.subtract %747, %748 : tensor<1x19200x64xf32>
-    %750 = stablehlo.broadcast_in_dim %749, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %751 = stablehlo.broadcast_in_dim %746, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %752 = stablehlo.multiply %750, %751 : tensor<1x19200x64xf32>
-    %753 = stablehlo.convert %arg33 : (tensor<64xbf16>) -> tensor<64xf32>
-    %754 = stablehlo.broadcast_in_dim %752, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %755 = stablehlo.broadcast_in_dim %753, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %756 = stablehlo.multiply %754, %755 : tensor<1x19200x64xf32>
-    %757 = stablehlo.convert %arg34 : (tensor<64xbf16>) -> tensor<64xf32>
-    %758 = stablehlo.broadcast_in_dim %756, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %759 = stablehlo.broadcast_in_dim %757, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %760 = stablehlo.add %758, %759 : tensor<1x19200x64xf32>
-    %761 = stablehlo.convert %760 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %762 = stablehlo.reshape %761 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %763 = stablehlo.convert %762 : (tensor<19200x64xbf16>) -> tensor<19200x64xf32>
-    %764 = stablehlo.dot_general %763, %arg522, contracting_dims = [1] x [0] : (tensor<19200x64xf32>, tensor<64x256xf32>) -> tensor<19200x256xf32>
-    %765 = stablehlo.broadcast_in_dim %764, dims = [0, 1] : (tensor<19200x256xf32>) -> tensor<19200x256xf32>
-    %766 = stablehlo.multiply %765, %260 : tensor<19200x256xf32>
-    %767 = stablehlo.broadcast_in_dim %766, dims = [0, 1] : (tensor<19200x256xf32>) -> tensor<19200x256xf32>
-    %768 = stablehlo.broadcast_in_dim %arg523, dims = [1] : (tensor<256xf32>) -> tensor<19200x256xf32>
-    %769 = stablehlo.add %767, %768 : tensor<19200x256xf32>
-    %770 = stablehlo.convert %769 : (tensor<19200x256xf32>) -> tensor<19200x256xbf16>
-    %771 = stablehlo.reshape %770 : (tensor<19200x256xbf16>) -> tensor<1x19200x256xbf16>
-    %772 = stablehlo.transpose %771, dims = [0, 2, 1] : (tensor<1x19200x256xbf16>) -> tensor<1x256x19200xbf16>
-    %773 = stablehlo.reshape %772 : (tensor<1x256x19200xbf16>) -> tensor<1x256x120x160xbf16>
-    %774 = stablehlo.convolution(%773, %arg35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x120x160xbf16>, tensor<256x1x3x3xbf16>) -> tensor<1x256x120x160xbf16>
-    %775 = stablehlo.reshape %arg36 : (tensor<256xbf16>) -> tensor<256x1x1xbf16>
-    %776 = stablehlo.broadcast_in_dim %774, dims = [0, 1, 2, 3] : (tensor<1x256x120x160xbf16>) -> tensor<1x256x120x160xbf16>
-    %777 = stablehlo.broadcast_in_dim %775, dims = [1, 2, 3] : (tensor<256x1x1xbf16>) -> tensor<1x256x120x160xbf16>
-    %778 = stablehlo.add %776, %777 : tensor<1x256x120x160xbf16>
-    %779 = stablehlo.reshape %778 : (tensor<1x256x120x160xbf16>) -> tensor<1x256x19200xbf16>
-    %780 = stablehlo.transpose %779, dims = [0, 2, 1] : (tensor<1x256x19200xbf16>) -> tensor<1x19200x256xbf16>
-    %781 = stablehlo.multiply %780, %cst_4 : tensor<1x19200x256xbf16>
-    %782 = stablehlo.multiply %780, %277 : tensor<1x19200x256xbf16>
-    %783 = stablehlo.convert %782 : (tensor<1x19200x256xbf16>) -> tensor<1x19200x256xf32>
-    %784 = stablehlo.clamp %cst_5, %783, %cst_6 : tensor<1x19200x256xf32>
-    %785 = stablehlo.multiply %784, %784 : tensor<1x19200x256xf32>
-    %786 = stablehlo.multiply %cst_7, %785 : tensor<1x19200x256xf32>
-    %787 = stablehlo.add %786, %cst_8 : tensor<1x19200x256xf32>
-    %788 = stablehlo.multiply %787, %785 : tensor<1x19200x256xf32>
-    %789 = stablehlo.add %788, %cst_9 : tensor<1x19200x256xf32>
-    %790 = stablehlo.multiply %789, %785 : tensor<1x19200x256xf32>
-    %791 = stablehlo.add %790, %cst_10 : tensor<1x19200x256xf32>
-    %792 = stablehlo.multiply %791, %785 : tensor<1x19200x256xf32>
-    %793 = stablehlo.add %792, %cst_11 : tensor<1x19200x256xf32>
-    %794 = stablehlo.multiply %793, %785 : tensor<1x19200x256xf32>
-    %795 = stablehlo.add %794, %cst_12 : tensor<1x19200x256xf32>
-    %796 = stablehlo.multiply %795, %785 : tensor<1x19200x256xf32>
-    %797 = stablehlo.add %796, %cst_13 : tensor<1x19200x256xf32>
-    %798 = stablehlo.multiply %cst_14, %785 : tensor<1x19200x256xf32>
-    %799 = stablehlo.add %798, %cst_15 : tensor<1x19200x256xf32>
-    %800 = stablehlo.multiply %799, %785 : tensor<1x19200x256xf32>
-    %801 = stablehlo.add %800, %cst_16 : tensor<1x19200x256xf32>
-    %802 = stablehlo.multiply %801, %785 : tensor<1x19200x256xf32>
-    %803 = stablehlo.add %802, %cst_17 : tensor<1x19200x256xf32>
-    %804 = stablehlo.multiply %803, %785 : tensor<1x19200x256xf32>
-    %805 = stablehlo.add %804, %cst_18 : tensor<1x19200x256xf32>
-    %806 = stablehlo.multiply %784, %797 : tensor<1x19200x256xf32>
-    %807 = stablehlo.divide %806, %805 : tensor<1x19200x256xf32>
-    %808 = stablehlo.clamp %cst_19, %807, %cst_20 : tensor<1x19200x256xf32>
-    %809 = stablehlo.convert %808 : (tensor<1x19200x256xf32>) -> tensor<1x19200x256xbf16>
-    %810 = stablehlo.add %809, %cst_2 : tensor<1x19200x256xbf16>
-    %811 = stablehlo.multiply %810, %781 : tensor<1x19200x256xbf16>
-    %812 = stablehlo.reshape %811 : (tensor<1x19200x256xbf16>) -> tensor<19200x256xbf16>
-    %813 = stablehlo.dot_general %812, %arg524, contracting_dims = [1] x [0] : (tensor<19200x256xbf16>, tensor<256x64xbf16>) -> tensor<19200x64xbf16>
-    %814 = stablehlo.reshape %813 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %815 = stablehlo.broadcast_in_dim %814, dims = [0, 1, 2] : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %816 = stablehlo.broadcast_in_dim %arg37, dims = [2] : (tensor<64xbf16>) -> tensor<1x19200x64xbf16>
-    %817 = stablehlo.add %815, %816 : tensor<1x19200x64xbf16>
-    %818 = stablehlo.reshape %817 : (tensor<1x19200x64xbf16>) -> tensor<19200x64xbf16>
-    %819 = stablehlo.reshape %818 : (tensor<19200x64xbf16>) -> tensor<1x19200x64xbf16>
-    %820 = stablehlo.add %819, %724 : tensor<1x19200x64xbf16>
-    %821 = stablehlo.convert %820 : (tensor<1x19200x64xbf16>) -> tensor<1x19200x64xf32>
-    %822 = stablehlo.convert %821 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf64>
-    %823 = stablehlo.reduce(%822 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %824 = stablehlo.reshape %823 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %825 = stablehlo.broadcast_in_dim %824, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %826 = stablehlo.divide %825, %14 : tensor<1x19200x1xf64>
-    %827 = stablehlo.broadcast_in_dim %822, dims = [0, 1, 2] : (tensor<1x19200x64xf64>) -> tensor<1x19200x64xf64>
-    %828 = stablehlo.broadcast_in_dim %826, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x64xf64>
-    %829 = stablehlo.subtract %827, %828 : tensor<1x19200x64xf64>
-    %830 = stablehlo.multiply %829, %829 : tensor<1x19200x64xf64>
-    %831 = stablehlo.reduce(%830 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf64>, tensor<f64>) -> tensor<1x19200xf64>
-    %832 = stablehlo.reshape %831 : (tensor<1x19200xf64>) -> tensor<1x19200x1xf64>
-    %833 = stablehlo.broadcast_in_dim %832, dims = [0, 1, 2] : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf64>
-    %834 = stablehlo.divide %833, %14 : tensor<1x19200x1xf64>
-    %835 = stablehlo.convert %834 : (tensor<1x19200x1xf64>) -> tensor<1x19200x1xf32>
-    %836 = stablehlo.reduce(%821 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x19200x64xf32>, tensor<f32>) -> tensor<1x19200xf32>
-    %837 = stablehlo.reshape %836 : (tensor<1x19200xf32>) -> tensor<1x19200x1xf32>
-    %838 = stablehlo.broadcast_in_dim %837, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %839 = stablehlo.divide %838, %30 : tensor<1x19200x1xf32>
-    %840 = stablehlo.broadcast_in_dim %835, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x1xf32>
-    %841 = stablehlo.add %840, %35 : tensor<1x19200x1xf32>
-    %842 = stablehlo.rsqrt %841 : tensor<1x19200x1xf32>
-    %843 = stablehlo.broadcast_in_dim %821, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %844 = stablehlo.broadcast_in_dim %839, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %845 = stablehlo.subtract %843, %844 : tensor<1x19200x64xf32>
-    %846 = stablehlo.broadcast_in_dim %845, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %847 = stablehlo.broadcast_in_dim %842, dims = [0, 1, 2] : (tensor<1x19200x1xf32>) -> tensor<1x19200x64xf32>
-    %848 = stablehlo.multiply %846, %847 : tensor<1x19200x64xf32>
-    %849 = stablehlo.convert %arg38 : (tensor<64xbf16>) -> tensor<64xf32>
-    %850 = stablehlo.broadcast_in_dim %848, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %851 = stablehlo.broadcast_in_dim %849, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %852 = stablehlo.multiply %850, %851 : tensor<1x19200x64xf32>
-    %853 = stablehlo.convert %arg39 : (tensor<64xbf16>) -> tensor<64xf32>
-    %854 = stablehlo.broadcast_in_dim %852, dims = [0, 1, 2] : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xf32>
-    %855 = stablehlo.broadcast_in_dim %853, dims = [2] : (tensor<64xf32>) -> tensor<1x19200x64xf32>
-    %856 = stablehlo.add %854, %855 : tensor<1x19200x64xf32>
-    %857 = stablehlo.convert %856 : (tensor<1x19200x64xf32>) -> tensor<1x19200x64xbf16>
-    %858 = stablehlo.reshape %857 : (tensor<1x19200x64xbf16>) -> tensor<1x120x160x64xbf16>
-    %859 = stablehlo.transpose %858, dims = [0, 3, 1, 2] : (tensor<1x120x160x64xbf16>) -> tensor<1x64x120x160xbf16>
-    %860 = stablehlo.convolution(%859, %arg40) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x120x160xbf16>, tensor<128x64x3x3xbf16>) -> tensor<1x128x60x80xbf16>
-    %861 = stablehlo.reshape %arg41 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %862 = stablehlo.broadcast_in_dim %860, dims = [0, 1, 2, 3] : (tensor<1x128x60x80xbf16>) -> tensor<1x128x60x80xbf16>
-    %863 = stablehlo.broadcast_in_dim %861, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x60x80xbf16>
-    %864 = stablehlo.add %862, %863 : tensor<1x128x60x80xbf16>
-    %865 = stablehlo.reshape %864 : (tensor<1x128x60x80xbf16>) -> tensor<1x128x4800xbf16>
-    %866 = stablehlo.transpose %865, dims = [0, 2, 1] : (tensor<1x128x4800xbf16>) -> tensor<1x4800x128xbf16>
-    %867 = stablehlo.convert %866 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %868 = stablehlo.convert %867 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %869 = stablehlo.reduce(%868 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %870 = stablehlo.reshape %869 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %871 = stablehlo.convert %cst_89 : (tensor<1xi64>) -> tensor<1xf64>
-    %872 = stablehlo.reshape %871 : (tensor<1xf64>) -> tensor<f64>
-    %873 = stablehlo.broadcast_in_dim %870, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %874 = stablehlo.broadcast_in_dim %872, dims = [] : (tensor<f64>) -> tensor<1x4800x1xf64>
-    %875 = stablehlo.divide %873, %874 : tensor<1x4800x1xf64>
-    %876 = stablehlo.broadcast_in_dim %868, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %877 = stablehlo.broadcast_in_dim %875, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %878 = stablehlo.subtract %876, %877 : tensor<1x4800x128xf64>
-    %879 = stablehlo.multiply %878, %878 : tensor<1x4800x128xf64>
-    %880 = stablehlo.reduce(%879 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %881 = stablehlo.reshape %880 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %882 = stablehlo.broadcast_in_dim %881, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %883 = stablehlo.divide %882, %874 : tensor<1x4800x1xf64>
-    %884 = stablehlo.convert %883 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %885 = stablehlo.reduce(%867 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %886 = stablehlo.reshape %885 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %887 = stablehlo.convert %cst_89 : (tensor<1xi64>) -> tensor<1xf32>
-    %888 = stablehlo.reshape %887 : (tensor<1xf32>) -> tensor<f32>
-    %889 = stablehlo.broadcast_in_dim %886, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %890 = stablehlo.broadcast_in_dim %888, dims = [] : (tensor<f32>) -> tensor<1x4800x1xf32>
-    %891 = stablehlo.divide %889, %890 : tensor<1x4800x1xf32>
-    %892 = stablehlo.broadcast_in_dim %884, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %893 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x4800x1xf32>
-    %894 = stablehlo.add %892, %893 : tensor<1x4800x1xf32>
-    %895 = stablehlo.rsqrt %894 : tensor<1x4800x1xf32>
-    %896 = stablehlo.broadcast_in_dim %867, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %897 = stablehlo.broadcast_in_dim %891, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %898 = stablehlo.subtract %896, %897 : tensor<1x4800x128xf32>
-    %899 = stablehlo.broadcast_in_dim %898, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %900 = stablehlo.broadcast_in_dim %895, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %901 = stablehlo.multiply %899, %900 : tensor<1x4800x128xf32>
-    %902 = stablehlo.convert %arg42 : (tensor<128xbf16>) -> tensor<128xf32>
-    %903 = stablehlo.broadcast_in_dim %901, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %904 = stablehlo.broadcast_in_dim %902, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %905 = stablehlo.multiply %903, %904 : tensor<1x4800x128xf32>
-    %906 = stablehlo.convert %arg43 : (tensor<128xbf16>) -> tensor<128xf32>
-    %907 = stablehlo.broadcast_in_dim %905, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %908 = stablehlo.broadcast_in_dim %906, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %909 = stablehlo.add %907, %908 : tensor<1x4800x128xf32>
-    %910 = stablehlo.convert %909 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %911 = stablehlo.convert %910 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %912 = stablehlo.convert %911 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %913 = stablehlo.reduce(%912 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %914 = stablehlo.reshape %913 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %915 = stablehlo.broadcast_in_dim %914, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %916 = stablehlo.divide %915, %874 : tensor<1x4800x1xf64>
-    %917 = stablehlo.broadcast_in_dim %912, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %918 = stablehlo.broadcast_in_dim %916, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %919 = stablehlo.subtract %917, %918 : tensor<1x4800x128xf64>
-    %920 = stablehlo.multiply %919, %919 : tensor<1x4800x128xf64>
-    %921 = stablehlo.reduce(%920 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %922 = stablehlo.reshape %921 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %923 = stablehlo.broadcast_in_dim %922, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %924 = stablehlo.divide %923, %874 : tensor<1x4800x1xf64>
-    %925 = stablehlo.convert %924 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %926 = stablehlo.reduce(%911 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %927 = stablehlo.reshape %926 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %928 = stablehlo.broadcast_in_dim %927, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %929 = stablehlo.divide %928, %890 : tensor<1x4800x1xf32>
-    %930 = stablehlo.broadcast_in_dim %925, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %931 = stablehlo.add %930, %893 : tensor<1x4800x1xf32>
-    %932 = stablehlo.rsqrt %931 : tensor<1x4800x1xf32>
-    %933 = stablehlo.broadcast_in_dim %911, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %934 = stablehlo.broadcast_in_dim %929, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %935 = stablehlo.subtract %933, %934 : tensor<1x4800x128xf32>
-    %936 = stablehlo.broadcast_in_dim %935, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %937 = stablehlo.broadcast_in_dim %932, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %938 = stablehlo.multiply %936, %937 : tensor<1x4800x128xf32>
-    %939 = stablehlo.convert %arg44 : (tensor<128xbf16>) -> tensor<128xf32>
-    %940 = stablehlo.broadcast_in_dim %938, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %941 = stablehlo.broadcast_in_dim %939, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %942 = stablehlo.multiply %940, %941 : tensor<1x4800x128xf32>
-    %943 = stablehlo.convert %arg45 : (tensor<128xbf16>) -> tensor<128xf32>
-    %944 = stablehlo.broadcast_in_dim %942, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %945 = stablehlo.broadcast_in_dim %943, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %946 = stablehlo.add %944, %945 : tensor<1x4800x128xf32>
-    %947 = stablehlo.convert %946 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %948 = stablehlo.reshape %947 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %949 = stablehlo.convert %948 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %950 = stablehlo.dot_general %949, %arg525, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %951 = stablehlo.broadcast_in_dim %950, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %952 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<4800x128xf32>
-    %953 = stablehlo.multiply %951, %952 : tensor<4800x128xf32>
-    %954 = stablehlo.broadcast_in_dim %953, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %955 = stablehlo.broadcast_in_dim %arg526, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %956 = stablehlo.add %954, %955 : tensor<4800x128xf32>
-    %957 = stablehlo.convert %956 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %958 = stablehlo.reshape %957 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %959 = stablehlo.reshape %958 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %960 = stablehlo.transpose %959, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %961 = stablehlo.transpose %947, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %962 = stablehlo.reshape %961 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %963 = stablehlo.convolution(%962, %arg46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %964 = stablehlo.reshape %arg47 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %965 = stablehlo.broadcast_in_dim %963, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %966 = stablehlo.broadcast_in_dim %964, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %967 = stablehlo.add %965, %966 : tensor<1x128x15x20xbf16>
-    %968 = stablehlo.reshape %967 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %969 = stablehlo.transpose %968, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %970 = stablehlo.convert %969 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %971 = stablehlo.convert %970 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %972 = stablehlo.reduce(%971 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %973 = stablehlo.reshape %972 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %974 = stablehlo.broadcast_in_dim %973, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %975 = stablehlo.broadcast_in_dim %872, dims = [] : (tensor<f64>) -> tensor<1x300x1xf64>
-    %976 = stablehlo.divide %974, %975 : tensor<1x300x1xf64>
-    %977 = stablehlo.broadcast_in_dim %971, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %978 = stablehlo.broadcast_in_dim %976, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %979 = stablehlo.subtract %977, %978 : tensor<1x300x128xf64>
-    %980 = stablehlo.multiply %979, %979 : tensor<1x300x128xf64>
-    %981 = stablehlo.reduce(%980 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %982 = stablehlo.reshape %981 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %983 = stablehlo.broadcast_in_dim %982, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %984 = stablehlo.divide %983, %975 : tensor<1x300x1xf64>
-    %985 = stablehlo.convert %984 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %986 = stablehlo.reduce(%970 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %987 = stablehlo.reshape %986 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %988 = stablehlo.broadcast_in_dim %987, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %989 = stablehlo.broadcast_in_dim %888, dims = [] : (tensor<f32>) -> tensor<1x300x1xf32>
-    %990 = stablehlo.divide %988, %989 : tensor<1x300x1xf32>
-    %991 = stablehlo.broadcast_in_dim %985, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %992 = stablehlo.add %991, %136 : tensor<1x300x1xf32>
-    %993 = stablehlo.rsqrt %992 : tensor<1x300x1xf32>
-    %994 = stablehlo.broadcast_in_dim %970, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %995 = stablehlo.broadcast_in_dim %990, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %996 = stablehlo.subtract %994, %995 : tensor<1x300x128xf32>
-    %997 = stablehlo.broadcast_in_dim %996, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %998 = stablehlo.broadcast_in_dim %993, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %999 = stablehlo.multiply %997, %998 : tensor<1x300x128xf32>
-    %1000 = stablehlo.convert %arg48 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1001 = stablehlo.broadcast_in_dim %999, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1002 = stablehlo.broadcast_in_dim %1000, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1003 = stablehlo.multiply %1001, %1002 : tensor<1x300x128xf32>
-    %1004 = stablehlo.convert %arg49 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1005 = stablehlo.broadcast_in_dim %1003, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1006 = stablehlo.broadcast_in_dim %1004, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1007 = stablehlo.add %1005, %1006 : tensor<1x300x128xf32>
-    %1008 = stablehlo.convert %1007 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %1009 = stablehlo.reshape %1008 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %1010 = stablehlo.convert %1009 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %1011 = stablehlo.dot_general %1010, %arg527, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1012 = stablehlo.broadcast_in_dim %1011, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1013 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<300x128xf32>
-    %1014 = stablehlo.multiply %1012, %1013 : tensor<300x128xf32>
-    %1015 = stablehlo.broadcast_in_dim %1014, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1016 = stablehlo.broadcast_in_dim %arg528, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1017 = stablehlo.add %1015, %1016 : tensor<300x128xf32>
-    %1018 = stablehlo.convert %1017 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1019 = stablehlo.reshape %1018 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1020 = stablehlo.reshape %1019 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1021 = stablehlo.transpose %1020, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1022 = stablehlo.dot_general %1010, %arg529, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1023 = stablehlo.broadcast_in_dim %1022, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1024 = stablehlo.multiply %1023, %1013 : tensor<300x128xf32>
-    %1025 = stablehlo.broadcast_in_dim %1024, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1026 = stablehlo.broadcast_in_dim %arg530, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1027 = stablehlo.add %1025, %1026 : tensor<300x128xf32>
-    %1028 = stablehlo.convert %1027 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1029 = stablehlo.reshape %1028 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1030 = stablehlo.reshape %1029 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1031 = stablehlo.transpose %1030, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1032 = stablehlo.transpose %1021, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %1033 = stablehlo.reshape %960 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1034 = stablehlo.reshape %1032 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1035 = stablehlo.broadcast_in_dim %1034, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1036 = stablehlo.dot_general %1033, %1035, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1037 = stablehlo.reshape %1036 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1038 = stablehlo.broadcast_in_dim %1037, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1039 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x2x4800x300xbf16>
-    %1040 = stablehlo.divide %1038, %1039 : tensor<1x2x4800x300xbf16>
-    %1041 = stablehlo.convert %1040 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %1042 = stablehlo.reduce(%1041 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1043 = stablehlo.reshape %1042 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1044 = stablehlo.broadcast_in_dim %1041, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1045 = stablehlo.broadcast_in_dim %1043, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1046 = stablehlo.subtract %1044, %1045 : tensor<1x2x4800x300xf32>
-    %1047 = stablehlo.exponential %1046 : tensor<1x2x4800x300xf32>
-    %1048 = stablehlo.reduce(%1047 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1049 = stablehlo.reshape %1048 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1050 = stablehlo.broadcast_in_dim %1047, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1051 = stablehlo.broadcast_in_dim %1049, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1052 = stablehlo.divide %1050, %1051 : tensor<1x2x4800x300xf32>
-    %1053 = stablehlo.convert %1052 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %1054 = stablehlo.reshape %1053 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1055 = stablehlo.reshape %1031 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1056 = stablehlo.broadcast_in_dim %1055, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1057 = stablehlo.dot_general %1054, %1056, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1058 = stablehlo.reshape %1057 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1059 = stablehlo.transpose %1058, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1060 = stablehlo.reshape %1059 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %1061 = stablehlo.reshape %1060 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1062 = stablehlo.convert %1061 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1063 = stablehlo.dot_general %1062, %arg531, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1064 = stablehlo.broadcast_in_dim %1063, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1065 = stablehlo.multiply %1064, %952 : tensor<4800x128xf32>
-    %1066 = stablehlo.broadcast_in_dim %1065, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1067 = stablehlo.broadcast_in_dim %arg532, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1068 = stablehlo.add %1066, %1067 : tensor<4800x128xf32>
-    %1069 = stablehlo.convert %1068 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1070 = stablehlo.reshape %1069 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1071 = stablehlo.add %1070, %910 : tensor<1x4800x128xbf16>
-    %1072 = stablehlo.convert %1071 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1073 = stablehlo.convert %1072 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1074 = stablehlo.reduce(%1073 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1075 = stablehlo.reshape %1074 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1076 = stablehlo.broadcast_in_dim %1075, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1077 = stablehlo.divide %1076, %874 : tensor<1x4800x1xf64>
-    %1078 = stablehlo.broadcast_in_dim %1073, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1079 = stablehlo.broadcast_in_dim %1077, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1080 = stablehlo.subtract %1078, %1079 : tensor<1x4800x128xf64>
-    %1081 = stablehlo.multiply %1080, %1080 : tensor<1x4800x128xf64>
-    %1082 = stablehlo.reduce(%1081 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1083 = stablehlo.reshape %1082 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1084 = stablehlo.broadcast_in_dim %1083, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1085 = stablehlo.divide %1084, %874 : tensor<1x4800x1xf64>
-    %1086 = stablehlo.convert %1085 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1087 = stablehlo.reduce(%1072 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1088 = stablehlo.reshape %1087 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1089 = stablehlo.broadcast_in_dim %1088, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1090 = stablehlo.divide %1089, %890 : tensor<1x4800x1xf32>
-    %1091 = stablehlo.broadcast_in_dim %1086, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1092 = stablehlo.add %1091, %893 : tensor<1x4800x1xf32>
-    %1093 = stablehlo.rsqrt %1092 : tensor<1x4800x1xf32>
-    %1094 = stablehlo.broadcast_in_dim %1072, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1095 = stablehlo.broadcast_in_dim %1090, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1096 = stablehlo.subtract %1094, %1095 : tensor<1x4800x128xf32>
-    %1097 = stablehlo.broadcast_in_dim %1096, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1098 = stablehlo.broadcast_in_dim %1093, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1099 = stablehlo.multiply %1097, %1098 : tensor<1x4800x128xf32>
-    %1100 = stablehlo.convert %arg50 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1101 = stablehlo.broadcast_in_dim %1099, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1102 = stablehlo.broadcast_in_dim %1100, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1103 = stablehlo.multiply %1101, %1102 : tensor<1x4800x128xf32>
-    %1104 = stablehlo.convert %arg51 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1105 = stablehlo.broadcast_in_dim %1103, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1106 = stablehlo.broadcast_in_dim %1104, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1107 = stablehlo.add %1105, %1106 : tensor<1x4800x128xf32>
-    %1108 = stablehlo.convert %1107 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1109 = stablehlo.reshape %1108 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1110 = stablehlo.convert %1109 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1111 = stablehlo.dot_general %1110, %arg533, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %1112 = stablehlo.broadcast_in_dim %1111, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1113 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<4800x512xf32>
-    %1114 = stablehlo.multiply %1112, %1113 : tensor<4800x512xf32>
-    %1115 = stablehlo.broadcast_in_dim %1114, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1116 = stablehlo.broadcast_in_dim %arg534, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %1117 = stablehlo.add %1115, %1116 : tensor<4800x512xf32>
-    %1118 = stablehlo.convert %1117 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %1119 = stablehlo.reshape %1118 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %1120 = stablehlo.transpose %1119, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %1121 = stablehlo.reshape %1120 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %1122 = stablehlo.convolution(%1121, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %1123 = stablehlo.reshape %arg53 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %1124 = stablehlo.broadcast_in_dim %1122, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %1125 = stablehlo.broadcast_in_dim %1123, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %1126 = stablehlo.add %1124, %1125 : tensor<1x512x60x80xbf16>
-    %1127 = stablehlo.reshape %1126 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %1128 = stablehlo.transpose %1127, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %1129 = stablehlo.multiply %1128, %cst_23 : tensor<1x4800x512xbf16>
-    %1130 = stablehlo.rsqrt %cst_22 : tensor<1x4800x512xbf16>
-    %1131 = stablehlo.multiply %1128, %1130 : tensor<1x4800x512xbf16>
-    %1132 = stablehlo.convert %1131 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %1133 = stablehlo.clamp %cst_24, %1132, %cst_25 : tensor<1x4800x512xf32>
-    %1134 = stablehlo.multiply %1133, %1133 : tensor<1x4800x512xf32>
-    %1135 = stablehlo.multiply %cst_26, %1134 : tensor<1x4800x512xf32>
-    %1136 = stablehlo.add %1135, %cst_27 : tensor<1x4800x512xf32>
-    %1137 = stablehlo.multiply %1136, %1134 : tensor<1x4800x512xf32>
-    %1138 = stablehlo.add %1137, %cst_28 : tensor<1x4800x512xf32>
-    %1139 = stablehlo.multiply %1138, %1134 : tensor<1x4800x512xf32>
-    %1140 = stablehlo.add %1139, %cst_29 : tensor<1x4800x512xf32>
-    %1141 = stablehlo.multiply %1140, %1134 : tensor<1x4800x512xf32>
-    %1142 = stablehlo.add %1141, %cst_30 : tensor<1x4800x512xf32>
-    %1143 = stablehlo.multiply %1142, %1134 : tensor<1x4800x512xf32>
-    %1144 = stablehlo.add %1143, %cst_31 : tensor<1x4800x512xf32>
-    %1145 = stablehlo.multiply %1144, %1134 : tensor<1x4800x512xf32>
-    %1146 = stablehlo.add %1145, %cst_32 : tensor<1x4800x512xf32>
-    %1147 = stablehlo.multiply %cst_33, %1134 : tensor<1x4800x512xf32>
-    %1148 = stablehlo.add %1147, %cst_34 : tensor<1x4800x512xf32>
-    %1149 = stablehlo.multiply %1148, %1134 : tensor<1x4800x512xf32>
-    %1150 = stablehlo.add %1149, %cst_35 : tensor<1x4800x512xf32>
-    %1151 = stablehlo.multiply %1150, %1134 : tensor<1x4800x512xf32>
-    %1152 = stablehlo.add %1151, %cst_36 : tensor<1x4800x512xf32>
-    %1153 = stablehlo.multiply %1152, %1134 : tensor<1x4800x512xf32>
-    %1154 = stablehlo.add %1153, %cst_37 : tensor<1x4800x512xf32>
-    %1155 = stablehlo.multiply %1133, %1146 : tensor<1x4800x512xf32>
-    %1156 = stablehlo.divide %1155, %1154 : tensor<1x4800x512xf32>
-    %1157 = stablehlo.clamp %cst_38, %1156, %cst_39 : tensor<1x4800x512xf32>
-    %1158 = stablehlo.convert %1157 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %1159 = stablehlo.add %1158, %cst_21 : tensor<1x4800x512xbf16>
-    %1160 = stablehlo.multiply %1159, %1129 : tensor<1x4800x512xbf16>
-    %1161 = stablehlo.reshape %1160 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %1162 = stablehlo.dot_general %1161, %arg535, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %1163 = stablehlo.reshape %1162 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1164 = stablehlo.broadcast_in_dim %1163, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1165 = stablehlo.broadcast_in_dim %arg54, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %1166 = stablehlo.add %1164, %1165 : tensor<1x4800x128xbf16>
-    %1167 = stablehlo.reshape %1166 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1168 = stablehlo.reshape %1167 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1169 = stablehlo.add %1168, %1071 : tensor<1x4800x128xbf16>
-    %1170 = stablehlo.convert %1169 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1171 = stablehlo.convert %1170 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1172 = stablehlo.reduce(%1171 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1173 = stablehlo.reshape %1172 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1174 = stablehlo.broadcast_in_dim %1173, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1175 = stablehlo.divide %1174, %874 : tensor<1x4800x1xf64>
-    %1176 = stablehlo.broadcast_in_dim %1171, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1177 = stablehlo.broadcast_in_dim %1175, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1178 = stablehlo.subtract %1176, %1177 : tensor<1x4800x128xf64>
-    %1179 = stablehlo.multiply %1178, %1178 : tensor<1x4800x128xf64>
-    %1180 = stablehlo.reduce(%1179 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1181 = stablehlo.reshape %1180 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1182 = stablehlo.broadcast_in_dim %1181, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1183 = stablehlo.divide %1182, %874 : tensor<1x4800x1xf64>
-    %1184 = stablehlo.convert %1183 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1185 = stablehlo.reduce(%1170 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1186 = stablehlo.reshape %1185 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1187 = stablehlo.broadcast_in_dim %1186, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1188 = stablehlo.divide %1187, %890 : tensor<1x4800x1xf32>
-    %1189 = stablehlo.broadcast_in_dim %1184, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1190 = stablehlo.add %1189, %893 : tensor<1x4800x1xf32>
-    %1191 = stablehlo.rsqrt %1190 : tensor<1x4800x1xf32>
-    %1192 = stablehlo.broadcast_in_dim %1170, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1193 = stablehlo.broadcast_in_dim %1188, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1194 = stablehlo.subtract %1192, %1193 : tensor<1x4800x128xf32>
-    %1195 = stablehlo.broadcast_in_dim %1194, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1196 = stablehlo.broadcast_in_dim %1191, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1197 = stablehlo.multiply %1195, %1196 : tensor<1x4800x128xf32>
-    %1198 = stablehlo.convert %arg55 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1199 = stablehlo.broadcast_in_dim %1197, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1200 = stablehlo.broadcast_in_dim %1198, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1201 = stablehlo.multiply %1199, %1200 : tensor<1x4800x128xf32>
-    %1202 = stablehlo.convert %arg56 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1203 = stablehlo.broadcast_in_dim %1201, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1204 = stablehlo.broadcast_in_dim %1202, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1205 = stablehlo.add %1203, %1204 : tensor<1x4800x128xf32>
-    %1206 = stablehlo.convert %1205 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1207 = stablehlo.reshape %1206 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1208 = stablehlo.convert %1207 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1209 = stablehlo.dot_general %1208, %arg536, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1210 = stablehlo.broadcast_in_dim %1209, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1211 = stablehlo.multiply %1210, %952 : tensor<4800x128xf32>
-    %1212 = stablehlo.broadcast_in_dim %1211, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1213 = stablehlo.broadcast_in_dim %arg537, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1214 = stablehlo.add %1212, %1213 : tensor<4800x128xf32>
-    %1215 = stablehlo.convert %1214 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1216 = stablehlo.reshape %1215 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1217 = stablehlo.reshape %1216 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1218 = stablehlo.transpose %1217, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1219 = stablehlo.transpose %1206, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %1220 = stablehlo.reshape %1219 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %1221 = stablehlo.convolution(%1220, %arg57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %1222 = stablehlo.reshape %arg58 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %1223 = stablehlo.broadcast_in_dim %1221, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %1224 = stablehlo.broadcast_in_dim %1222, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %1225 = stablehlo.add %1223, %1224 : tensor<1x128x15x20xbf16>
-    %1226 = stablehlo.reshape %1225 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %1227 = stablehlo.transpose %1226, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %1228 = stablehlo.convert %1227 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %1229 = stablehlo.convert %1228 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %1230 = stablehlo.reduce(%1229 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1231 = stablehlo.reshape %1230 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1232 = stablehlo.broadcast_in_dim %1231, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1233 = stablehlo.divide %1232, %975 : tensor<1x300x1xf64>
-    %1234 = stablehlo.broadcast_in_dim %1229, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %1235 = stablehlo.broadcast_in_dim %1233, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %1236 = stablehlo.subtract %1234, %1235 : tensor<1x300x128xf64>
-    %1237 = stablehlo.multiply %1236, %1236 : tensor<1x300x128xf64>
-    %1238 = stablehlo.reduce(%1237 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1239 = stablehlo.reshape %1238 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1240 = stablehlo.broadcast_in_dim %1239, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1241 = stablehlo.divide %1240, %975 : tensor<1x300x1xf64>
-    %1242 = stablehlo.convert %1241 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %1243 = stablehlo.reduce(%1228 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %1244 = stablehlo.reshape %1243 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %1245 = stablehlo.broadcast_in_dim %1244, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %1246 = stablehlo.divide %1245, %989 : tensor<1x300x1xf32>
-    %1247 = stablehlo.broadcast_in_dim %1242, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %1248 = stablehlo.add %1247, %136 : tensor<1x300x1xf32>
-    %1249 = stablehlo.rsqrt %1248 : tensor<1x300x1xf32>
-    %1250 = stablehlo.broadcast_in_dim %1228, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1251 = stablehlo.broadcast_in_dim %1246, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %1252 = stablehlo.subtract %1250, %1251 : tensor<1x300x128xf32>
-    %1253 = stablehlo.broadcast_in_dim %1252, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1254 = stablehlo.broadcast_in_dim %1249, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %1255 = stablehlo.multiply %1253, %1254 : tensor<1x300x128xf32>
-    %1256 = stablehlo.convert %arg59 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1257 = stablehlo.broadcast_in_dim %1255, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1258 = stablehlo.broadcast_in_dim %1256, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1259 = stablehlo.multiply %1257, %1258 : tensor<1x300x128xf32>
-    %1260 = stablehlo.convert %arg60 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1261 = stablehlo.broadcast_in_dim %1259, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1262 = stablehlo.broadcast_in_dim %1260, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1263 = stablehlo.add %1261, %1262 : tensor<1x300x128xf32>
-    %1264 = stablehlo.convert %1263 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %1265 = stablehlo.reshape %1264 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %1266 = stablehlo.convert %1265 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %1267 = stablehlo.dot_general %1266, %arg538, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1268 = stablehlo.broadcast_in_dim %1267, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1269 = stablehlo.multiply %1268, %1013 : tensor<300x128xf32>
-    %1270 = stablehlo.broadcast_in_dim %1269, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1271 = stablehlo.broadcast_in_dim %arg539, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1272 = stablehlo.add %1270, %1271 : tensor<300x128xf32>
-    %1273 = stablehlo.convert %1272 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1274 = stablehlo.reshape %1273 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1275 = stablehlo.reshape %1274 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1276 = stablehlo.transpose %1275, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1277 = stablehlo.dot_general %1266, %arg540, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1278 = stablehlo.broadcast_in_dim %1277, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1279 = stablehlo.multiply %1278, %1013 : tensor<300x128xf32>
-    %1280 = stablehlo.broadcast_in_dim %1279, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1281 = stablehlo.broadcast_in_dim %arg541, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1282 = stablehlo.add %1280, %1281 : tensor<300x128xf32>
-    %1283 = stablehlo.convert %1282 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1284 = stablehlo.reshape %1283 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1285 = stablehlo.reshape %1284 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1286 = stablehlo.transpose %1285, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1287 = stablehlo.transpose %1276, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %1288 = stablehlo.reshape %1218 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1289 = stablehlo.reshape %1287 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1290 = stablehlo.broadcast_in_dim %1289, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1291 = stablehlo.dot_general %1288, %1290, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1292 = stablehlo.reshape %1291 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1293 = stablehlo.broadcast_in_dim %1292, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1294 = stablehlo.divide %1293, %1039 : tensor<1x2x4800x300xbf16>
-    %1295 = stablehlo.convert %1294 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %1296 = stablehlo.reduce(%1295 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1297 = stablehlo.reshape %1296 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1298 = stablehlo.broadcast_in_dim %1295, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1299 = stablehlo.broadcast_in_dim %1297, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1300 = stablehlo.subtract %1298, %1299 : tensor<1x2x4800x300xf32>
-    %1301 = stablehlo.exponential %1300 : tensor<1x2x4800x300xf32>
-    %1302 = stablehlo.reduce(%1301 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1303 = stablehlo.reshape %1302 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1304 = stablehlo.broadcast_in_dim %1301, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1305 = stablehlo.broadcast_in_dim %1303, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1306 = stablehlo.divide %1304, %1305 : tensor<1x2x4800x300xf32>
-    %1307 = stablehlo.convert %1306 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %1308 = stablehlo.reshape %1307 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1309 = stablehlo.reshape %1286 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1310 = stablehlo.broadcast_in_dim %1309, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1311 = stablehlo.dot_general %1308, %1310, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1312 = stablehlo.reshape %1311 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1313 = stablehlo.transpose %1312, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1314 = stablehlo.reshape %1313 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %1315 = stablehlo.reshape %1314 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1316 = stablehlo.convert %1315 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1317 = stablehlo.dot_general %1316, %arg542, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1318 = stablehlo.broadcast_in_dim %1317, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1319 = stablehlo.multiply %1318, %952 : tensor<4800x128xf32>
-    %1320 = stablehlo.broadcast_in_dim %1319, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1321 = stablehlo.broadcast_in_dim %arg543, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1322 = stablehlo.add %1320, %1321 : tensor<4800x128xf32>
-    %1323 = stablehlo.convert %1322 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1324 = stablehlo.reshape %1323 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1325 = stablehlo.add %1324, %1169 : tensor<1x4800x128xbf16>
-    %1326 = stablehlo.convert %1325 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1327 = stablehlo.convert %1326 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1328 = stablehlo.reduce(%1327 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1329 = stablehlo.reshape %1328 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1330 = stablehlo.broadcast_in_dim %1329, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1331 = stablehlo.divide %1330, %874 : tensor<1x4800x1xf64>
-    %1332 = stablehlo.broadcast_in_dim %1327, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1333 = stablehlo.broadcast_in_dim %1331, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1334 = stablehlo.subtract %1332, %1333 : tensor<1x4800x128xf64>
-    %1335 = stablehlo.multiply %1334, %1334 : tensor<1x4800x128xf64>
-    %1336 = stablehlo.reduce(%1335 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1337 = stablehlo.reshape %1336 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1338 = stablehlo.broadcast_in_dim %1337, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1339 = stablehlo.divide %1338, %874 : tensor<1x4800x1xf64>
-    %1340 = stablehlo.convert %1339 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1341 = stablehlo.reduce(%1326 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1342 = stablehlo.reshape %1341 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1343 = stablehlo.broadcast_in_dim %1342, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1344 = stablehlo.divide %1343, %890 : tensor<1x4800x1xf32>
-    %1345 = stablehlo.broadcast_in_dim %1340, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1346 = stablehlo.add %1345, %893 : tensor<1x4800x1xf32>
-    %1347 = stablehlo.rsqrt %1346 : tensor<1x4800x1xf32>
-    %1348 = stablehlo.broadcast_in_dim %1326, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1349 = stablehlo.broadcast_in_dim %1344, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1350 = stablehlo.subtract %1348, %1349 : tensor<1x4800x128xf32>
-    %1351 = stablehlo.broadcast_in_dim %1350, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1352 = stablehlo.broadcast_in_dim %1347, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1353 = stablehlo.multiply %1351, %1352 : tensor<1x4800x128xf32>
-    %1354 = stablehlo.convert %arg61 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1355 = stablehlo.broadcast_in_dim %1353, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1356 = stablehlo.broadcast_in_dim %1354, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1357 = stablehlo.multiply %1355, %1356 : tensor<1x4800x128xf32>
-    %1358 = stablehlo.convert %arg62 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1359 = stablehlo.broadcast_in_dim %1357, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1360 = stablehlo.broadcast_in_dim %1358, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1361 = stablehlo.add %1359, %1360 : tensor<1x4800x128xf32>
-    %1362 = stablehlo.convert %1361 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1363 = stablehlo.reshape %1362 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1364 = stablehlo.convert %1363 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1365 = stablehlo.dot_general %1364, %arg544, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %1366 = stablehlo.broadcast_in_dim %1365, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1367 = stablehlo.multiply %1366, %1113 : tensor<4800x512xf32>
-    %1368 = stablehlo.broadcast_in_dim %1367, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1369 = stablehlo.broadcast_in_dim %arg545, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %1370 = stablehlo.add %1368, %1369 : tensor<4800x512xf32>
-    %1371 = stablehlo.convert %1370 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %1372 = stablehlo.reshape %1371 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %1373 = stablehlo.transpose %1372, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %1374 = stablehlo.reshape %1373 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %1375 = stablehlo.convolution(%1374, %arg63) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %1376 = stablehlo.reshape %arg64 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %1377 = stablehlo.broadcast_in_dim %1375, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %1378 = stablehlo.broadcast_in_dim %1376, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %1379 = stablehlo.add %1377, %1378 : tensor<1x512x60x80xbf16>
-    %1380 = stablehlo.reshape %1379 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %1381 = stablehlo.transpose %1380, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %1382 = stablehlo.multiply %1381, %cst_23 : tensor<1x4800x512xbf16>
-    %1383 = stablehlo.multiply %1381, %1130 : tensor<1x4800x512xbf16>
-    %1384 = stablehlo.convert %1383 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %1385 = stablehlo.clamp %cst_24, %1384, %cst_25 : tensor<1x4800x512xf32>
-    %1386 = stablehlo.multiply %1385, %1385 : tensor<1x4800x512xf32>
-    %1387 = stablehlo.multiply %cst_26, %1386 : tensor<1x4800x512xf32>
-    %1388 = stablehlo.add %1387, %cst_27 : tensor<1x4800x512xf32>
-    %1389 = stablehlo.multiply %1388, %1386 : tensor<1x4800x512xf32>
-    %1390 = stablehlo.add %1389, %cst_28 : tensor<1x4800x512xf32>
-    %1391 = stablehlo.multiply %1390, %1386 : tensor<1x4800x512xf32>
-    %1392 = stablehlo.add %1391, %cst_29 : tensor<1x4800x512xf32>
-    %1393 = stablehlo.multiply %1392, %1386 : tensor<1x4800x512xf32>
-    %1394 = stablehlo.add %1393, %cst_30 : tensor<1x4800x512xf32>
-    %1395 = stablehlo.multiply %1394, %1386 : tensor<1x4800x512xf32>
-    %1396 = stablehlo.add %1395, %cst_31 : tensor<1x4800x512xf32>
-    %1397 = stablehlo.multiply %1396, %1386 : tensor<1x4800x512xf32>
-    %1398 = stablehlo.add %1397, %cst_32 : tensor<1x4800x512xf32>
-    %1399 = stablehlo.multiply %cst_33, %1386 : tensor<1x4800x512xf32>
-    %1400 = stablehlo.add %1399, %cst_34 : tensor<1x4800x512xf32>
-    %1401 = stablehlo.multiply %1400, %1386 : tensor<1x4800x512xf32>
-    %1402 = stablehlo.add %1401, %cst_35 : tensor<1x4800x512xf32>
-    %1403 = stablehlo.multiply %1402, %1386 : tensor<1x4800x512xf32>
-    %1404 = stablehlo.add %1403, %cst_36 : tensor<1x4800x512xf32>
-    %1405 = stablehlo.multiply %1404, %1386 : tensor<1x4800x512xf32>
-    %1406 = stablehlo.add %1405, %cst_37 : tensor<1x4800x512xf32>
-    %1407 = stablehlo.multiply %1385, %1398 : tensor<1x4800x512xf32>
-    %1408 = stablehlo.divide %1407, %1406 : tensor<1x4800x512xf32>
-    %1409 = stablehlo.clamp %cst_38, %1408, %cst_39 : tensor<1x4800x512xf32>
-    %1410 = stablehlo.convert %1409 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %1411 = stablehlo.add %1410, %cst_21 : tensor<1x4800x512xbf16>
-    %1412 = stablehlo.multiply %1411, %1382 : tensor<1x4800x512xbf16>
-    %1413 = stablehlo.reshape %1412 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %1414 = stablehlo.dot_general %1413, %arg546, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %1415 = stablehlo.reshape %1414 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1416 = stablehlo.broadcast_in_dim %1415, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1417 = stablehlo.broadcast_in_dim %arg65, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %1418 = stablehlo.add %1416, %1417 : tensor<1x4800x128xbf16>
-    %1419 = stablehlo.reshape %1418 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1420 = stablehlo.reshape %1419 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1421 = stablehlo.add %1420, %1325 : tensor<1x4800x128xbf16>
-    %1422 = stablehlo.convert %1421 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1423 = stablehlo.convert %1422 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1424 = stablehlo.reduce(%1423 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1425 = stablehlo.reshape %1424 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1426 = stablehlo.broadcast_in_dim %1425, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1427 = stablehlo.divide %1426, %874 : tensor<1x4800x1xf64>
-    %1428 = stablehlo.broadcast_in_dim %1423, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1429 = stablehlo.broadcast_in_dim %1427, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1430 = stablehlo.subtract %1428, %1429 : tensor<1x4800x128xf64>
-    %1431 = stablehlo.multiply %1430, %1430 : tensor<1x4800x128xf64>
-    %1432 = stablehlo.reduce(%1431 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1433 = stablehlo.reshape %1432 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1434 = stablehlo.broadcast_in_dim %1433, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1435 = stablehlo.divide %1434, %874 : tensor<1x4800x1xf64>
-    %1436 = stablehlo.convert %1435 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1437 = stablehlo.reduce(%1422 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1438 = stablehlo.reshape %1437 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1439 = stablehlo.broadcast_in_dim %1438, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1440 = stablehlo.divide %1439, %890 : tensor<1x4800x1xf32>
-    %1441 = stablehlo.broadcast_in_dim %1436, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1442 = stablehlo.add %1441, %893 : tensor<1x4800x1xf32>
-    %1443 = stablehlo.rsqrt %1442 : tensor<1x4800x1xf32>
-    %1444 = stablehlo.broadcast_in_dim %1422, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1445 = stablehlo.broadcast_in_dim %1440, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1446 = stablehlo.subtract %1444, %1445 : tensor<1x4800x128xf32>
-    %1447 = stablehlo.broadcast_in_dim %1446, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1448 = stablehlo.broadcast_in_dim %1443, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1449 = stablehlo.multiply %1447, %1448 : tensor<1x4800x128xf32>
-    %1450 = stablehlo.convert %arg66 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1451 = stablehlo.broadcast_in_dim %1449, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1452 = stablehlo.broadcast_in_dim %1450, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1453 = stablehlo.multiply %1451, %1452 : tensor<1x4800x128xf32>
-    %1454 = stablehlo.convert %arg67 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1455 = stablehlo.broadcast_in_dim %1453, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1456 = stablehlo.broadcast_in_dim %1454, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1457 = stablehlo.add %1455, %1456 : tensor<1x4800x128xf32>
-    %1458 = stablehlo.convert %1457 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1459 = stablehlo.reshape %1458 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1460 = stablehlo.convert %1459 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1461 = stablehlo.dot_general %1460, %arg547, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1462 = stablehlo.broadcast_in_dim %1461, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1463 = stablehlo.multiply %1462, %952 : tensor<4800x128xf32>
-    %1464 = stablehlo.broadcast_in_dim %1463, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1465 = stablehlo.broadcast_in_dim %arg548, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1466 = stablehlo.add %1464, %1465 : tensor<4800x128xf32>
-    %1467 = stablehlo.convert %1466 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1468 = stablehlo.reshape %1467 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1469 = stablehlo.reshape %1468 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1470 = stablehlo.transpose %1469, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1471 = stablehlo.transpose %1458, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %1472 = stablehlo.reshape %1471 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %1473 = stablehlo.convolution(%1472, %arg68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %1474 = stablehlo.reshape %arg69 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %1475 = stablehlo.broadcast_in_dim %1473, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %1476 = stablehlo.broadcast_in_dim %1474, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %1477 = stablehlo.add %1475, %1476 : tensor<1x128x15x20xbf16>
-    %1478 = stablehlo.reshape %1477 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %1479 = stablehlo.transpose %1478, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %1480 = stablehlo.convert %1479 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %1481 = stablehlo.convert %1480 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %1482 = stablehlo.reduce(%1481 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1483 = stablehlo.reshape %1482 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1484 = stablehlo.broadcast_in_dim %1483, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1485 = stablehlo.divide %1484, %975 : tensor<1x300x1xf64>
-    %1486 = stablehlo.broadcast_in_dim %1481, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %1487 = stablehlo.broadcast_in_dim %1485, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %1488 = stablehlo.subtract %1486, %1487 : tensor<1x300x128xf64>
-    %1489 = stablehlo.multiply %1488, %1488 : tensor<1x300x128xf64>
-    %1490 = stablehlo.reduce(%1489 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1491 = stablehlo.reshape %1490 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1492 = stablehlo.broadcast_in_dim %1491, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1493 = stablehlo.divide %1492, %975 : tensor<1x300x1xf64>
-    %1494 = stablehlo.convert %1493 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %1495 = stablehlo.reduce(%1480 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %1496 = stablehlo.reshape %1495 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %1497 = stablehlo.broadcast_in_dim %1496, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %1498 = stablehlo.divide %1497, %989 : tensor<1x300x1xf32>
-    %1499 = stablehlo.broadcast_in_dim %1494, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %1500 = stablehlo.add %1499, %136 : tensor<1x300x1xf32>
-    %1501 = stablehlo.rsqrt %1500 : tensor<1x300x1xf32>
-    %1502 = stablehlo.broadcast_in_dim %1480, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1503 = stablehlo.broadcast_in_dim %1498, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %1504 = stablehlo.subtract %1502, %1503 : tensor<1x300x128xf32>
-    %1505 = stablehlo.broadcast_in_dim %1504, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1506 = stablehlo.broadcast_in_dim %1501, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %1507 = stablehlo.multiply %1505, %1506 : tensor<1x300x128xf32>
-    %1508 = stablehlo.convert %arg70 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1509 = stablehlo.broadcast_in_dim %1507, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1510 = stablehlo.broadcast_in_dim %1508, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1511 = stablehlo.multiply %1509, %1510 : tensor<1x300x128xf32>
-    %1512 = stablehlo.convert %arg71 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1513 = stablehlo.broadcast_in_dim %1511, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1514 = stablehlo.broadcast_in_dim %1512, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1515 = stablehlo.add %1513, %1514 : tensor<1x300x128xf32>
-    %1516 = stablehlo.convert %1515 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %1517 = stablehlo.reshape %1516 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %1518 = stablehlo.convert %1517 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %1519 = stablehlo.dot_general %1518, %arg549, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1520 = stablehlo.broadcast_in_dim %1519, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1521 = stablehlo.multiply %1520, %1013 : tensor<300x128xf32>
-    %1522 = stablehlo.broadcast_in_dim %1521, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1523 = stablehlo.broadcast_in_dim %arg550, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1524 = stablehlo.add %1522, %1523 : tensor<300x128xf32>
-    %1525 = stablehlo.convert %1524 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1526 = stablehlo.reshape %1525 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1527 = stablehlo.reshape %1526 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1528 = stablehlo.transpose %1527, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1529 = stablehlo.dot_general %1518, %arg551, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1530 = stablehlo.broadcast_in_dim %1529, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1531 = stablehlo.multiply %1530, %1013 : tensor<300x128xf32>
-    %1532 = stablehlo.broadcast_in_dim %1531, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1533 = stablehlo.broadcast_in_dim %arg552, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1534 = stablehlo.add %1532, %1533 : tensor<300x128xf32>
-    %1535 = stablehlo.convert %1534 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1536 = stablehlo.reshape %1535 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1537 = stablehlo.reshape %1536 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1538 = stablehlo.transpose %1537, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1539 = stablehlo.transpose %1528, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %1540 = stablehlo.reshape %1470 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1541 = stablehlo.reshape %1539 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1542 = stablehlo.broadcast_in_dim %1541, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1543 = stablehlo.dot_general %1540, %1542, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1544 = stablehlo.reshape %1543 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1545 = stablehlo.broadcast_in_dim %1544, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1546 = stablehlo.divide %1545, %1039 : tensor<1x2x4800x300xbf16>
-    %1547 = stablehlo.convert %1546 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %1548 = stablehlo.reduce(%1547 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1549 = stablehlo.reshape %1548 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1550 = stablehlo.broadcast_in_dim %1547, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1551 = stablehlo.broadcast_in_dim %1549, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1552 = stablehlo.subtract %1550, %1551 : tensor<1x2x4800x300xf32>
-    %1553 = stablehlo.exponential %1552 : tensor<1x2x4800x300xf32>
-    %1554 = stablehlo.reduce(%1553 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1555 = stablehlo.reshape %1554 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1556 = stablehlo.broadcast_in_dim %1553, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1557 = stablehlo.broadcast_in_dim %1555, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1558 = stablehlo.divide %1556, %1557 : tensor<1x2x4800x300xf32>
-    %1559 = stablehlo.convert %1558 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %1560 = stablehlo.reshape %1559 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1561 = stablehlo.reshape %1538 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1562 = stablehlo.broadcast_in_dim %1561, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1563 = stablehlo.dot_general %1560, %1562, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1564 = stablehlo.reshape %1563 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1565 = stablehlo.transpose %1564, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1566 = stablehlo.reshape %1565 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %1567 = stablehlo.reshape %1566 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1568 = stablehlo.convert %1567 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1569 = stablehlo.dot_general %1568, %arg553, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1570 = stablehlo.broadcast_in_dim %1569, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1571 = stablehlo.multiply %1570, %952 : tensor<4800x128xf32>
-    %1572 = stablehlo.broadcast_in_dim %1571, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1573 = stablehlo.broadcast_in_dim %arg554, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1574 = stablehlo.add %1572, %1573 : tensor<4800x128xf32>
-    %1575 = stablehlo.convert %1574 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1576 = stablehlo.reshape %1575 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1577 = stablehlo.add %1576, %1421 : tensor<1x4800x128xbf16>
-    %1578 = stablehlo.convert %1577 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1579 = stablehlo.convert %1578 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1580 = stablehlo.reduce(%1579 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1581 = stablehlo.reshape %1580 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1582 = stablehlo.broadcast_in_dim %1581, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1583 = stablehlo.divide %1582, %874 : tensor<1x4800x1xf64>
-    %1584 = stablehlo.broadcast_in_dim %1579, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1585 = stablehlo.broadcast_in_dim %1583, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1586 = stablehlo.subtract %1584, %1585 : tensor<1x4800x128xf64>
-    %1587 = stablehlo.multiply %1586, %1586 : tensor<1x4800x128xf64>
-    %1588 = stablehlo.reduce(%1587 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1589 = stablehlo.reshape %1588 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1590 = stablehlo.broadcast_in_dim %1589, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1591 = stablehlo.divide %1590, %874 : tensor<1x4800x1xf64>
-    %1592 = stablehlo.convert %1591 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1593 = stablehlo.reduce(%1578 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1594 = stablehlo.reshape %1593 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1595 = stablehlo.broadcast_in_dim %1594, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1596 = stablehlo.divide %1595, %890 : tensor<1x4800x1xf32>
-    %1597 = stablehlo.broadcast_in_dim %1592, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1598 = stablehlo.add %1597, %893 : tensor<1x4800x1xf32>
-    %1599 = stablehlo.rsqrt %1598 : tensor<1x4800x1xf32>
-    %1600 = stablehlo.broadcast_in_dim %1578, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1601 = stablehlo.broadcast_in_dim %1596, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1602 = stablehlo.subtract %1600, %1601 : tensor<1x4800x128xf32>
-    %1603 = stablehlo.broadcast_in_dim %1602, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1604 = stablehlo.broadcast_in_dim %1599, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1605 = stablehlo.multiply %1603, %1604 : tensor<1x4800x128xf32>
-    %1606 = stablehlo.convert %arg72 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1607 = stablehlo.broadcast_in_dim %1605, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1608 = stablehlo.broadcast_in_dim %1606, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1609 = stablehlo.multiply %1607, %1608 : tensor<1x4800x128xf32>
-    %1610 = stablehlo.convert %arg73 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1611 = stablehlo.broadcast_in_dim %1609, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1612 = stablehlo.broadcast_in_dim %1610, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1613 = stablehlo.add %1611, %1612 : tensor<1x4800x128xf32>
-    %1614 = stablehlo.convert %1613 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1615 = stablehlo.reshape %1614 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1616 = stablehlo.convert %1615 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1617 = stablehlo.dot_general %1616, %arg555, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %1618 = stablehlo.broadcast_in_dim %1617, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1619 = stablehlo.multiply %1618, %1113 : tensor<4800x512xf32>
-    %1620 = stablehlo.broadcast_in_dim %1619, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1621 = stablehlo.broadcast_in_dim %arg556, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %1622 = stablehlo.add %1620, %1621 : tensor<4800x512xf32>
-    %1623 = stablehlo.convert %1622 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %1624 = stablehlo.reshape %1623 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %1625 = stablehlo.transpose %1624, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %1626 = stablehlo.reshape %1625 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %1627 = stablehlo.convolution(%1626, %arg74) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %1628 = stablehlo.reshape %arg75 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %1629 = stablehlo.broadcast_in_dim %1627, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %1630 = stablehlo.broadcast_in_dim %1628, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %1631 = stablehlo.add %1629, %1630 : tensor<1x512x60x80xbf16>
-    %1632 = stablehlo.reshape %1631 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %1633 = stablehlo.transpose %1632, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %1634 = stablehlo.multiply %1633, %cst_23 : tensor<1x4800x512xbf16>
-    %1635 = stablehlo.multiply %1633, %1130 : tensor<1x4800x512xbf16>
-    %1636 = stablehlo.convert %1635 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %1637 = stablehlo.clamp %cst_24, %1636, %cst_25 : tensor<1x4800x512xf32>
-    %1638 = stablehlo.multiply %1637, %1637 : tensor<1x4800x512xf32>
-    %1639 = stablehlo.multiply %cst_26, %1638 : tensor<1x4800x512xf32>
-    %1640 = stablehlo.add %1639, %cst_27 : tensor<1x4800x512xf32>
-    %1641 = stablehlo.multiply %1640, %1638 : tensor<1x4800x512xf32>
-    %1642 = stablehlo.add %1641, %cst_28 : tensor<1x4800x512xf32>
-    %1643 = stablehlo.multiply %1642, %1638 : tensor<1x4800x512xf32>
-    %1644 = stablehlo.add %1643, %cst_29 : tensor<1x4800x512xf32>
-    %1645 = stablehlo.multiply %1644, %1638 : tensor<1x4800x512xf32>
-    %1646 = stablehlo.add %1645, %cst_30 : tensor<1x4800x512xf32>
-    %1647 = stablehlo.multiply %1646, %1638 : tensor<1x4800x512xf32>
-    %1648 = stablehlo.add %1647, %cst_31 : tensor<1x4800x512xf32>
-    %1649 = stablehlo.multiply %1648, %1638 : tensor<1x4800x512xf32>
-    %1650 = stablehlo.add %1649, %cst_32 : tensor<1x4800x512xf32>
-    %1651 = stablehlo.multiply %cst_33, %1638 : tensor<1x4800x512xf32>
-    %1652 = stablehlo.add %1651, %cst_34 : tensor<1x4800x512xf32>
-    %1653 = stablehlo.multiply %1652, %1638 : tensor<1x4800x512xf32>
-    %1654 = stablehlo.add %1653, %cst_35 : tensor<1x4800x512xf32>
-    %1655 = stablehlo.multiply %1654, %1638 : tensor<1x4800x512xf32>
-    %1656 = stablehlo.add %1655, %cst_36 : tensor<1x4800x512xf32>
-    %1657 = stablehlo.multiply %1656, %1638 : tensor<1x4800x512xf32>
-    %1658 = stablehlo.add %1657, %cst_37 : tensor<1x4800x512xf32>
-    %1659 = stablehlo.multiply %1637, %1650 : tensor<1x4800x512xf32>
-    %1660 = stablehlo.divide %1659, %1658 : tensor<1x4800x512xf32>
-    %1661 = stablehlo.clamp %cst_38, %1660, %cst_39 : tensor<1x4800x512xf32>
-    %1662 = stablehlo.convert %1661 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %1663 = stablehlo.add %1662, %cst_21 : tensor<1x4800x512xbf16>
-    %1664 = stablehlo.multiply %1663, %1634 : tensor<1x4800x512xbf16>
-    %1665 = stablehlo.reshape %1664 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %1666 = stablehlo.dot_general %1665, %arg557, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %1667 = stablehlo.reshape %1666 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1668 = stablehlo.broadcast_in_dim %1667, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1669 = stablehlo.broadcast_in_dim %arg76, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %1670 = stablehlo.add %1668, %1669 : tensor<1x4800x128xbf16>
-    %1671 = stablehlo.reshape %1670 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1672 = stablehlo.reshape %1671 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1673 = stablehlo.add %1672, %1577 : tensor<1x4800x128xbf16>
-    %1674 = stablehlo.convert %1673 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1675 = stablehlo.convert %1674 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1676 = stablehlo.reduce(%1675 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1677 = stablehlo.reshape %1676 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1678 = stablehlo.broadcast_in_dim %1677, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1679 = stablehlo.divide %1678, %874 : tensor<1x4800x1xf64>
-    %1680 = stablehlo.broadcast_in_dim %1675, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1681 = stablehlo.broadcast_in_dim %1679, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1682 = stablehlo.subtract %1680, %1681 : tensor<1x4800x128xf64>
-    %1683 = stablehlo.multiply %1682, %1682 : tensor<1x4800x128xf64>
-    %1684 = stablehlo.reduce(%1683 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1685 = stablehlo.reshape %1684 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1686 = stablehlo.broadcast_in_dim %1685, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1687 = stablehlo.divide %1686, %874 : tensor<1x4800x1xf64>
-    %1688 = stablehlo.convert %1687 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1689 = stablehlo.reduce(%1674 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1690 = stablehlo.reshape %1689 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1691 = stablehlo.broadcast_in_dim %1690, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1692 = stablehlo.divide %1691, %890 : tensor<1x4800x1xf32>
-    %1693 = stablehlo.broadcast_in_dim %1688, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1694 = stablehlo.add %1693, %893 : tensor<1x4800x1xf32>
-    %1695 = stablehlo.rsqrt %1694 : tensor<1x4800x1xf32>
-    %1696 = stablehlo.broadcast_in_dim %1674, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1697 = stablehlo.broadcast_in_dim %1692, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1698 = stablehlo.subtract %1696, %1697 : tensor<1x4800x128xf32>
-    %1699 = stablehlo.broadcast_in_dim %1698, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1700 = stablehlo.broadcast_in_dim %1695, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1701 = stablehlo.multiply %1699, %1700 : tensor<1x4800x128xf32>
-    %1702 = stablehlo.convert %arg77 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1703 = stablehlo.broadcast_in_dim %1701, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1704 = stablehlo.broadcast_in_dim %1702, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1705 = stablehlo.multiply %1703, %1704 : tensor<1x4800x128xf32>
-    %1706 = stablehlo.convert %arg78 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1707 = stablehlo.broadcast_in_dim %1705, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1708 = stablehlo.broadcast_in_dim %1706, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1709 = stablehlo.add %1707, %1708 : tensor<1x4800x128xf32>
-    %1710 = stablehlo.convert %1709 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1711 = stablehlo.reshape %1710 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1712 = stablehlo.convert %1711 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1713 = stablehlo.dot_general %1712, %arg558, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1714 = stablehlo.broadcast_in_dim %1713, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1715 = stablehlo.multiply %1714, %952 : tensor<4800x128xf32>
-    %1716 = stablehlo.broadcast_in_dim %1715, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1717 = stablehlo.broadcast_in_dim %arg559, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1718 = stablehlo.add %1716, %1717 : tensor<4800x128xf32>
-    %1719 = stablehlo.convert %1718 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1720 = stablehlo.reshape %1719 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1721 = stablehlo.reshape %1720 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1722 = stablehlo.transpose %1721, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1723 = stablehlo.transpose %1710, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %1724 = stablehlo.reshape %1723 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %1725 = stablehlo.convolution(%1724, %arg79) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %1726 = stablehlo.reshape %arg80 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %1727 = stablehlo.broadcast_in_dim %1725, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %1728 = stablehlo.broadcast_in_dim %1726, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %1729 = stablehlo.add %1727, %1728 : tensor<1x128x15x20xbf16>
-    %1730 = stablehlo.reshape %1729 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %1731 = stablehlo.transpose %1730, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %1732 = stablehlo.convert %1731 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %1733 = stablehlo.convert %1732 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %1734 = stablehlo.reduce(%1733 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1735 = stablehlo.reshape %1734 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1736 = stablehlo.broadcast_in_dim %1735, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1737 = stablehlo.divide %1736, %975 : tensor<1x300x1xf64>
-    %1738 = stablehlo.broadcast_in_dim %1733, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %1739 = stablehlo.broadcast_in_dim %1737, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %1740 = stablehlo.subtract %1738, %1739 : tensor<1x300x128xf64>
-    %1741 = stablehlo.multiply %1740, %1740 : tensor<1x300x128xf64>
-    %1742 = stablehlo.reduce(%1741 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1743 = stablehlo.reshape %1742 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1744 = stablehlo.broadcast_in_dim %1743, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1745 = stablehlo.divide %1744, %975 : tensor<1x300x1xf64>
-    %1746 = stablehlo.convert %1745 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %1747 = stablehlo.reduce(%1732 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %1748 = stablehlo.reshape %1747 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %1749 = stablehlo.broadcast_in_dim %1748, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %1750 = stablehlo.divide %1749, %989 : tensor<1x300x1xf32>
-    %1751 = stablehlo.broadcast_in_dim %1746, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %1752 = stablehlo.add %1751, %136 : tensor<1x300x1xf32>
-    %1753 = stablehlo.rsqrt %1752 : tensor<1x300x1xf32>
-    %1754 = stablehlo.broadcast_in_dim %1732, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1755 = stablehlo.broadcast_in_dim %1750, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %1756 = stablehlo.subtract %1754, %1755 : tensor<1x300x128xf32>
-    %1757 = stablehlo.broadcast_in_dim %1756, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1758 = stablehlo.broadcast_in_dim %1753, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %1759 = stablehlo.multiply %1757, %1758 : tensor<1x300x128xf32>
-    %1760 = stablehlo.convert %arg81 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1761 = stablehlo.broadcast_in_dim %1759, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1762 = stablehlo.broadcast_in_dim %1760, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1763 = stablehlo.multiply %1761, %1762 : tensor<1x300x128xf32>
-    %1764 = stablehlo.convert %arg82 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1765 = stablehlo.broadcast_in_dim %1763, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %1766 = stablehlo.broadcast_in_dim %1764, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %1767 = stablehlo.add %1765, %1766 : tensor<1x300x128xf32>
-    %1768 = stablehlo.convert %1767 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %1769 = stablehlo.reshape %1768 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %1770 = stablehlo.convert %1769 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %1771 = stablehlo.dot_general %1770, %arg560, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1772 = stablehlo.broadcast_in_dim %1771, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1773 = stablehlo.multiply %1772, %1013 : tensor<300x128xf32>
-    %1774 = stablehlo.broadcast_in_dim %1773, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1775 = stablehlo.broadcast_in_dim %arg561, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1776 = stablehlo.add %1774, %1775 : tensor<300x128xf32>
-    %1777 = stablehlo.convert %1776 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1778 = stablehlo.reshape %1777 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1779 = stablehlo.reshape %1778 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1780 = stablehlo.transpose %1779, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1781 = stablehlo.dot_general %1770, %arg562, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %1782 = stablehlo.broadcast_in_dim %1781, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1783 = stablehlo.multiply %1782, %1013 : tensor<300x128xf32>
-    %1784 = stablehlo.broadcast_in_dim %1783, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %1785 = stablehlo.broadcast_in_dim %arg563, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %1786 = stablehlo.add %1784, %1785 : tensor<300x128xf32>
-    %1787 = stablehlo.convert %1786 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %1788 = stablehlo.reshape %1787 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %1789 = stablehlo.reshape %1788 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %1790 = stablehlo.transpose %1789, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %1791 = stablehlo.transpose %1780, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %1792 = stablehlo.reshape %1722 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1793 = stablehlo.reshape %1791 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1794 = stablehlo.broadcast_in_dim %1793, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %1795 = stablehlo.dot_general %1792, %1794, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1796 = stablehlo.reshape %1795 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1797 = stablehlo.broadcast_in_dim %1796, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %1798 = stablehlo.divide %1797, %1039 : tensor<1x2x4800x300xbf16>
-    %1799 = stablehlo.convert %1798 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %1800 = stablehlo.reduce(%1799 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1801 = stablehlo.reshape %1800 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1802 = stablehlo.broadcast_in_dim %1799, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1803 = stablehlo.broadcast_in_dim %1801, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1804 = stablehlo.subtract %1802, %1803 : tensor<1x2x4800x300xf32>
-    %1805 = stablehlo.exponential %1804 : tensor<1x2x4800x300xf32>
-    %1806 = stablehlo.reduce(%1805 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %1807 = stablehlo.reshape %1806 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %1808 = stablehlo.broadcast_in_dim %1805, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %1809 = stablehlo.broadcast_in_dim %1807, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %1810 = stablehlo.divide %1808, %1809 : tensor<1x2x4800x300xf32>
-    %1811 = stablehlo.convert %1810 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %1812 = stablehlo.reshape %1811 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %1813 = stablehlo.reshape %1790 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1814 = stablehlo.broadcast_in_dim %1813, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %1815 = stablehlo.dot_general %1812, %1814, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %1816 = stablehlo.reshape %1815 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1817 = stablehlo.transpose %1816, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1818 = stablehlo.reshape %1817 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %1819 = stablehlo.reshape %1818 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1820 = stablehlo.convert %1819 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1821 = stablehlo.dot_general %1820, %arg564, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1822 = stablehlo.broadcast_in_dim %1821, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1823 = stablehlo.multiply %1822, %952 : tensor<4800x128xf32>
-    %1824 = stablehlo.broadcast_in_dim %1823, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1825 = stablehlo.broadcast_in_dim %arg565, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1826 = stablehlo.add %1824, %1825 : tensor<4800x128xf32>
-    %1827 = stablehlo.convert %1826 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1828 = stablehlo.reshape %1827 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1829 = stablehlo.add %1828, %1673 : tensor<1x4800x128xbf16>
-    %1830 = stablehlo.convert %1829 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1831 = stablehlo.convert %1830 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1832 = stablehlo.reduce(%1831 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1833 = stablehlo.reshape %1832 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1834 = stablehlo.broadcast_in_dim %1833, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1835 = stablehlo.divide %1834, %874 : tensor<1x4800x1xf64>
-    %1836 = stablehlo.broadcast_in_dim %1831, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1837 = stablehlo.broadcast_in_dim %1835, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1838 = stablehlo.subtract %1836, %1837 : tensor<1x4800x128xf64>
-    %1839 = stablehlo.multiply %1838, %1838 : tensor<1x4800x128xf64>
-    %1840 = stablehlo.reduce(%1839 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1841 = stablehlo.reshape %1840 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1842 = stablehlo.broadcast_in_dim %1841, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1843 = stablehlo.divide %1842, %874 : tensor<1x4800x1xf64>
-    %1844 = stablehlo.convert %1843 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1845 = stablehlo.reduce(%1830 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1846 = stablehlo.reshape %1845 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1847 = stablehlo.broadcast_in_dim %1846, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1848 = stablehlo.divide %1847, %890 : tensor<1x4800x1xf32>
-    %1849 = stablehlo.broadcast_in_dim %1844, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1850 = stablehlo.add %1849, %893 : tensor<1x4800x1xf32>
-    %1851 = stablehlo.rsqrt %1850 : tensor<1x4800x1xf32>
-    %1852 = stablehlo.broadcast_in_dim %1830, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1853 = stablehlo.broadcast_in_dim %1848, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1854 = stablehlo.subtract %1852, %1853 : tensor<1x4800x128xf32>
-    %1855 = stablehlo.broadcast_in_dim %1854, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1856 = stablehlo.broadcast_in_dim %1851, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1857 = stablehlo.multiply %1855, %1856 : tensor<1x4800x128xf32>
-    %1858 = stablehlo.convert %arg83 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1859 = stablehlo.broadcast_in_dim %1857, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1860 = stablehlo.broadcast_in_dim %1858, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1861 = stablehlo.multiply %1859, %1860 : tensor<1x4800x128xf32>
-    %1862 = stablehlo.convert %arg84 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1863 = stablehlo.broadcast_in_dim %1861, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1864 = stablehlo.broadcast_in_dim %1862, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1865 = stablehlo.add %1863, %1864 : tensor<1x4800x128xf32>
-    %1866 = stablehlo.convert %1865 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1867 = stablehlo.reshape %1866 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1868 = stablehlo.convert %1867 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1869 = stablehlo.dot_general %1868, %arg566, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %1870 = stablehlo.broadcast_in_dim %1869, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1871 = stablehlo.multiply %1870, %1113 : tensor<4800x512xf32>
-    %1872 = stablehlo.broadcast_in_dim %1871, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %1873 = stablehlo.broadcast_in_dim %arg567, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %1874 = stablehlo.add %1872, %1873 : tensor<4800x512xf32>
-    %1875 = stablehlo.convert %1874 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %1876 = stablehlo.reshape %1875 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %1877 = stablehlo.transpose %1876, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %1878 = stablehlo.reshape %1877 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %1879 = stablehlo.convolution(%1878, %arg85) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %1880 = stablehlo.reshape %arg86 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %1881 = stablehlo.broadcast_in_dim %1879, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %1882 = stablehlo.broadcast_in_dim %1880, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %1883 = stablehlo.add %1881, %1882 : tensor<1x512x60x80xbf16>
-    %1884 = stablehlo.reshape %1883 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %1885 = stablehlo.transpose %1884, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %1886 = stablehlo.multiply %1885, %cst_23 : tensor<1x4800x512xbf16>
-    %1887 = stablehlo.multiply %1885, %1130 : tensor<1x4800x512xbf16>
-    %1888 = stablehlo.convert %1887 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %1889 = stablehlo.clamp %cst_24, %1888, %cst_25 : tensor<1x4800x512xf32>
-    %1890 = stablehlo.multiply %1889, %1889 : tensor<1x4800x512xf32>
-    %1891 = stablehlo.multiply %cst_26, %1890 : tensor<1x4800x512xf32>
-    %1892 = stablehlo.add %1891, %cst_27 : tensor<1x4800x512xf32>
-    %1893 = stablehlo.multiply %1892, %1890 : tensor<1x4800x512xf32>
-    %1894 = stablehlo.add %1893, %cst_28 : tensor<1x4800x512xf32>
-    %1895 = stablehlo.multiply %1894, %1890 : tensor<1x4800x512xf32>
-    %1896 = stablehlo.add %1895, %cst_29 : tensor<1x4800x512xf32>
-    %1897 = stablehlo.multiply %1896, %1890 : tensor<1x4800x512xf32>
-    %1898 = stablehlo.add %1897, %cst_30 : tensor<1x4800x512xf32>
-    %1899 = stablehlo.multiply %1898, %1890 : tensor<1x4800x512xf32>
-    %1900 = stablehlo.add %1899, %cst_31 : tensor<1x4800x512xf32>
-    %1901 = stablehlo.multiply %1900, %1890 : tensor<1x4800x512xf32>
-    %1902 = stablehlo.add %1901, %cst_32 : tensor<1x4800x512xf32>
-    %1903 = stablehlo.multiply %cst_33, %1890 : tensor<1x4800x512xf32>
-    %1904 = stablehlo.add %1903, %cst_34 : tensor<1x4800x512xf32>
-    %1905 = stablehlo.multiply %1904, %1890 : tensor<1x4800x512xf32>
-    %1906 = stablehlo.add %1905, %cst_35 : tensor<1x4800x512xf32>
-    %1907 = stablehlo.multiply %1906, %1890 : tensor<1x4800x512xf32>
-    %1908 = stablehlo.add %1907, %cst_36 : tensor<1x4800x512xf32>
-    %1909 = stablehlo.multiply %1908, %1890 : tensor<1x4800x512xf32>
-    %1910 = stablehlo.add %1909, %cst_37 : tensor<1x4800x512xf32>
-    %1911 = stablehlo.multiply %1889, %1902 : tensor<1x4800x512xf32>
-    %1912 = stablehlo.divide %1911, %1910 : tensor<1x4800x512xf32>
-    %1913 = stablehlo.clamp %cst_38, %1912, %cst_39 : tensor<1x4800x512xf32>
-    %1914 = stablehlo.convert %1913 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %1915 = stablehlo.add %1914, %cst_21 : tensor<1x4800x512xbf16>
-    %1916 = stablehlo.multiply %1915, %1886 : tensor<1x4800x512xbf16>
-    %1917 = stablehlo.reshape %1916 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %1918 = stablehlo.dot_general %1917, %arg568, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %1919 = stablehlo.reshape %1918 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1920 = stablehlo.broadcast_in_dim %1919, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1921 = stablehlo.broadcast_in_dim %arg87, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %1922 = stablehlo.add %1920, %1921 : tensor<1x4800x128xbf16>
-    %1923 = stablehlo.reshape %1922 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1924 = stablehlo.reshape %1923 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1925 = stablehlo.add %1924, %1829 : tensor<1x4800x128xbf16>
-    %1926 = stablehlo.convert %1925 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %1927 = stablehlo.convert %1926 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %1928 = stablehlo.reduce(%1927 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1929 = stablehlo.reshape %1928 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1930 = stablehlo.broadcast_in_dim %1929, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1931 = stablehlo.divide %1930, %874 : tensor<1x4800x1xf64>
-    %1932 = stablehlo.broadcast_in_dim %1927, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %1933 = stablehlo.broadcast_in_dim %1931, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %1934 = stablehlo.subtract %1932, %1933 : tensor<1x4800x128xf64>
-    %1935 = stablehlo.multiply %1934, %1934 : tensor<1x4800x128xf64>
-    %1936 = stablehlo.reduce(%1935 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %1937 = stablehlo.reshape %1936 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %1938 = stablehlo.broadcast_in_dim %1937, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %1939 = stablehlo.divide %1938, %874 : tensor<1x4800x1xf64>
-    %1940 = stablehlo.convert %1939 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %1941 = stablehlo.reduce(%1926 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %1942 = stablehlo.reshape %1941 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %1943 = stablehlo.broadcast_in_dim %1942, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1944 = stablehlo.divide %1943, %890 : tensor<1x4800x1xf32>
-    %1945 = stablehlo.broadcast_in_dim %1940, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %1946 = stablehlo.add %1945, %893 : tensor<1x4800x1xf32>
-    %1947 = stablehlo.rsqrt %1946 : tensor<1x4800x1xf32>
-    %1948 = stablehlo.broadcast_in_dim %1926, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1949 = stablehlo.broadcast_in_dim %1944, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1950 = stablehlo.subtract %1948, %1949 : tensor<1x4800x128xf32>
-    %1951 = stablehlo.broadcast_in_dim %1950, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1952 = stablehlo.broadcast_in_dim %1947, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %1953 = stablehlo.multiply %1951, %1952 : tensor<1x4800x128xf32>
-    %1954 = stablehlo.convert %arg88 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1955 = stablehlo.broadcast_in_dim %1953, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1956 = stablehlo.broadcast_in_dim %1954, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1957 = stablehlo.multiply %1955, %1956 : tensor<1x4800x128xf32>
-    %1958 = stablehlo.convert %arg89 : (tensor<128xbf16>) -> tensor<128xf32>
-    %1959 = stablehlo.broadcast_in_dim %1957, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %1960 = stablehlo.broadcast_in_dim %1958, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %1961 = stablehlo.add %1959, %1960 : tensor<1x4800x128xf32>
-    %1962 = stablehlo.convert %1961 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %1963 = stablehlo.reshape %1962 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %1964 = stablehlo.convert %1963 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %1965 = stablehlo.dot_general %1964, %arg569, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %1966 = stablehlo.broadcast_in_dim %1965, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1967 = stablehlo.multiply %1966, %952 : tensor<4800x128xf32>
-    %1968 = stablehlo.broadcast_in_dim %1967, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %1969 = stablehlo.broadcast_in_dim %arg570, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %1970 = stablehlo.add %1968, %1969 : tensor<4800x128xf32>
-    %1971 = stablehlo.convert %1970 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %1972 = stablehlo.reshape %1971 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %1973 = stablehlo.reshape %1972 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %1974 = stablehlo.transpose %1973, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %1975 = stablehlo.transpose %1962, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %1976 = stablehlo.reshape %1975 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %1977 = stablehlo.convolution(%1976, %arg90) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %1978 = stablehlo.reshape %arg91 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %1979 = stablehlo.broadcast_in_dim %1977, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %1980 = stablehlo.broadcast_in_dim %1978, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %1981 = stablehlo.add %1979, %1980 : tensor<1x128x15x20xbf16>
-    %1982 = stablehlo.reshape %1981 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %1983 = stablehlo.transpose %1982, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %1984 = stablehlo.convert %1983 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %1985 = stablehlo.convert %1984 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %1986 = stablehlo.reduce(%1985 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1987 = stablehlo.reshape %1986 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1988 = stablehlo.broadcast_in_dim %1987, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1989 = stablehlo.divide %1988, %975 : tensor<1x300x1xf64>
-    %1990 = stablehlo.broadcast_in_dim %1985, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %1991 = stablehlo.broadcast_in_dim %1989, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %1992 = stablehlo.subtract %1990, %1991 : tensor<1x300x128xf64>
-    %1993 = stablehlo.multiply %1992, %1992 : tensor<1x300x128xf64>
-    %1994 = stablehlo.reduce(%1993 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %1995 = stablehlo.reshape %1994 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %1996 = stablehlo.broadcast_in_dim %1995, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %1997 = stablehlo.divide %1996, %975 : tensor<1x300x1xf64>
-    %1998 = stablehlo.convert %1997 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %1999 = stablehlo.reduce(%1984 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %2000 = stablehlo.reshape %1999 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %2001 = stablehlo.broadcast_in_dim %2000, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2002 = stablehlo.divide %2001, %989 : tensor<1x300x1xf32>
-    %2003 = stablehlo.broadcast_in_dim %1998, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2004 = stablehlo.add %2003, %136 : tensor<1x300x1xf32>
-    %2005 = stablehlo.rsqrt %2004 : tensor<1x300x1xf32>
-    %2006 = stablehlo.broadcast_in_dim %1984, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2007 = stablehlo.broadcast_in_dim %2002, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2008 = stablehlo.subtract %2006, %2007 : tensor<1x300x128xf32>
-    %2009 = stablehlo.broadcast_in_dim %2008, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2010 = stablehlo.broadcast_in_dim %2005, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2011 = stablehlo.multiply %2009, %2010 : tensor<1x300x128xf32>
-    %2012 = stablehlo.convert %arg92 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2013 = stablehlo.broadcast_in_dim %2011, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2014 = stablehlo.broadcast_in_dim %2012, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2015 = stablehlo.multiply %2013, %2014 : tensor<1x300x128xf32>
-    %2016 = stablehlo.convert %arg93 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2017 = stablehlo.broadcast_in_dim %2015, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2018 = stablehlo.broadcast_in_dim %2016, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2019 = stablehlo.add %2017, %2018 : tensor<1x300x128xf32>
-    %2020 = stablehlo.convert %2019 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %2021 = stablehlo.reshape %2020 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %2022 = stablehlo.convert %2021 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %2023 = stablehlo.dot_general %2022, %arg571, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2024 = stablehlo.broadcast_in_dim %2023, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2025 = stablehlo.multiply %2024, %1013 : tensor<300x128xf32>
-    %2026 = stablehlo.broadcast_in_dim %2025, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2027 = stablehlo.broadcast_in_dim %arg572, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2028 = stablehlo.add %2026, %2027 : tensor<300x128xf32>
-    %2029 = stablehlo.convert %2028 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2030 = stablehlo.reshape %2029 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2031 = stablehlo.reshape %2030 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2032 = stablehlo.transpose %2031, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2033 = stablehlo.dot_general %2022, %arg573, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2034 = stablehlo.broadcast_in_dim %2033, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2035 = stablehlo.multiply %2034, %1013 : tensor<300x128xf32>
-    %2036 = stablehlo.broadcast_in_dim %2035, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2037 = stablehlo.broadcast_in_dim %arg574, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2038 = stablehlo.add %2036, %2037 : tensor<300x128xf32>
-    %2039 = stablehlo.convert %2038 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2040 = stablehlo.reshape %2039 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2041 = stablehlo.reshape %2040 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2042 = stablehlo.transpose %2041, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2043 = stablehlo.transpose %2032, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %2044 = stablehlo.reshape %1974 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2045 = stablehlo.reshape %2043 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2046 = stablehlo.broadcast_in_dim %2045, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2047 = stablehlo.dot_general %2044, %2046, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2048 = stablehlo.reshape %2047 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2049 = stablehlo.broadcast_in_dim %2048, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2050 = stablehlo.divide %2049, %1039 : tensor<1x2x4800x300xbf16>
-    %2051 = stablehlo.convert %2050 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %2052 = stablehlo.reduce(%2051 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2053 = stablehlo.reshape %2052 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2054 = stablehlo.broadcast_in_dim %2051, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2055 = stablehlo.broadcast_in_dim %2053, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2056 = stablehlo.subtract %2054, %2055 : tensor<1x2x4800x300xf32>
-    %2057 = stablehlo.exponential %2056 : tensor<1x2x4800x300xf32>
-    %2058 = stablehlo.reduce(%2057 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2059 = stablehlo.reshape %2058 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2060 = stablehlo.broadcast_in_dim %2057, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2061 = stablehlo.broadcast_in_dim %2059, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2062 = stablehlo.divide %2060, %2061 : tensor<1x2x4800x300xf32>
-    %2063 = stablehlo.convert %2062 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %2064 = stablehlo.reshape %2063 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2065 = stablehlo.reshape %2042 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2066 = stablehlo.broadcast_in_dim %2065, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2067 = stablehlo.dot_general %2064, %2066, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2068 = stablehlo.reshape %2067 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %2069 = stablehlo.transpose %2068, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %2070 = stablehlo.reshape %2069 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %2071 = stablehlo.reshape %2070 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2072 = stablehlo.convert %2071 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2073 = stablehlo.dot_general %2072, %arg575, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %2074 = stablehlo.broadcast_in_dim %2073, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2075 = stablehlo.multiply %2074, %952 : tensor<4800x128xf32>
-    %2076 = stablehlo.broadcast_in_dim %2075, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2077 = stablehlo.broadcast_in_dim %arg576, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %2078 = stablehlo.add %2076, %2077 : tensor<4800x128xf32>
-    %2079 = stablehlo.convert %2078 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %2080 = stablehlo.reshape %2079 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2081 = stablehlo.add %2080, %1925 : tensor<1x4800x128xbf16>
-    %2082 = stablehlo.convert %2081 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2083 = stablehlo.convert %2082 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2084 = stablehlo.reduce(%2083 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2085 = stablehlo.reshape %2084 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2086 = stablehlo.broadcast_in_dim %2085, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2087 = stablehlo.divide %2086, %874 : tensor<1x4800x1xf64>
-    %2088 = stablehlo.broadcast_in_dim %2083, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2089 = stablehlo.broadcast_in_dim %2087, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2090 = stablehlo.subtract %2088, %2089 : tensor<1x4800x128xf64>
-    %2091 = stablehlo.multiply %2090, %2090 : tensor<1x4800x128xf64>
-    %2092 = stablehlo.reduce(%2091 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2093 = stablehlo.reshape %2092 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2094 = stablehlo.broadcast_in_dim %2093, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2095 = stablehlo.divide %2094, %874 : tensor<1x4800x1xf64>
-    %2096 = stablehlo.convert %2095 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2097 = stablehlo.reduce(%2082 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2098 = stablehlo.reshape %2097 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2099 = stablehlo.broadcast_in_dim %2098, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2100 = stablehlo.divide %2099, %890 : tensor<1x4800x1xf32>
-    %2101 = stablehlo.broadcast_in_dim %2096, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2102 = stablehlo.add %2101, %893 : tensor<1x4800x1xf32>
-    %2103 = stablehlo.rsqrt %2102 : tensor<1x4800x1xf32>
-    %2104 = stablehlo.broadcast_in_dim %2082, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2105 = stablehlo.broadcast_in_dim %2100, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2106 = stablehlo.subtract %2104, %2105 : tensor<1x4800x128xf32>
-    %2107 = stablehlo.broadcast_in_dim %2106, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2108 = stablehlo.broadcast_in_dim %2103, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2109 = stablehlo.multiply %2107, %2108 : tensor<1x4800x128xf32>
-    %2110 = stablehlo.convert %arg94 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2111 = stablehlo.broadcast_in_dim %2109, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2112 = stablehlo.broadcast_in_dim %2110, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2113 = stablehlo.multiply %2111, %2112 : tensor<1x4800x128xf32>
-    %2114 = stablehlo.convert %arg95 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2115 = stablehlo.broadcast_in_dim %2113, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2116 = stablehlo.broadcast_in_dim %2114, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2117 = stablehlo.add %2115, %2116 : tensor<1x4800x128xf32>
-    %2118 = stablehlo.convert %2117 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2119 = stablehlo.reshape %2118 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2120 = stablehlo.convert %2119 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2121 = stablehlo.dot_general %2120, %arg577, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %2122 = stablehlo.broadcast_in_dim %2121, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2123 = stablehlo.multiply %2122, %1113 : tensor<4800x512xf32>
-    %2124 = stablehlo.broadcast_in_dim %2123, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2125 = stablehlo.broadcast_in_dim %arg578, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %2126 = stablehlo.add %2124, %2125 : tensor<4800x512xf32>
-    %2127 = stablehlo.convert %2126 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %2128 = stablehlo.reshape %2127 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %2129 = stablehlo.transpose %2128, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %2130 = stablehlo.reshape %2129 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %2131 = stablehlo.convolution(%2130, %arg96) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %2132 = stablehlo.reshape %arg97 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %2133 = stablehlo.broadcast_in_dim %2131, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %2134 = stablehlo.broadcast_in_dim %2132, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %2135 = stablehlo.add %2133, %2134 : tensor<1x512x60x80xbf16>
-    %2136 = stablehlo.reshape %2135 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %2137 = stablehlo.transpose %2136, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %2138 = stablehlo.multiply %2137, %cst_23 : tensor<1x4800x512xbf16>
-    %2139 = stablehlo.multiply %2137, %1130 : tensor<1x4800x512xbf16>
-    %2140 = stablehlo.convert %2139 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %2141 = stablehlo.clamp %cst_24, %2140, %cst_25 : tensor<1x4800x512xf32>
-    %2142 = stablehlo.multiply %2141, %2141 : tensor<1x4800x512xf32>
-    %2143 = stablehlo.multiply %cst_26, %2142 : tensor<1x4800x512xf32>
-    %2144 = stablehlo.add %2143, %cst_27 : tensor<1x4800x512xf32>
-    %2145 = stablehlo.multiply %2144, %2142 : tensor<1x4800x512xf32>
-    %2146 = stablehlo.add %2145, %cst_28 : tensor<1x4800x512xf32>
-    %2147 = stablehlo.multiply %2146, %2142 : tensor<1x4800x512xf32>
-    %2148 = stablehlo.add %2147, %cst_29 : tensor<1x4800x512xf32>
-    %2149 = stablehlo.multiply %2148, %2142 : tensor<1x4800x512xf32>
-    %2150 = stablehlo.add %2149, %cst_30 : tensor<1x4800x512xf32>
-    %2151 = stablehlo.multiply %2150, %2142 : tensor<1x4800x512xf32>
-    %2152 = stablehlo.add %2151, %cst_31 : tensor<1x4800x512xf32>
-    %2153 = stablehlo.multiply %2152, %2142 : tensor<1x4800x512xf32>
-    %2154 = stablehlo.add %2153, %cst_32 : tensor<1x4800x512xf32>
-    %2155 = stablehlo.multiply %cst_33, %2142 : tensor<1x4800x512xf32>
-    %2156 = stablehlo.add %2155, %cst_34 : tensor<1x4800x512xf32>
-    %2157 = stablehlo.multiply %2156, %2142 : tensor<1x4800x512xf32>
-    %2158 = stablehlo.add %2157, %cst_35 : tensor<1x4800x512xf32>
-    %2159 = stablehlo.multiply %2158, %2142 : tensor<1x4800x512xf32>
-    %2160 = stablehlo.add %2159, %cst_36 : tensor<1x4800x512xf32>
-    %2161 = stablehlo.multiply %2160, %2142 : tensor<1x4800x512xf32>
-    %2162 = stablehlo.add %2161, %cst_37 : tensor<1x4800x512xf32>
-    %2163 = stablehlo.multiply %2141, %2154 : tensor<1x4800x512xf32>
-    %2164 = stablehlo.divide %2163, %2162 : tensor<1x4800x512xf32>
-    %2165 = stablehlo.clamp %cst_38, %2164, %cst_39 : tensor<1x4800x512xf32>
-    %2166 = stablehlo.convert %2165 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %2167 = stablehlo.add %2166, %cst_21 : tensor<1x4800x512xbf16>
-    %2168 = stablehlo.multiply %2167, %2138 : tensor<1x4800x512xbf16>
-    %2169 = stablehlo.reshape %2168 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %2170 = stablehlo.dot_general %2169, %arg579, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %2171 = stablehlo.reshape %2170 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2172 = stablehlo.broadcast_in_dim %2171, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2173 = stablehlo.broadcast_in_dim %arg98, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %2174 = stablehlo.add %2172, %2173 : tensor<1x4800x128xbf16>
-    %2175 = stablehlo.reshape %2174 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2176 = stablehlo.reshape %2175 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2177 = stablehlo.add %2176, %2081 : tensor<1x4800x128xbf16>
-    %2178 = stablehlo.convert %2177 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2179 = stablehlo.convert %2178 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2180 = stablehlo.reduce(%2179 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2181 = stablehlo.reshape %2180 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2182 = stablehlo.broadcast_in_dim %2181, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2183 = stablehlo.divide %2182, %874 : tensor<1x4800x1xf64>
-    %2184 = stablehlo.broadcast_in_dim %2179, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2185 = stablehlo.broadcast_in_dim %2183, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2186 = stablehlo.subtract %2184, %2185 : tensor<1x4800x128xf64>
-    %2187 = stablehlo.multiply %2186, %2186 : tensor<1x4800x128xf64>
-    %2188 = stablehlo.reduce(%2187 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2189 = stablehlo.reshape %2188 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2190 = stablehlo.broadcast_in_dim %2189, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2191 = stablehlo.divide %2190, %874 : tensor<1x4800x1xf64>
-    %2192 = stablehlo.convert %2191 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2193 = stablehlo.reduce(%2178 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2194 = stablehlo.reshape %2193 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2195 = stablehlo.broadcast_in_dim %2194, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2196 = stablehlo.divide %2195, %890 : tensor<1x4800x1xf32>
-    %2197 = stablehlo.broadcast_in_dim %2192, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2198 = stablehlo.add %2197, %893 : tensor<1x4800x1xf32>
-    %2199 = stablehlo.rsqrt %2198 : tensor<1x4800x1xf32>
-    %2200 = stablehlo.broadcast_in_dim %2178, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2201 = stablehlo.broadcast_in_dim %2196, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2202 = stablehlo.subtract %2200, %2201 : tensor<1x4800x128xf32>
-    %2203 = stablehlo.broadcast_in_dim %2202, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2204 = stablehlo.broadcast_in_dim %2199, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2205 = stablehlo.multiply %2203, %2204 : tensor<1x4800x128xf32>
-    %2206 = stablehlo.convert %arg99 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2207 = stablehlo.broadcast_in_dim %2205, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2208 = stablehlo.broadcast_in_dim %2206, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2209 = stablehlo.multiply %2207, %2208 : tensor<1x4800x128xf32>
-    %2210 = stablehlo.convert %arg100 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2211 = stablehlo.broadcast_in_dim %2209, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2212 = stablehlo.broadcast_in_dim %2210, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2213 = stablehlo.add %2211, %2212 : tensor<1x4800x128xf32>
-    %2214 = stablehlo.convert %2213 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2215 = stablehlo.reshape %2214 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2216 = stablehlo.convert %2215 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2217 = stablehlo.dot_general %2216, %arg580, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %2218 = stablehlo.broadcast_in_dim %2217, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2219 = stablehlo.multiply %2218, %952 : tensor<4800x128xf32>
-    %2220 = stablehlo.broadcast_in_dim %2219, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2221 = stablehlo.broadcast_in_dim %arg581, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %2222 = stablehlo.add %2220, %2221 : tensor<4800x128xf32>
-    %2223 = stablehlo.convert %2222 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %2224 = stablehlo.reshape %2223 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2225 = stablehlo.reshape %2224 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %2226 = stablehlo.transpose %2225, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %2227 = stablehlo.transpose %2214, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %2228 = stablehlo.reshape %2227 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %2229 = stablehlo.convolution(%2228, %arg101) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %2230 = stablehlo.reshape %arg102 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %2231 = stablehlo.broadcast_in_dim %2229, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %2232 = stablehlo.broadcast_in_dim %2230, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %2233 = stablehlo.add %2231, %2232 : tensor<1x128x15x20xbf16>
-    %2234 = stablehlo.reshape %2233 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %2235 = stablehlo.transpose %2234, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %2236 = stablehlo.convert %2235 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %2237 = stablehlo.convert %2236 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %2238 = stablehlo.reduce(%2237 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %2239 = stablehlo.reshape %2238 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %2240 = stablehlo.broadcast_in_dim %2239, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %2241 = stablehlo.divide %2240, %975 : tensor<1x300x1xf64>
-    %2242 = stablehlo.broadcast_in_dim %2237, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %2243 = stablehlo.broadcast_in_dim %2241, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %2244 = stablehlo.subtract %2242, %2243 : tensor<1x300x128xf64>
-    %2245 = stablehlo.multiply %2244, %2244 : tensor<1x300x128xf64>
-    %2246 = stablehlo.reduce(%2245 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %2247 = stablehlo.reshape %2246 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %2248 = stablehlo.broadcast_in_dim %2247, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %2249 = stablehlo.divide %2248, %975 : tensor<1x300x1xf64>
-    %2250 = stablehlo.convert %2249 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %2251 = stablehlo.reduce(%2236 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %2252 = stablehlo.reshape %2251 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %2253 = stablehlo.broadcast_in_dim %2252, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2254 = stablehlo.divide %2253, %989 : tensor<1x300x1xf32>
-    %2255 = stablehlo.broadcast_in_dim %2250, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2256 = stablehlo.add %2255, %136 : tensor<1x300x1xf32>
-    %2257 = stablehlo.rsqrt %2256 : tensor<1x300x1xf32>
-    %2258 = stablehlo.broadcast_in_dim %2236, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2259 = stablehlo.broadcast_in_dim %2254, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2260 = stablehlo.subtract %2258, %2259 : tensor<1x300x128xf32>
-    %2261 = stablehlo.broadcast_in_dim %2260, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2262 = stablehlo.broadcast_in_dim %2257, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2263 = stablehlo.multiply %2261, %2262 : tensor<1x300x128xf32>
-    %2264 = stablehlo.convert %arg103 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2265 = stablehlo.broadcast_in_dim %2263, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2266 = stablehlo.broadcast_in_dim %2264, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2267 = stablehlo.multiply %2265, %2266 : tensor<1x300x128xf32>
-    %2268 = stablehlo.convert %arg104 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2269 = stablehlo.broadcast_in_dim %2267, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2270 = stablehlo.broadcast_in_dim %2268, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2271 = stablehlo.add %2269, %2270 : tensor<1x300x128xf32>
-    %2272 = stablehlo.convert %2271 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %2273 = stablehlo.reshape %2272 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %2274 = stablehlo.convert %2273 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %2275 = stablehlo.dot_general %2274, %arg582, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2276 = stablehlo.broadcast_in_dim %2275, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2277 = stablehlo.multiply %2276, %1013 : tensor<300x128xf32>
-    %2278 = stablehlo.broadcast_in_dim %2277, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2279 = stablehlo.broadcast_in_dim %arg583, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2280 = stablehlo.add %2278, %2279 : tensor<300x128xf32>
-    %2281 = stablehlo.convert %2280 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2282 = stablehlo.reshape %2281 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2283 = stablehlo.reshape %2282 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2284 = stablehlo.transpose %2283, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2285 = stablehlo.dot_general %2274, %arg584, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2286 = stablehlo.broadcast_in_dim %2285, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2287 = stablehlo.multiply %2286, %1013 : tensor<300x128xf32>
-    %2288 = stablehlo.broadcast_in_dim %2287, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2289 = stablehlo.broadcast_in_dim %arg585, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2290 = stablehlo.add %2288, %2289 : tensor<300x128xf32>
-    %2291 = stablehlo.convert %2290 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2292 = stablehlo.reshape %2291 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2293 = stablehlo.reshape %2292 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2294 = stablehlo.transpose %2293, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2295 = stablehlo.transpose %2284, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %2296 = stablehlo.reshape %2226 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2297 = stablehlo.reshape %2295 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2298 = stablehlo.broadcast_in_dim %2297, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2299 = stablehlo.dot_general %2296, %2298, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2300 = stablehlo.reshape %2299 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2301 = stablehlo.broadcast_in_dim %2300, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2302 = stablehlo.divide %2301, %1039 : tensor<1x2x4800x300xbf16>
-    %2303 = stablehlo.convert %2302 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %2304 = stablehlo.reduce(%2303 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2305 = stablehlo.reshape %2304 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2306 = stablehlo.broadcast_in_dim %2303, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2307 = stablehlo.broadcast_in_dim %2305, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2308 = stablehlo.subtract %2306, %2307 : tensor<1x2x4800x300xf32>
-    %2309 = stablehlo.exponential %2308 : tensor<1x2x4800x300xf32>
-    %2310 = stablehlo.reduce(%2309 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2311 = stablehlo.reshape %2310 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2312 = stablehlo.broadcast_in_dim %2309, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2313 = stablehlo.broadcast_in_dim %2311, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2314 = stablehlo.divide %2312, %2313 : tensor<1x2x4800x300xf32>
-    %2315 = stablehlo.convert %2314 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %2316 = stablehlo.reshape %2315 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2317 = stablehlo.reshape %2294 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2318 = stablehlo.broadcast_in_dim %2317, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2319 = stablehlo.dot_general %2316, %2318, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2320 = stablehlo.reshape %2319 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %2321 = stablehlo.transpose %2320, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %2322 = stablehlo.reshape %2321 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %2323 = stablehlo.reshape %2322 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2324 = stablehlo.convert %2323 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2325 = stablehlo.dot_general %2324, %arg586, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %2326 = stablehlo.broadcast_in_dim %2325, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2327 = stablehlo.multiply %2326, %952 : tensor<4800x128xf32>
-    %2328 = stablehlo.broadcast_in_dim %2327, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2329 = stablehlo.broadcast_in_dim %arg587, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %2330 = stablehlo.add %2328, %2329 : tensor<4800x128xf32>
-    %2331 = stablehlo.convert %2330 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %2332 = stablehlo.reshape %2331 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2333 = stablehlo.add %2332, %2177 : tensor<1x4800x128xbf16>
-    %2334 = stablehlo.convert %2333 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2335 = stablehlo.convert %2334 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2336 = stablehlo.reduce(%2335 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2337 = stablehlo.reshape %2336 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2338 = stablehlo.broadcast_in_dim %2337, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2339 = stablehlo.divide %2338, %874 : tensor<1x4800x1xf64>
-    %2340 = stablehlo.broadcast_in_dim %2335, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2341 = stablehlo.broadcast_in_dim %2339, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2342 = stablehlo.subtract %2340, %2341 : tensor<1x4800x128xf64>
-    %2343 = stablehlo.multiply %2342, %2342 : tensor<1x4800x128xf64>
-    %2344 = stablehlo.reduce(%2343 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2345 = stablehlo.reshape %2344 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2346 = stablehlo.broadcast_in_dim %2345, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2347 = stablehlo.divide %2346, %874 : tensor<1x4800x1xf64>
-    %2348 = stablehlo.convert %2347 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2349 = stablehlo.reduce(%2334 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2350 = stablehlo.reshape %2349 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2351 = stablehlo.broadcast_in_dim %2350, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2352 = stablehlo.divide %2351, %890 : tensor<1x4800x1xf32>
-    %2353 = stablehlo.broadcast_in_dim %2348, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2354 = stablehlo.add %2353, %893 : tensor<1x4800x1xf32>
-    %2355 = stablehlo.rsqrt %2354 : tensor<1x4800x1xf32>
-    %2356 = stablehlo.broadcast_in_dim %2334, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2357 = stablehlo.broadcast_in_dim %2352, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2358 = stablehlo.subtract %2356, %2357 : tensor<1x4800x128xf32>
-    %2359 = stablehlo.broadcast_in_dim %2358, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2360 = stablehlo.broadcast_in_dim %2355, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2361 = stablehlo.multiply %2359, %2360 : tensor<1x4800x128xf32>
-    %2362 = stablehlo.convert %arg105 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2363 = stablehlo.broadcast_in_dim %2361, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2364 = stablehlo.broadcast_in_dim %2362, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2365 = stablehlo.multiply %2363, %2364 : tensor<1x4800x128xf32>
-    %2366 = stablehlo.convert %arg106 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2367 = stablehlo.broadcast_in_dim %2365, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2368 = stablehlo.broadcast_in_dim %2366, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2369 = stablehlo.add %2367, %2368 : tensor<1x4800x128xf32>
-    %2370 = stablehlo.convert %2369 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2371 = stablehlo.reshape %2370 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2372 = stablehlo.convert %2371 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2373 = stablehlo.dot_general %2372, %arg588, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %2374 = stablehlo.broadcast_in_dim %2373, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2375 = stablehlo.multiply %2374, %1113 : tensor<4800x512xf32>
-    %2376 = stablehlo.broadcast_in_dim %2375, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2377 = stablehlo.broadcast_in_dim %arg589, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %2378 = stablehlo.add %2376, %2377 : tensor<4800x512xf32>
-    %2379 = stablehlo.convert %2378 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %2380 = stablehlo.reshape %2379 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %2381 = stablehlo.transpose %2380, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %2382 = stablehlo.reshape %2381 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %2383 = stablehlo.convolution(%2382, %arg107) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %2384 = stablehlo.reshape %arg108 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %2385 = stablehlo.broadcast_in_dim %2383, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %2386 = stablehlo.broadcast_in_dim %2384, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %2387 = stablehlo.add %2385, %2386 : tensor<1x512x60x80xbf16>
-    %2388 = stablehlo.reshape %2387 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %2389 = stablehlo.transpose %2388, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %2390 = stablehlo.multiply %2389, %cst_23 : tensor<1x4800x512xbf16>
-    %2391 = stablehlo.multiply %2389, %1130 : tensor<1x4800x512xbf16>
-    %2392 = stablehlo.convert %2391 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %2393 = stablehlo.clamp %cst_24, %2392, %cst_25 : tensor<1x4800x512xf32>
-    %2394 = stablehlo.multiply %2393, %2393 : tensor<1x4800x512xf32>
-    %2395 = stablehlo.multiply %cst_26, %2394 : tensor<1x4800x512xf32>
-    %2396 = stablehlo.add %2395, %cst_27 : tensor<1x4800x512xf32>
-    %2397 = stablehlo.multiply %2396, %2394 : tensor<1x4800x512xf32>
-    %2398 = stablehlo.add %2397, %cst_28 : tensor<1x4800x512xf32>
-    %2399 = stablehlo.multiply %2398, %2394 : tensor<1x4800x512xf32>
-    %2400 = stablehlo.add %2399, %cst_29 : tensor<1x4800x512xf32>
-    %2401 = stablehlo.multiply %2400, %2394 : tensor<1x4800x512xf32>
-    %2402 = stablehlo.add %2401, %cst_30 : tensor<1x4800x512xf32>
-    %2403 = stablehlo.multiply %2402, %2394 : tensor<1x4800x512xf32>
-    %2404 = stablehlo.add %2403, %cst_31 : tensor<1x4800x512xf32>
-    %2405 = stablehlo.multiply %2404, %2394 : tensor<1x4800x512xf32>
-    %2406 = stablehlo.add %2405, %cst_32 : tensor<1x4800x512xf32>
-    %2407 = stablehlo.multiply %cst_33, %2394 : tensor<1x4800x512xf32>
-    %2408 = stablehlo.add %2407, %cst_34 : tensor<1x4800x512xf32>
-    %2409 = stablehlo.multiply %2408, %2394 : tensor<1x4800x512xf32>
-    %2410 = stablehlo.add %2409, %cst_35 : tensor<1x4800x512xf32>
-    %2411 = stablehlo.multiply %2410, %2394 : tensor<1x4800x512xf32>
-    %2412 = stablehlo.add %2411, %cst_36 : tensor<1x4800x512xf32>
-    %2413 = stablehlo.multiply %2412, %2394 : tensor<1x4800x512xf32>
-    %2414 = stablehlo.add %2413, %cst_37 : tensor<1x4800x512xf32>
-    %2415 = stablehlo.multiply %2393, %2406 : tensor<1x4800x512xf32>
-    %2416 = stablehlo.divide %2415, %2414 : tensor<1x4800x512xf32>
-    %2417 = stablehlo.clamp %cst_38, %2416, %cst_39 : tensor<1x4800x512xf32>
-    %2418 = stablehlo.convert %2417 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %2419 = stablehlo.add %2418, %cst_21 : tensor<1x4800x512xbf16>
-    %2420 = stablehlo.multiply %2419, %2390 : tensor<1x4800x512xbf16>
-    %2421 = stablehlo.reshape %2420 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %2422 = stablehlo.dot_general %2421, %arg590, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %2423 = stablehlo.reshape %2422 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2424 = stablehlo.broadcast_in_dim %2423, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2425 = stablehlo.broadcast_in_dim %arg109, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %2426 = stablehlo.add %2424, %2425 : tensor<1x4800x128xbf16>
-    %2427 = stablehlo.reshape %2426 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2428 = stablehlo.reshape %2427 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2429 = stablehlo.add %2428, %2333 : tensor<1x4800x128xbf16>
-    %2430 = stablehlo.convert %2429 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2431 = stablehlo.convert %2430 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2432 = stablehlo.reduce(%2431 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2433 = stablehlo.reshape %2432 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2434 = stablehlo.broadcast_in_dim %2433, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2435 = stablehlo.divide %2434, %874 : tensor<1x4800x1xf64>
-    %2436 = stablehlo.broadcast_in_dim %2431, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2437 = stablehlo.broadcast_in_dim %2435, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2438 = stablehlo.subtract %2436, %2437 : tensor<1x4800x128xf64>
-    %2439 = stablehlo.multiply %2438, %2438 : tensor<1x4800x128xf64>
-    %2440 = stablehlo.reduce(%2439 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2441 = stablehlo.reshape %2440 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2442 = stablehlo.broadcast_in_dim %2441, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2443 = stablehlo.divide %2442, %874 : tensor<1x4800x1xf64>
-    %2444 = stablehlo.convert %2443 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2445 = stablehlo.reduce(%2430 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2446 = stablehlo.reshape %2445 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2447 = stablehlo.broadcast_in_dim %2446, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2448 = stablehlo.divide %2447, %890 : tensor<1x4800x1xf32>
-    %2449 = stablehlo.broadcast_in_dim %2444, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2450 = stablehlo.add %2449, %893 : tensor<1x4800x1xf32>
-    %2451 = stablehlo.rsqrt %2450 : tensor<1x4800x1xf32>
-    %2452 = stablehlo.broadcast_in_dim %2430, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2453 = stablehlo.broadcast_in_dim %2448, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2454 = stablehlo.subtract %2452, %2453 : tensor<1x4800x128xf32>
-    %2455 = stablehlo.broadcast_in_dim %2454, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2456 = stablehlo.broadcast_in_dim %2451, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2457 = stablehlo.multiply %2455, %2456 : tensor<1x4800x128xf32>
-    %2458 = stablehlo.convert %arg110 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2459 = stablehlo.broadcast_in_dim %2457, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2460 = stablehlo.broadcast_in_dim %2458, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2461 = stablehlo.multiply %2459, %2460 : tensor<1x4800x128xf32>
-    %2462 = stablehlo.convert %arg111 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2463 = stablehlo.broadcast_in_dim %2461, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2464 = stablehlo.broadcast_in_dim %2462, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2465 = stablehlo.add %2463, %2464 : tensor<1x4800x128xf32>
-    %2466 = stablehlo.convert %2465 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2467 = stablehlo.reshape %2466 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2468 = stablehlo.convert %2467 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2469 = stablehlo.dot_general %2468, %arg591, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %2470 = stablehlo.broadcast_in_dim %2469, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2471 = stablehlo.multiply %2470, %952 : tensor<4800x128xf32>
-    %2472 = stablehlo.broadcast_in_dim %2471, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2473 = stablehlo.broadcast_in_dim %arg592, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %2474 = stablehlo.add %2472, %2473 : tensor<4800x128xf32>
-    %2475 = stablehlo.convert %2474 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %2476 = stablehlo.reshape %2475 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2477 = stablehlo.reshape %2476 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %2478 = stablehlo.transpose %2477, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %2479 = stablehlo.transpose %2466, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %2480 = stablehlo.reshape %2479 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %2481 = stablehlo.convolution(%2480, %arg112) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %2482 = stablehlo.reshape %arg113 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %2483 = stablehlo.broadcast_in_dim %2481, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %2484 = stablehlo.broadcast_in_dim %2482, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %2485 = stablehlo.add %2483, %2484 : tensor<1x128x15x20xbf16>
-    %2486 = stablehlo.reshape %2485 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %2487 = stablehlo.transpose %2486, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %2488 = stablehlo.convert %2487 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %2489 = stablehlo.convert %2488 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %2490 = stablehlo.reduce(%2489 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %2491 = stablehlo.reshape %2490 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %2492 = stablehlo.broadcast_in_dim %2491, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %2493 = stablehlo.divide %2492, %975 : tensor<1x300x1xf64>
-    %2494 = stablehlo.broadcast_in_dim %2489, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %2495 = stablehlo.broadcast_in_dim %2493, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %2496 = stablehlo.subtract %2494, %2495 : tensor<1x300x128xf64>
-    %2497 = stablehlo.multiply %2496, %2496 : tensor<1x300x128xf64>
-    %2498 = stablehlo.reduce(%2497 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %2499 = stablehlo.reshape %2498 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %2500 = stablehlo.broadcast_in_dim %2499, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %2501 = stablehlo.divide %2500, %975 : tensor<1x300x1xf64>
-    %2502 = stablehlo.convert %2501 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %2503 = stablehlo.reduce(%2488 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %2504 = stablehlo.reshape %2503 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %2505 = stablehlo.broadcast_in_dim %2504, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2506 = stablehlo.divide %2505, %989 : tensor<1x300x1xf32>
-    %2507 = stablehlo.broadcast_in_dim %2502, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2508 = stablehlo.add %2507, %136 : tensor<1x300x1xf32>
-    %2509 = stablehlo.rsqrt %2508 : tensor<1x300x1xf32>
-    %2510 = stablehlo.broadcast_in_dim %2488, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2511 = stablehlo.broadcast_in_dim %2506, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2512 = stablehlo.subtract %2510, %2511 : tensor<1x300x128xf32>
-    %2513 = stablehlo.broadcast_in_dim %2512, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2514 = stablehlo.broadcast_in_dim %2509, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2515 = stablehlo.multiply %2513, %2514 : tensor<1x300x128xf32>
-    %2516 = stablehlo.convert %arg114 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2517 = stablehlo.broadcast_in_dim %2515, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2518 = stablehlo.broadcast_in_dim %2516, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2519 = stablehlo.multiply %2517, %2518 : tensor<1x300x128xf32>
-    %2520 = stablehlo.convert %arg115 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2521 = stablehlo.broadcast_in_dim %2519, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2522 = stablehlo.broadcast_in_dim %2520, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2523 = stablehlo.add %2521, %2522 : tensor<1x300x128xf32>
-    %2524 = stablehlo.convert %2523 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %2525 = stablehlo.reshape %2524 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %2526 = stablehlo.convert %2525 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %2527 = stablehlo.dot_general %2526, %arg593, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2528 = stablehlo.broadcast_in_dim %2527, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2529 = stablehlo.multiply %2528, %1013 : tensor<300x128xf32>
-    %2530 = stablehlo.broadcast_in_dim %2529, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2531 = stablehlo.broadcast_in_dim %arg594, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2532 = stablehlo.add %2530, %2531 : tensor<300x128xf32>
-    %2533 = stablehlo.convert %2532 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2534 = stablehlo.reshape %2533 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2535 = stablehlo.reshape %2534 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2536 = stablehlo.transpose %2535, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2537 = stablehlo.dot_general %2526, %arg595, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2538 = stablehlo.broadcast_in_dim %2537, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2539 = stablehlo.multiply %2538, %1013 : tensor<300x128xf32>
-    %2540 = stablehlo.broadcast_in_dim %2539, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2541 = stablehlo.broadcast_in_dim %arg596, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2542 = stablehlo.add %2540, %2541 : tensor<300x128xf32>
-    %2543 = stablehlo.convert %2542 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2544 = stablehlo.reshape %2543 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2545 = stablehlo.reshape %2544 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2546 = stablehlo.transpose %2545, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2547 = stablehlo.transpose %2536, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %2548 = stablehlo.reshape %2478 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2549 = stablehlo.reshape %2547 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2550 = stablehlo.broadcast_in_dim %2549, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2551 = stablehlo.dot_general %2548, %2550, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2552 = stablehlo.reshape %2551 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2553 = stablehlo.broadcast_in_dim %2552, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2554 = stablehlo.divide %2553, %1039 : tensor<1x2x4800x300xbf16>
-    %2555 = stablehlo.convert %2554 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %2556 = stablehlo.reduce(%2555 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2557 = stablehlo.reshape %2556 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2558 = stablehlo.broadcast_in_dim %2555, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2559 = stablehlo.broadcast_in_dim %2557, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2560 = stablehlo.subtract %2558, %2559 : tensor<1x2x4800x300xf32>
-    %2561 = stablehlo.exponential %2560 : tensor<1x2x4800x300xf32>
-    %2562 = stablehlo.reduce(%2561 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2563 = stablehlo.reshape %2562 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2564 = stablehlo.broadcast_in_dim %2561, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2565 = stablehlo.broadcast_in_dim %2563, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2566 = stablehlo.divide %2564, %2565 : tensor<1x2x4800x300xf32>
-    %2567 = stablehlo.convert %2566 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %2568 = stablehlo.reshape %2567 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2569 = stablehlo.reshape %2546 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2570 = stablehlo.broadcast_in_dim %2569, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2571 = stablehlo.dot_general %2568, %2570, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2572 = stablehlo.reshape %2571 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %2573 = stablehlo.transpose %2572, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %2574 = stablehlo.reshape %2573 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %2575 = stablehlo.reshape %2574 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2576 = stablehlo.convert %2575 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2577 = stablehlo.dot_general %2576, %arg597, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %2578 = stablehlo.broadcast_in_dim %2577, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2579 = stablehlo.multiply %2578, %952 : tensor<4800x128xf32>
-    %2580 = stablehlo.broadcast_in_dim %2579, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2581 = stablehlo.broadcast_in_dim %arg598, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %2582 = stablehlo.add %2580, %2581 : tensor<4800x128xf32>
-    %2583 = stablehlo.convert %2582 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %2584 = stablehlo.reshape %2583 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2585 = stablehlo.add %2584, %2429 : tensor<1x4800x128xbf16>
-    %2586 = stablehlo.convert %2585 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2587 = stablehlo.convert %2586 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2588 = stablehlo.reduce(%2587 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2589 = stablehlo.reshape %2588 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2590 = stablehlo.broadcast_in_dim %2589, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2591 = stablehlo.divide %2590, %874 : tensor<1x4800x1xf64>
-    %2592 = stablehlo.broadcast_in_dim %2587, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2593 = stablehlo.broadcast_in_dim %2591, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2594 = stablehlo.subtract %2592, %2593 : tensor<1x4800x128xf64>
-    %2595 = stablehlo.multiply %2594, %2594 : tensor<1x4800x128xf64>
-    %2596 = stablehlo.reduce(%2595 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2597 = stablehlo.reshape %2596 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2598 = stablehlo.broadcast_in_dim %2597, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2599 = stablehlo.divide %2598, %874 : tensor<1x4800x1xf64>
-    %2600 = stablehlo.convert %2599 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2601 = stablehlo.reduce(%2586 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2602 = stablehlo.reshape %2601 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2603 = stablehlo.broadcast_in_dim %2602, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2604 = stablehlo.divide %2603, %890 : tensor<1x4800x1xf32>
-    %2605 = stablehlo.broadcast_in_dim %2600, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2606 = stablehlo.add %2605, %893 : tensor<1x4800x1xf32>
-    %2607 = stablehlo.rsqrt %2606 : tensor<1x4800x1xf32>
-    %2608 = stablehlo.broadcast_in_dim %2586, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2609 = stablehlo.broadcast_in_dim %2604, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2610 = stablehlo.subtract %2608, %2609 : tensor<1x4800x128xf32>
-    %2611 = stablehlo.broadcast_in_dim %2610, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2612 = stablehlo.broadcast_in_dim %2607, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2613 = stablehlo.multiply %2611, %2612 : tensor<1x4800x128xf32>
-    %2614 = stablehlo.convert %arg116 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2615 = stablehlo.broadcast_in_dim %2613, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2616 = stablehlo.broadcast_in_dim %2614, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2617 = stablehlo.multiply %2615, %2616 : tensor<1x4800x128xf32>
-    %2618 = stablehlo.convert %arg117 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2619 = stablehlo.broadcast_in_dim %2617, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2620 = stablehlo.broadcast_in_dim %2618, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2621 = stablehlo.add %2619, %2620 : tensor<1x4800x128xf32>
-    %2622 = stablehlo.convert %2621 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2623 = stablehlo.reshape %2622 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2624 = stablehlo.convert %2623 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2625 = stablehlo.dot_general %2624, %arg599, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %2626 = stablehlo.broadcast_in_dim %2625, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2627 = stablehlo.multiply %2626, %1113 : tensor<4800x512xf32>
-    %2628 = stablehlo.broadcast_in_dim %2627, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2629 = stablehlo.broadcast_in_dim %arg600, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %2630 = stablehlo.add %2628, %2629 : tensor<4800x512xf32>
-    %2631 = stablehlo.convert %2630 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %2632 = stablehlo.reshape %2631 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %2633 = stablehlo.transpose %2632, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %2634 = stablehlo.reshape %2633 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %2635 = stablehlo.convolution(%2634, %arg118) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %2636 = stablehlo.reshape %arg119 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %2637 = stablehlo.broadcast_in_dim %2635, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %2638 = stablehlo.broadcast_in_dim %2636, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %2639 = stablehlo.add %2637, %2638 : tensor<1x512x60x80xbf16>
-    %2640 = stablehlo.reshape %2639 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %2641 = stablehlo.transpose %2640, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %2642 = stablehlo.multiply %2641, %cst_23 : tensor<1x4800x512xbf16>
-    %2643 = stablehlo.multiply %2641, %1130 : tensor<1x4800x512xbf16>
-    %2644 = stablehlo.convert %2643 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %2645 = stablehlo.clamp %cst_24, %2644, %cst_25 : tensor<1x4800x512xf32>
-    %2646 = stablehlo.multiply %2645, %2645 : tensor<1x4800x512xf32>
-    %2647 = stablehlo.multiply %cst_26, %2646 : tensor<1x4800x512xf32>
-    %2648 = stablehlo.add %2647, %cst_27 : tensor<1x4800x512xf32>
-    %2649 = stablehlo.multiply %2648, %2646 : tensor<1x4800x512xf32>
-    %2650 = stablehlo.add %2649, %cst_28 : tensor<1x4800x512xf32>
-    %2651 = stablehlo.multiply %2650, %2646 : tensor<1x4800x512xf32>
-    %2652 = stablehlo.add %2651, %cst_29 : tensor<1x4800x512xf32>
-    %2653 = stablehlo.multiply %2652, %2646 : tensor<1x4800x512xf32>
-    %2654 = stablehlo.add %2653, %cst_30 : tensor<1x4800x512xf32>
-    %2655 = stablehlo.multiply %2654, %2646 : tensor<1x4800x512xf32>
-    %2656 = stablehlo.add %2655, %cst_31 : tensor<1x4800x512xf32>
-    %2657 = stablehlo.multiply %2656, %2646 : tensor<1x4800x512xf32>
-    %2658 = stablehlo.add %2657, %cst_32 : tensor<1x4800x512xf32>
-    %2659 = stablehlo.multiply %cst_33, %2646 : tensor<1x4800x512xf32>
-    %2660 = stablehlo.add %2659, %cst_34 : tensor<1x4800x512xf32>
-    %2661 = stablehlo.multiply %2660, %2646 : tensor<1x4800x512xf32>
-    %2662 = stablehlo.add %2661, %cst_35 : tensor<1x4800x512xf32>
-    %2663 = stablehlo.multiply %2662, %2646 : tensor<1x4800x512xf32>
-    %2664 = stablehlo.add %2663, %cst_36 : tensor<1x4800x512xf32>
-    %2665 = stablehlo.multiply %2664, %2646 : tensor<1x4800x512xf32>
-    %2666 = stablehlo.add %2665, %cst_37 : tensor<1x4800x512xf32>
-    %2667 = stablehlo.multiply %2645, %2658 : tensor<1x4800x512xf32>
-    %2668 = stablehlo.divide %2667, %2666 : tensor<1x4800x512xf32>
-    %2669 = stablehlo.clamp %cst_38, %2668, %cst_39 : tensor<1x4800x512xf32>
-    %2670 = stablehlo.convert %2669 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %2671 = stablehlo.add %2670, %cst_21 : tensor<1x4800x512xbf16>
-    %2672 = stablehlo.multiply %2671, %2642 : tensor<1x4800x512xbf16>
-    %2673 = stablehlo.reshape %2672 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %2674 = stablehlo.dot_general %2673, %arg601, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %2675 = stablehlo.reshape %2674 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2676 = stablehlo.broadcast_in_dim %2675, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2677 = stablehlo.broadcast_in_dim %arg120, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %2678 = stablehlo.add %2676, %2677 : tensor<1x4800x128xbf16>
-    %2679 = stablehlo.reshape %2678 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2680 = stablehlo.reshape %2679 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2681 = stablehlo.add %2680, %2585 : tensor<1x4800x128xbf16>
-    %2682 = stablehlo.convert %2681 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2683 = stablehlo.convert %2682 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2684 = stablehlo.reduce(%2683 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2685 = stablehlo.reshape %2684 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2686 = stablehlo.broadcast_in_dim %2685, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2687 = stablehlo.divide %2686, %874 : tensor<1x4800x1xf64>
-    %2688 = stablehlo.broadcast_in_dim %2683, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2689 = stablehlo.broadcast_in_dim %2687, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2690 = stablehlo.subtract %2688, %2689 : tensor<1x4800x128xf64>
-    %2691 = stablehlo.multiply %2690, %2690 : tensor<1x4800x128xf64>
-    %2692 = stablehlo.reduce(%2691 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2693 = stablehlo.reshape %2692 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2694 = stablehlo.broadcast_in_dim %2693, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2695 = stablehlo.divide %2694, %874 : tensor<1x4800x1xf64>
-    %2696 = stablehlo.convert %2695 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2697 = stablehlo.reduce(%2682 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2698 = stablehlo.reshape %2697 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2699 = stablehlo.broadcast_in_dim %2698, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2700 = stablehlo.divide %2699, %890 : tensor<1x4800x1xf32>
-    %2701 = stablehlo.broadcast_in_dim %2696, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2702 = stablehlo.add %2701, %893 : tensor<1x4800x1xf32>
-    %2703 = stablehlo.rsqrt %2702 : tensor<1x4800x1xf32>
-    %2704 = stablehlo.broadcast_in_dim %2682, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2705 = stablehlo.broadcast_in_dim %2700, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2706 = stablehlo.subtract %2704, %2705 : tensor<1x4800x128xf32>
-    %2707 = stablehlo.broadcast_in_dim %2706, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2708 = stablehlo.broadcast_in_dim %2703, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2709 = stablehlo.multiply %2707, %2708 : tensor<1x4800x128xf32>
-    %2710 = stablehlo.convert %arg121 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2711 = stablehlo.broadcast_in_dim %2709, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2712 = stablehlo.broadcast_in_dim %2710, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2713 = stablehlo.multiply %2711, %2712 : tensor<1x4800x128xf32>
-    %2714 = stablehlo.convert %arg122 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2715 = stablehlo.broadcast_in_dim %2713, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2716 = stablehlo.broadcast_in_dim %2714, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2717 = stablehlo.add %2715, %2716 : tensor<1x4800x128xf32>
-    %2718 = stablehlo.convert %2717 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2719 = stablehlo.reshape %2718 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2720 = stablehlo.convert %2719 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2721 = stablehlo.dot_general %2720, %arg602, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %2722 = stablehlo.broadcast_in_dim %2721, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2723 = stablehlo.multiply %2722, %952 : tensor<4800x128xf32>
-    %2724 = stablehlo.broadcast_in_dim %2723, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2725 = stablehlo.broadcast_in_dim %arg603, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %2726 = stablehlo.add %2724, %2725 : tensor<4800x128xf32>
-    %2727 = stablehlo.convert %2726 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %2728 = stablehlo.reshape %2727 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2729 = stablehlo.reshape %2728 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x2x64xbf16>
-    %2730 = stablehlo.transpose %2729, dims = [0, 2, 1, 3] : (tensor<1x4800x2x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %2731 = stablehlo.transpose %2718, dims = [0, 2, 1] : (tensor<1x4800x128xbf16>) -> tensor<1x128x4800xbf16>
-    %2732 = stablehlo.reshape %2731 : (tensor<1x128x4800xbf16>) -> tensor<1x128x60x80xbf16>
-    %2733 = stablehlo.convolution(%2732, %arg123) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<128x128x4x4xbf16>) -> tensor<1x128x15x20xbf16>
-    %2734 = stablehlo.reshape %arg124 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %2735 = stablehlo.broadcast_in_dim %2733, dims = [0, 1, 2, 3] : (tensor<1x128x15x20xbf16>) -> tensor<1x128x15x20xbf16>
-    %2736 = stablehlo.broadcast_in_dim %2734, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x15x20xbf16>
-    %2737 = stablehlo.add %2735, %2736 : tensor<1x128x15x20xbf16>
-    %2738 = stablehlo.reshape %2737 : (tensor<1x128x15x20xbf16>) -> tensor<1x128x300xbf16>
-    %2739 = stablehlo.transpose %2738, dims = [0, 2, 1] : (tensor<1x128x300xbf16>) -> tensor<1x300x128xbf16>
-    %2740 = stablehlo.convert %2739 : (tensor<1x300x128xbf16>) -> tensor<1x300x128xf32>
-    %2741 = stablehlo.convert %2740 : (tensor<1x300x128xf32>) -> tensor<1x300x128xf64>
-    %2742 = stablehlo.reduce(%2741 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %2743 = stablehlo.reshape %2742 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %2744 = stablehlo.broadcast_in_dim %2743, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %2745 = stablehlo.divide %2744, %975 : tensor<1x300x1xf64>
-    %2746 = stablehlo.broadcast_in_dim %2741, dims = [0, 1, 2] : (tensor<1x300x128xf64>) -> tensor<1x300x128xf64>
-    %2747 = stablehlo.broadcast_in_dim %2745, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x128xf64>
-    %2748 = stablehlo.subtract %2746, %2747 : tensor<1x300x128xf64>
-    %2749 = stablehlo.multiply %2748, %2748 : tensor<1x300x128xf64>
-    %2750 = stablehlo.reduce(%2749 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %2751 = stablehlo.reshape %2750 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %2752 = stablehlo.broadcast_in_dim %2751, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %2753 = stablehlo.divide %2752, %975 : tensor<1x300x1xf64>
-    %2754 = stablehlo.convert %2753 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %2755 = stablehlo.reduce(%2740 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x128xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %2756 = stablehlo.reshape %2755 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %2757 = stablehlo.broadcast_in_dim %2756, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2758 = stablehlo.divide %2757, %989 : tensor<1x300x1xf32>
-    %2759 = stablehlo.broadcast_in_dim %2754, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %2760 = stablehlo.add %2759, %136 : tensor<1x300x1xf32>
-    %2761 = stablehlo.rsqrt %2760 : tensor<1x300x1xf32>
-    %2762 = stablehlo.broadcast_in_dim %2740, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2763 = stablehlo.broadcast_in_dim %2758, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2764 = stablehlo.subtract %2762, %2763 : tensor<1x300x128xf32>
-    %2765 = stablehlo.broadcast_in_dim %2764, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2766 = stablehlo.broadcast_in_dim %2761, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x128xf32>
-    %2767 = stablehlo.multiply %2765, %2766 : tensor<1x300x128xf32>
-    %2768 = stablehlo.convert %arg125 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2769 = stablehlo.broadcast_in_dim %2767, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2770 = stablehlo.broadcast_in_dim %2768, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2771 = stablehlo.multiply %2769, %2770 : tensor<1x300x128xf32>
-    %2772 = stablehlo.convert %arg126 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2773 = stablehlo.broadcast_in_dim %2771, dims = [0, 1, 2] : (tensor<1x300x128xf32>) -> tensor<1x300x128xf32>
-    %2774 = stablehlo.broadcast_in_dim %2772, dims = [2] : (tensor<128xf32>) -> tensor<1x300x128xf32>
-    %2775 = stablehlo.add %2773, %2774 : tensor<1x300x128xf32>
-    %2776 = stablehlo.convert %2775 : (tensor<1x300x128xf32>) -> tensor<1x300x128xbf16>
-    %2777 = stablehlo.reshape %2776 : (tensor<1x300x128xbf16>) -> tensor<300x128xbf16>
-    %2778 = stablehlo.convert %2777 : (tensor<300x128xbf16>) -> tensor<300x128xf32>
-    %2779 = stablehlo.dot_general %2778, %arg604, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2780 = stablehlo.broadcast_in_dim %2779, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2781 = stablehlo.multiply %2780, %1013 : tensor<300x128xf32>
-    %2782 = stablehlo.broadcast_in_dim %2781, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2783 = stablehlo.broadcast_in_dim %arg605, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2784 = stablehlo.add %2782, %2783 : tensor<300x128xf32>
-    %2785 = stablehlo.convert %2784 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2786 = stablehlo.reshape %2785 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2787 = stablehlo.reshape %2786 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2788 = stablehlo.transpose %2787, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2789 = stablehlo.dot_general %2778, %arg606, contracting_dims = [1] x [0] : (tensor<300x128xf32>, tensor<128x128xf32>) -> tensor<300x128xf32>
-    %2790 = stablehlo.broadcast_in_dim %2789, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2791 = stablehlo.multiply %2790, %1013 : tensor<300x128xf32>
-    %2792 = stablehlo.broadcast_in_dim %2791, dims = [0, 1] : (tensor<300x128xf32>) -> tensor<300x128xf32>
-    %2793 = stablehlo.broadcast_in_dim %arg607, dims = [1] : (tensor<128xf32>) -> tensor<300x128xf32>
-    %2794 = stablehlo.add %2792, %2793 : tensor<300x128xf32>
-    %2795 = stablehlo.convert %2794 : (tensor<300x128xf32>) -> tensor<300x128xbf16>
-    %2796 = stablehlo.reshape %2795 : (tensor<300x128xbf16>) -> tensor<1x300x128xbf16>
-    %2797 = stablehlo.reshape %2796 : (tensor<1x300x128xbf16>) -> tensor<1x300x2x64xbf16>
-    %2798 = stablehlo.transpose %2797, dims = [0, 2, 1, 3] : (tensor<1x300x2x64xbf16>) -> tensor<1x2x300x64xbf16>
-    %2799 = stablehlo.transpose %2788, dims = [0, 1, 3, 2] : (tensor<1x2x300x64xbf16>) -> tensor<1x2x64x300xbf16>
-    %2800 = stablehlo.reshape %2730 : (tensor<1x2x4800x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2801 = stablehlo.reshape %2799 : (tensor<1x2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2802 = stablehlo.broadcast_in_dim %2801, dims = [0, 1, 2] : (tensor<2x64x300xbf16>) -> tensor<2x64x300xbf16>
-    %2803 = stablehlo.dot_general %2800, %2802, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x64xbf16>, tensor<2x64x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2804 = stablehlo.reshape %2803 : (tensor<2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2805 = stablehlo.broadcast_in_dim %2804, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xbf16>
-    %2806 = stablehlo.divide %2805, %1039 : tensor<1x2x4800x300xbf16>
-    %2807 = stablehlo.convert %2806 : (tensor<1x2x4800x300xbf16>) -> tensor<1x2x4800x300xf32>
-    %2808 = stablehlo.reduce(%2807 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2809 = stablehlo.reshape %2808 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2810 = stablehlo.broadcast_in_dim %2807, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2811 = stablehlo.broadcast_in_dim %2809, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2812 = stablehlo.subtract %2810, %2811 : tensor<1x2x4800x300xf32>
-    %2813 = stablehlo.exponential %2812 : tensor<1x2x4800x300xf32>
-    %2814 = stablehlo.reduce(%2813 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4800x300xf32>, tensor<f32>) -> tensor<1x2x4800xf32>
-    %2815 = stablehlo.reshape %2814 : (tensor<1x2x4800xf32>) -> tensor<1x2x4800x1xf32>
-    %2816 = stablehlo.broadcast_in_dim %2813, dims = [0, 1, 2, 3] : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xf32>
-    %2817 = stablehlo.broadcast_in_dim %2815, dims = [0, 1, 2, 3] : (tensor<1x2x4800x1xf32>) -> tensor<1x2x4800x300xf32>
-    %2818 = stablehlo.divide %2816, %2817 : tensor<1x2x4800x300xf32>
-    %2819 = stablehlo.convert %2818 : (tensor<1x2x4800x300xf32>) -> tensor<1x2x4800x300xbf16>
-    %2820 = stablehlo.reshape %2819 : (tensor<1x2x4800x300xbf16>) -> tensor<2x4800x300xbf16>
-    %2821 = stablehlo.reshape %2798 : (tensor<1x2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2822 = stablehlo.broadcast_in_dim %2821, dims = [0, 1, 2] : (tensor<2x300x64xbf16>) -> tensor<2x300x64xbf16>
-    %2823 = stablehlo.dot_general %2820, %2822, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4800x300xbf16>, tensor<2x300x64xbf16>) -> tensor<2x4800x64xbf16>
-    %2824 = stablehlo.reshape %2823 : (tensor<2x4800x64xbf16>) -> tensor<1x2x4800x64xbf16>
-    %2825 = stablehlo.transpose %2824, dims = [0, 2, 1, 3] : (tensor<1x2x4800x64xbf16>) -> tensor<1x4800x2x64xbf16>
-    %2826 = stablehlo.reshape %2825 : (tensor<1x4800x2x64xbf16>) -> tensor<1x4800x128xbf16>
-    %2827 = stablehlo.reshape %2826 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2828 = stablehlo.convert %2827 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2829 = stablehlo.dot_general %2828, %arg608, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x128xf32>) -> tensor<4800x128xf32>
-    %2830 = stablehlo.broadcast_in_dim %2829, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2831 = stablehlo.multiply %2830, %952 : tensor<4800x128xf32>
-    %2832 = stablehlo.broadcast_in_dim %2831, dims = [0, 1] : (tensor<4800x128xf32>) -> tensor<4800x128xf32>
-    %2833 = stablehlo.broadcast_in_dim %arg609, dims = [1] : (tensor<128xf32>) -> tensor<4800x128xf32>
-    %2834 = stablehlo.add %2832, %2833 : tensor<4800x128xf32>
-    %2835 = stablehlo.convert %2834 : (tensor<4800x128xf32>) -> tensor<4800x128xbf16>
-    %2836 = stablehlo.reshape %2835 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2837 = stablehlo.add %2836, %2681 : tensor<1x4800x128xbf16>
-    %2838 = stablehlo.convert %2837 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2839 = stablehlo.convert %2838 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2840 = stablehlo.reduce(%2839 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2841 = stablehlo.reshape %2840 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2842 = stablehlo.broadcast_in_dim %2841, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2843 = stablehlo.divide %2842, %874 : tensor<1x4800x1xf64>
-    %2844 = stablehlo.broadcast_in_dim %2839, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2845 = stablehlo.broadcast_in_dim %2843, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2846 = stablehlo.subtract %2844, %2845 : tensor<1x4800x128xf64>
-    %2847 = stablehlo.multiply %2846, %2846 : tensor<1x4800x128xf64>
-    %2848 = stablehlo.reduce(%2847 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2849 = stablehlo.reshape %2848 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2850 = stablehlo.broadcast_in_dim %2849, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2851 = stablehlo.divide %2850, %874 : tensor<1x4800x1xf64>
-    %2852 = stablehlo.convert %2851 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2853 = stablehlo.reduce(%2838 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2854 = stablehlo.reshape %2853 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2855 = stablehlo.broadcast_in_dim %2854, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2856 = stablehlo.divide %2855, %890 : tensor<1x4800x1xf32>
-    %2857 = stablehlo.broadcast_in_dim %2852, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2858 = stablehlo.add %2857, %893 : tensor<1x4800x1xf32>
-    %2859 = stablehlo.rsqrt %2858 : tensor<1x4800x1xf32>
-    %2860 = stablehlo.broadcast_in_dim %2838, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2861 = stablehlo.broadcast_in_dim %2856, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2862 = stablehlo.subtract %2860, %2861 : tensor<1x4800x128xf32>
-    %2863 = stablehlo.broadcast_in_dim %2862, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2864 = stablehlo.broadcast_in_dim %2859, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2865 = stablehlo.multiply %2863, %2864 : tensor<1x4800x128xf32>
-    %2866 = stablehlo.convert %arg127 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2867 = stablehlo.broadcast_in_dim %2865, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2868 = stablehlo.broadcast_in_dim %2866, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2869 = stablehlo.multiply %2867, %2868 : tensor<1x4800x128xf32>
-    %2870 = stablehlo.convert %arg128 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2871 = stablehlo.broadcast_in_dim %2869, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2872 = stablehlo.broadcast_in_dim %2870, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2873 = stablehlo.add %2871, %2872 : tensor<1x4800x128xf32>
-    %2874 = stablehlo.convert %2873 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2875 = stablehlo.reshape %2874 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2876 = stablehlo.convert %2875 : (tensor<4800x128xbf16>) -> tensor<4800x128xf32>
-    %2877 = stablehlo.dot_general %2876, %arg610, contracting_dims = [1] x [0] : (tensor<4800x128xf32>, tensor<128x512xf32>) -> tensor<4800x512xf32>
-    %2878 = stablehlo.broadcast_in_dim %2877, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2879 = stablehlo.multiply %2878, %1113 : tensor<4800x512xf32>
-    %2880 = stablehlo.broadcast_in_dim %2879, dims = [0, 1] : (tensor<4800x512xf32>) -> tensor<4800x512xf32>
-    %2881 = stablehlo.broadcast_in_dim %arg611, dims = [1] : (tensor<512xf32>) -> tensor<4800x512xf32>
-    %2882 = stablehlo.add %2880, %2881 : tensor<4800x512xf32>
-    %2883 = stablehlo.convert %2882 : (tensor<4800x512xf32>) -> tensor<4800x512xbf16>
-    %2884 = stablehlo.reshape %2883 : (tensor<4800x512xbf16>) -> tensor<1x4800x512xbf16>
-    %2885 = stablehlo.transpose %2884, dims = [0, 2, 1] : (tensor<1x4800x512xbf16>) -> tensor<1x512x4800xbf16>
-    %2886 = stablehlo.reshape %2885 : (tensor<1x512x4800xbf16>) -> tensor<1x512x60x80xbf16>
-    %2887 = stablehlo.convolution(%2886, %arg129) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x60x80xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x60x80xbf16>
-    %2888 = stablehlo.reshape %arg130 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %2889 = stablehlo.broadcast_in_dim %2887, dims = [0, 1, 2, 3] : (tensor<1x512x60x80xbf16>) -> tensor<1x512x60x80xbf16>
-    %2890 = stablehlo.broadcast_in_dim %2888, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x60x80xbf16>
-    %2891 = stablehlo.add %2889, %2890 : tensor<1x512x60x80xbf16>
-    %2892 = stablehlo.reshape %2891 : (tensor<1x512x60x80xbf16>) -> tensor<1x512x4800xbf16>
-    %2893 = stablehlo.transpose %2892, dims = [0, 2, 1] : (tensor<1x512x4800xbf16>) -> tensor<1x4800x512xbf16>
-    %2894 = stablehlo.multiply %2893, %cst_23 : tensor<1x4800x512xbf16>
-    %2895 = stablehlo.multiply %2893, %1130 : tensor<1x4800x512xbf16>
-    %2896 = stablehlo.convert %2895 : (tensor<1x4800x512xbf16>) -> tensor<1x4800x512xf32>
-    %2897 = stablehlo.clamp %cst_24, %2896, %cst_25 : tensor<1x4800x512xf32>
-    %2898 = stablehlo.multiply %2897, %2897 : tensor<1x4800x512xf32>
-    %2899 = stablehlo.multiply %cst_26, %2898 : tensor<1x4800x512xf32>
-    %2900 = stablehlo.add %2899, %cst_27 : tensor<1x4800x512xf32>
-    %2901 = stablehlo.multiply %2900, %2898 : tensor<1x4800x512xf32>
-    %2902 = stablehlo.add %2901, %cst_28 : tensor<1x4800x512xf32>
-    %2903 = stablehlo.multiply %2902, %2898 : tensor<1x4800x512xf32>
-    %2904 = stablehlo.add %2903, %cst_29 : tensor<1x4800x512xf32>
-    %2905 = stablehlo.multiply %2904, %2898 : tensor<1x4800x512xf32>
-    %2906 = stablehlo.add %2905, %cst_30 : tensor<1x4800x512xf32>
-    %2907 = stablehlo.multiply %2906, %2898 : tensor<1x4800x512xf32>
-    %2908 = stablehlo.add %2907, %cst_31 : tensor<1x4800x512xf32>
-    %2909 = stablehlo.multiply %2908, %2898 : tensor<1x4800x512xf32>
-    %2910 = stablehlo.add %2909, %cst_32 : tensor<1x4800x512xf32>
-    %2911 = stablehlo.multiply %cst_33, %2898 : tensor<1x4800x512xf32>
-    %2912 = stablehlo.add %2911, %cst_34 : tensor<1x4800x512xf32>
-    %2913 = stablehlo.multiply %2912, %2898 : tensor<1x4800x512xf32>
-    %2914 = stablehlo.add %2913, %cst_35 : tensor<1x4800x512xf32>
-    %2915 = stablehlo.multiply %2914, %2898 : tensor<1x4800x512xf32>
-    %2916 = stablehlo.add %2915, %cst_36 : tensor<1x4800x512xf32>
-    %2917 = stablehlo.multiply %2916, %2898 : tensor<1x4800x512xf32>
-    %2918 = stablehlo.add %2917, %cst_37 : tensor<1x4800x512xf32>
-    %2919 = stablehlo.multiply %2897, %2910 : tensor<1x4800x512xf32>
-    %2920 = stablehlo.divide %2919, %2918 : tensor<1x4800x512xf32>
-    %2921 = stablehlo.clamp %cst_38, %2920, %cst_39 : tensor<1x4800x512xf32>
-    %2922 = stablehlo.convert %2921 : (tensor<1x4800x512xf32>) -> tensor<1x4800x512xbf16>
-    %2923 = stablehlo.add %2922, %cst_21 : tensor<1x4800x512xbf16>
-    %2924 = stablehlo.multiply %2923, %2894 : tensor<1x4800x512xbf16>
-    %2925 = stablehlo.reshape %2924 : (tensor<1x4800x512xbf16>) -> tensor<4800x512xbf16>
-    %2926 = stablehlo.dot_general %2925, %arg612, contracting_dims = [1] x [0] : (tensor<4800x512xbf16>, tensor<512x128xbf16>) -> tensor<4800x128xbf16>
-    %2927 = stablehlo.reshape %2926 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2928 = stablehlo.broadcast_in_dim %2927, dims = [0, 1, 2] : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2929 = stablehlo.broadcast_in_dim %arg131, dims = [2] : (tensor<128xbf16>) -> tensor<1x4800x128xbf16>
-    %2930 = stablehlo.add %2928, %2929 : tensor<1x4800x128xbf16>
-    %2931 = stablehlo.reshape %2930 : (tensor<1x4800x128xbf16>) -> tensor<4800x128xbf16>
-    %2932 = stablehlo.reshape %2931 : (tensor<4800x128xbf16>) -> tensor<1x4800x128xbf16>
-    %2933 = stablehlo.add %2932, %2837 : tensor<1x4800x128xbf16>
-    %2934 = stablehlo.convert %2933 : (tensor<1x4800x128xbf16>) -> tensor<1x4800x128xf32>
-    %2935 = stablehlo.convert %2934 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf64>
-    %2936 = stablehlo.reduce(%2935 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2937 = stablehlo.reshape %2936 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2938 = stablehlo.broadcast_in_dim %2937, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2939 = stablehlo.divide %2938, %874 : tensor<1x4800x1xf64>
-    %2940 = stablehlo.broadcast_in_dim %2935, dims = [0, 1, 2] : (tensor<1x4800x128xf64>) -> tensor<1x4800x128xf64>
-    %2941 = stablehlo.broadcast_in_dim %2939, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x128xf64>
-    %2942 = stablehlo.subtract %2940, %2941 : tensor<1x4800x128xf64>
-    %2943 = stablehlo.multiply %2942, %2942 : tensor<1x4800x128xf64>
-    %2944 = stablehlo.reduce(%2943 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf64>, tensor<f64>) -> tensor<1x4800xf64>
-    %2945 = stablehlo.reshape %2944 : (tensor<1x4800xf64>) -> tensor<1x4800x1xf64>
-    %2946 = stablehlo.broadcast_in_dim %2945, dims = [0, 1, 2] : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf64>
-    %2947 = stablehlo.divide %2946, %874 : tensor<1x4800x1xf64>
-    %2948 = stablehlo.convert %2947 : (tensor<1x4800x1xf64>) -> tensor<1x4800x1xf32>
-    %2949 = stablehlo.reduce(%2934 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4800x128xf32>, tensor<f32>) -> tensor<1x4800xf32>
-    %2950 = stablehlo.reshape %2949 : (tensor<1x4800xf32>) -> tensor<1x4800x1xf32>
-    %2951 = stablehlo.broadcast_in_dim %2950, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2952 = stablehlo.divide %2951, %890 : tensor<1x4800x1xf32>
-    %2953 = stablehlo.broadcast_in_dim %2948, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x1xf32>
-    %2954 = stablehlo.add %2953, %893 : tensor<1x4800x1xf32>
-    %2955 = stablehlo.rsqrt %2954 : tensor<1x4800x1xf32>
-    %2956 = stablehlo.broadcast_in_dim %2934, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2957 = stablehlo.broadcast_in_dim %2952, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2958 = stablehlo.subtract %2956, %2957 : tensor<1x4800x128xf32>
-    %2959 = stablehlo.broadcast_in_dim %2958, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2960 = stablehlo.broadcast_in_dim %2955, dims = [0, 1, 2] : (tensor<1x4800x1xf32>) -> tensor<1x4800x128xf32>
-    %2961 = stablehlo.multiply %2959, %2960 : tensor<1x4800x128xf32>
-    %2962 = stablehlo.convert %arg132 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2963 = stablehlo.broadcast_in_dim %2961, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2964 = stablehlo.broadcast_in_dim %2962, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2965 = stablehlo.multiply %2963, %2964 : tensor<1x4800x128xf32>
-    %2966 = stablehlo.convert %arg133 : (tensor<128xbf16>) -> tensor<128xf32>
-    %2967 = stablehlo.broadcast_in_dim %2965, dims = [0, 1, 2] : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xf32>
-    %2968 = stablehlo.broadcast_in_dim %2966, dims = [2] : (tensor<128xf32>) -> tensor<1x4800x128xf32>
-    %2969 = stablehlo.add %2967, %2968 : tensor<1x4800x128xf32>
-    %2970 = stablehlo.convert %2969 : (tensor<1x4800x128xf32>) -> tensor<1x4800x128xbf16>
-    %2971 = stablehlo.reshape %2970 : (tensor<1x4800x128xbf16>) -> tensor<1x60x80x128xbf16>
-    %2972 = stablehlo.transpose %2971, dims = [0, 3, 1, 2] : (tensor<1x60x80x128xbf16>) -> tensor<1x128x60x80xbf16>
-    %2973 = stablehlo.convolution(%2972, %arg134) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<320x128x3x3xbf16>) -> tensor<1x320x30x40xbf16>
-    %2974 = stablehlo.reshape %arg135 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %2975 = stablehlo.broadcast_in_dim %2973, dims = [0, 1, 2, 3] : (tensor<1x320x30x40xbf16>) -> tensor<1x320x30x40xbf16>
-    %2976 = stablehlo.broadcast_in_dim %2974, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x30x40xbf16>
-    %2977 = stablehlo.add %2975, %2976 : tensor<1x320x30x40xbf16>
-    %2978 = stablehlo.reshape %2977 : (tensor<1x320x30x40xbf16>) -> tensor<1x320x1200xbf16>
-    %2979 = stablehlo.transpose %2978, dims = [0, 2, 1] : (tensor<1x320x1200xbf16>) -> tensor<1x1200x320xbf16>
-    %2980 = stablehlo.convert %2979 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %2981 = stablehlo.convert %2980 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %2982 = stablehlo.reduce(%2981 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %2983 = stablehlo.reshape %2982 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %2984 = stablehlo.convert %cst_90 : (tensor<1xi64>) -> tensor<1xf64>
-    %2985 = stablehlo.reshape %2984 : (tensor<1xf64>) -> tensor<f64>
-    %2986 = stablehlo.broadcast_in_dim %2983, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %2987 = stablehlo.broadcast_in_dim %2985, dims = [] : (tensor<f64>) -> tensor<1x1200x1xf64>
-    %2988 = stablehlo.divide %2986, %2987 : tensor<1x1200x1xf64>
-    %2989 = stablehlo.broadcast_in_dim %2981, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %2990 = stablehlo.broadcast_in_dim %2988, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %2991 = stablehlo.subtract %2989, %2990 : tensor<1x1200x320xf64>
-    %2992 = stablehlo.multiply %2991, %2991 : tensor<1x1200x320xf64>
-    %2993 = stablehlo.reduce(%2992 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %2994 = stablehlo.reshape %2993 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %2995 = stablehlo.broadcast_in_dim %2994, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %2996 = stablehlo.divide %2995, %2987 : tensor<1x1200x1xf64>
-    %2997 = stablehlo.convert %2996 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %2998 = stablehlo.reduce(%2980 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %2999 = stablehlo.reshape %2998 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3000 = stablehlo.convert %cst_90 : (tensor<1xi64>) -> tensor<1xf32>
-    %3001 = stablehlo.reshape %3000 : (tensor<1xf32>) -> tensor<f32>
-    %3002 = stablehlo.broadcast_in_dim %2999, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3003 = stablehlo.broadcast_in_dim %3001, dims = [] : (tensor<f32>) -> tensor<1x1200x1xf32>
-    %3004 = stablehlo.divide %3002, %3003 : tensor<1x1200x1xf32>
-    %3005 = stablehlo.broadcast_in_dim %2997, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3006 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x1200x1xf32>
-    %3007 = stablehlo.add %3005, %3006 : tensor<1x1200x1xf32>
-    %3008 = stablehlo.rsqrt %3007 : tensor<1x1200x1xf32>
-    %3009 = stablehlo.broadcast_in_dim %2980, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3010 = stablehlo.broadcast_in_dim %3004, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3011 = stablehlo.subtract %3009, %3010 : tensor<1x1200x320xf32>
-    %3012 = stablehlo.broadcast_in_dim %3011, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3013 = stablehlo.broadcast_in_dim %3008, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3014 = stablehlo.multiply %3012, %3013 : tensor<1x1200x320xf32>
-    %3015 = stablehlo.convert %arg136 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3016 = stablehlo.broadcast_in_dim %3014, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3017 = stablehlo.broadcast_in_dim %3015, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3018 = stablehlo.multiply %3016, %3017 : tensor<1x1200x320xf32>
-    %3019 = stablehlo.convert %arg137 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3020 = stablehlo.broadcast_in_dim %3018, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3021 = stablehlo.broadcast_in_dim %3019, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3022 = stablehlo.add %3020, %3021 : tensor<1x1200x320xf32>
-    %3023 = stablehlo.convert %3022 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3024 = stablehlo.convert %3023 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3025 = stablehlo.convert %3024 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3026 = stablehlo.reduce(%3025 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3027 = stablehlo.reshape %3026 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3028 = stablehlo.broadcast_in_dim %3027, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3029 = stablehlo.divide %3028, %2987 : tensor<1x1200x1xf64>
-    %3030 = stablehlo.broadcast_in_dim %3025, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3031 = stablehlo.broadcast_in_dim %3029, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3032 = stablehlo.subtract %3030, %3031 : tensor<1x1200x320xf64>
-    %3033 = stablehlo.multiply %3032, %3032 : tensor<1x1200x320xf64>
-    %3034 = stablehlo.reduce(%3033 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3035 = stablehlo.reshape %3034 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3036 = stablehlo.broadcast_in_dim %3035, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3037 = stablehlo.divide %3036, %2987 : tensor<1x1200x1xf64>
-    %3038 = stablehlo.convert %3037 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3039 = stablehlo.reduce(%3024 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3040 = stablehlo.reshape %3039 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3041 = stablehlo.broadcast_in_dim %3040, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3042 = stablehlo.divide %3041, %3003 : tensor<1x1200x1xf32>
-    %3043 = stablehlo.broadcast_in_dim %3038, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3044 = stablehlo.add %3043, %3006 : tensor<1x1200x1xf32>
-    %3045 = stablehlo.rsqrt %3044 : tensor<1x1200x1xf32>
-    %3046 = stablehlo.broadcast_in_dim %3024, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3047 = stablehlo.broadcast_in_dim %3042, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3048 = stablehlo.subtract %3046, %3047 : tensor<1x1200x320xf32>
-    %3049 = stablehlo.broadcast_in_dim %3048, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3050 = stablehlo.broadcast_in_dim %3045, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3051 = stablehlo.multiply %3049, %3050 : tensor<1x1200x320xf32>
-    %3052 = stablehlo.convert %arg138 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3053 = stablehlo.broadcast_in_dim %3051, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3054 = stablehlo.broadcast_in_dim %3052, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3055 = stablehlo.multiply %3053, %3054 : tensor<1x1200x320xf32>
-    %3056 = stablehlo.convert %arg139 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3057 = stablehlo.broadcast_in_dim %3055, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3058 = stablehlo.broadcast_in_dim %3056, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3059 = stablehlo.add %3057, %3058 : tensor<1x1200x320xf32>
-    %3060 = stablehlo.convert %3059 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3061 = stablehlo.reshape %3060 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3062 = stablehlo.convert %3061 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3063 = stablehlo.dot_general %3062, %arg613, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3064 = stablehlo.broadcast_in_dim %3063, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3065 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<1200x320xf32>
-    %3066 = stablehlo.multiply %3064, %3065 : tensor<1200x320xf32>
-    %3067 = stablehlo.broadcast_in_dim %3066, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3068 = stablehlo.broadcast_in_dim %arg614, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3069 = stablehlo.add %3067, %3068 : tensor<1200x320xf32>
-    %3070 = stablehlo.convert %3069 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3071 = stablehlo.reshape %3070 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3072 = stablehlo.reshape %3071 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3073 = stablehlo.transpose %3072, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3074 = stablehlo.transpose %3060, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %3075 = stablehlo.reshape %3074 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %3076 = stablehlo.convolution(%3075, %arg140) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %3077 = stablehlo.reshape %arg141 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %3078 = stablehlo.broadcast_in_dim %3076, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %3079 = stablehlo.broadcast_in_dim %3077, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %3080 = stablehlo.add %3078, %3079 : tensor<1x320x15x20xbf16>
-    %3081 = stablehlo.reshape %3080 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %3082 = stablehlo.transpose %3081, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %3083 = stablehlo.convert %3082 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %3084 = stablehlo.convert %3083 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %3085 = stablehlo.reduce(%3084 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3086 = stablehlo.reshape %3085 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3087 = stablehlo.broadcast_in_dim %3086, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3088 = stablehlo.broadcast_in_dim %2985, dims = [] : (tensor<f64>) -> tensor<1x300x1xf64>
-    %3089 = stablehlo.divide %3087, %3088 : tensor<1x300x1xf64>
-    %3090 = stablehlo.broadcast_in_dim %3084, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %3091 = stablehlo.broadcast_in_dim %3089, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %3092 = stablehlo.subtract %3090, %3091 : tensor<1x300x320xf64>
-    %3093 = stablehlo.multiply %3092, %3092 : tensor<1x300x320xf64>
-    %3094 = stablehlo.reduce(%3093 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3095 = stablehlo.reshape %3094 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3096 = stablehlo.broadcast_in_dim %3095, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3097 = stablehlo.divide %3096, %3088 : tensor<1x300x1xf64>
-    %3098 = stablehlo.convert %3097 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %3099 = stablehlo.reduce(%3083 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %3100 = stablehlo.reshape %3099 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %3101 = stablehlo.broadcast_in_dim %3100, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3102 = stablehlo.broadcast_in_dim %3001, dims = [] : (tensor<f32>) -> tensor<1x300x1xf32>
-    %3103 = stablehlo.divide %3101, %3102 : tensor<1x300x1xf32>
-    %3104 = stablehlo.broadcast_in_dim %3098, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3105 = stablehlo.add %3104, %136 : tensor<1x300x1xf32>
-    %3106 = stablehlo.rsqrt %3105 : tensor<1x300x1xf32>
-    %3107 = stablehlo.broadcast_in_dim %3083, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3108 = stablehlo.broadcast_in_dim %3103, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3109 = stablehlo.subtract %3107, %3108 : tensor<1x300x320xf32>
-    %3110 = stablehlo.broadcast_in_dim %3109, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3111 = stablehlo.broadcast_in_dim %3106, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3112 = stablehlo.multiply %3110, %3111 : tensor<1x300x320xf32>
-    %3113 = stablehlo.convert %arg142 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3114 = stablehlo.broadcast_in_dim %3112, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3115 = stablehlo.broadcast_in_dim %3113, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3116 = stablehlo.multiply %3114, %3115 : tensor<1x300x320xf32>
-    %3117 = stablehlo.convert %arg143 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3118 = stablehlo.broadcast_in_dim %3116, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3119 = stablehlo.broadcast_in_dim %3117, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3120 = stablehlo.add %3118, %3119 : tensor<1x300x320xf32>
-    %3121 = stablehlo.convert %3120 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %3122 = stablehlo.reshape %3121 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %3123 = stablehlo.convert %3122 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %3124 = stablehlo.dot_general %3123, %arg615, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3125 = stablehlo.broadcast_in_dim %3124, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3126 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<300x320xf32>
-    %3127 = stablehlo.multiply %3125, %3126 : tensor<300x320xf32>
-    %3128 = stablehlo.broadcast_in_dim %3127, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3129 = stablehlo.broadcast_in_dim %arg616, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3130 = stablehlo.add %3128, %3129 : tensor<300x320xf32>
-    %3131 = stablehlo.convert %3130 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3132 = stablehlo.reshape %3131 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3133 = stablehlo.reshape %3132 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3134 = stablehlo.transpose %3133, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3135 = stablehlo.dot_general %3123, %arg617, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3136 = stablehlo.broadcast_in_dim %3135, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3137 = stablehlo.multiply %3136, %3126 : tensor<300x320xf32>
-    %3138 = stablehlo.broadcast_in_dim %3137, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3139 = stablehlo.broadcast_in_dim %arg618, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3140 = stablehlo.add %3138, %3139 : tensor<300x320xf32>
-    %3141 = stablehlo.convert %3140 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3142 = stablehlo.reshape %3141 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3143 = stablehlo.reshape %3142 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3144 = stablehlo.transpose %3143, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3145 = stablehlo.transpose %3134, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %3146 = stablehlo.reshape %3073 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3147 = stablehlo.reshape %3145 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3148 = stablehlo.broadcast_in_dim %3147, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3149 = stablehlo.dot_general %3146, %3148, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3150 = stablehlo.reshape %3149 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3151 = stablehlo.broadcast_in_dim %3150, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3152 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x5x1200x300xbf16>
-    %3153 = stablehlo.divide %3151, %3152 : tensor<1x5x1200x300xbf16>
-    %3154 = stablehlo.convert %3153 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %3155 = stablehlo.reduce(%3154 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3156 = stablehlo.reshape %3155 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3157 = stablehlo.broadcast_in_dim %3154, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3158 = stablehlo.broadcast_in_dim %3156, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3159 = stablehlo.subtract %3157, %3158 : tensor<1x5x1200x300xf32>
-    %3160 = stablehlo.exponential %3159 : tensor<1x5x1200x300xf32>
-    %3161 = stablehlo.reduce(%3160 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3162 = stablehlo.reshape %3161 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3163 = stablehlo.broadcast_in_dim %3160, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3164 = stablehlo.broadcast_in_dim %3162, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3165 = stablehlo.divide %3163, %3164 : tensor<1x5x1200x300xf32>
-    %3166 = stablehlo.convert %3165 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %3167 = stablehlo.reshape %3166 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3168 = stablehlo.reshape %3144 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3169 = stablehlo.broadcast_in_dim %3168, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3170 = stablehlo.dot_general %3167, %3169, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3171 = stablehlo.reshape %3170 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3172 = stablehlo.transpose %3171, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3173 = stablehlo.reshape %3172 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %3174 = stablehlo.reshape %3173 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3175 = stablehlo.convert %3174 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3176 = stablehlo.dot_general %3175, %arg619, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3177 = stablehlo.broadcast_in_dim %3176, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3178 = stablehlo.multiply %3177, %3065 : tensor<1200x320xf32>
-    %3179 = stablehlo.broadcast_in_dim %3178, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3180 = stablehlo.broadcast_in_dim %arg620, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3181 = stablehlo.add %3179, %3180 : tensor<1200x320xf32>
-    %3182 = stablehlo.convert %3181 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3183 = stablehlo.reshape %3182 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3184 = stablehlo.add %3183, %3023 : tensor<1x1200x320xbf16>
-    %3185 = stablehlo.convert %3184 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3186 = stablehlo.convert %3185 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3187 = stablehlo.reduce(%3186 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3188 = stablehlo.reshape %3187 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3189 = stablehlo.broadcast_in_dim %3188, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3190 = stablehlo.divide %3189, %2987 : tensor<1x1200x1xf64>
-    %3191 = stablehlo.broadcast_in_dim %3186, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3192 = stablehlo.broadcast_in_dim %3190, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3193 = stablehlo.subtract %3191, %3192 : tensor<1x1200x320xf64>
-    %3194 = stablehlo.multiply %3193, %3193 : tensor<1x1200x320xf64>
-    %3195 = stablehlo.reduce(%3194 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3196 = stablehlo.reshape %3195 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3197 = stablehlo.broadcast_in_dim %3196, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3198 = stablehlo.divide %3197, %2987 : tensor<1x1200x1xf64>
-    %3199 = stablehlo.convert %3198 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3200 = stablehlo.reduce(%3185 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3201 = stablehlo.reshape %3200 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3202 = stablehlo.broadcast_in_dim %3201, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3203 = stablehlo.divide %3202, %3003 : tensor<1x1200x1xf32>
-    %3204 = stablehlo.broadcast_in_dim %3199, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3205 = stablehlo.add %3204, %3006 : tensor<1x1200x1xf32>
-    %3206 = stablehlo.rsqrt %3205 : tensor<1x1200x1xf32>
-    %3207 = stablehlo.broadcast_in_dim %3185, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3208 = stablehlo.broadcast_in_dim %3203, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3209 = stablehlo.subtract %3207, %3208 : tensor<1x1200x320xf32>
-    %3210 = stablehlo.broadcast_in_dim %3209, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3211 = stablehlo.broadcast_in_dim %3206, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3212 = stablehlo.multiply %3210, %3211 : tensor<1x1200x320xf32>
-    %3213 = stablehlo.convert %arg144 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3214 = stablehlo.broadcast_in_dim %3212, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3215 = stablehlo.broadcast_in_dim %3213, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3216 = stablehlo.multiply %3214, %3215 : tensor<1x1200x320xf32>
-    %3217 = stablehlo.convert %arg145 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3218 = stablehlo.broadcast_in_dim %3216, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3219 = stablehlo.broadcast_in_dim %3217, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3220 = stablehlo.add %3218, %3219 : tensor<1x1200x320xf32>
-    %3221 = stablehlo.convert %3220 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3222 = stablehlo.reshape %3221 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3223 = stablehlo.convert %3222 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3224 = stablehlo.dot_general %3223, %arg621, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %3225 = stablehlo.broadcast_in_dim %3224, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3226 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<1200x1280xf32>
-    %3227 = stablehlo.multiply %3225, %3226 : tensor<1200x1280xf32>
-    %3228 = stablehlo.broadcast_in_dim %3227, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3229 = stablehlo.broadcast_in_dim %arg622, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %3230 = stablehlo.add %3228, %3229 : tensor<1200x1280xf32>
-    %3231 = stablehlo.convert %3230 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %3232 = stablehlo.reshape %3231 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %3233 = stablehlo.transpose %3232, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %3234 = stablehlo.reshape %3233 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3235 = stablehlo.convolution(%3234, %arg146) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3236 = stablehlo.reshape %arg147 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %3237 = stablehlo.broadcast_in_dim %3235, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3238 = stablehlo.broadcast_in_dim %3236, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3239 = stablehlo.add %3237, %3238 : tensor<1x1280x30x40xbf16>
-    %3240 = stablehlo.reshape %3239 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %3241 = stablehlo.transpose %3240, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %3242 = stablehlo.multiply %3241, %cst_42 : tensor<1x1200x1280xbf16>
-    %3243 = stablehlo.rsqrt %cst_41 : tensor<1x1200x1280xbf16>
-    %3244 = stablehlo.multiply %3241, %3243 : tensor<1x1200x1280xbf16>
-    %3245 = stablehlo.convert %3244 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %3246 = stablehlo.clamp %cst_43, %3245, %cst_44 : tensor<1x1200x1280xf32>
-    %3247 = stablehlo.multiply %3246, %3246 : tensor<1x1200x1280xf32>
-    %3248 = stablehlo.multiply %cst_45, %3247 : tensor<1x1200x1280xf32>
-    %3249 = stablehlo.add %3248, %cst_46 : tensor<1x1200x1280xf32>
-    %3250 = stablehlo.multiply %3249, %3247 : tensor<1x1200x1280xf32>
-    %3251 = stablehlo.add %3250, %cst_47 : tensor<1x1200x1280xf32>
-    %3252 = stablehlo.multiply %3251, %3247 : tensor<1x1200x1280xf32>
-    %3253 = stablehlo.add %3252, %cst_48 : tensor<1x1200x1280xf32>
-    %3254 = stablehlo.multiply %3253, %3247 : tensor<1x1200x1280xf32>
-    %3255 = stablehlo.add %3254, %cst_49 : tensor<1x1200x1280xf32>
-    %3256 = stablehlo.multiply %3255, %3247 : tensor<1x1200x1280xf32>
-    %3257 = stablehlo.add %3256, %cst_50 : tensor<1x1200x1280xf32>
-    %3258 = stablehlo.multiply %3257, %3247 : tensor<1x1200x1280xf32>
-    %3259 = stablehlo.add %3258, %cst_51 : tensor<1x1200x1280xf32>
-    %3260 = stablehlo.multiply %cst_52, %3247 : tensor<1x1200x1280xf32>
-    %3261 = stablehlo.add %3260, %cst_53 : tensor<1x1200x1280xf32>
-    %3262 = stablehlo.multiply %3261, %3247 : tensor<1x1200x1280xf32>
-    %3263 = stablehlo.add %3262, %cst_54 : tensor<1x1200x1280xf32>
-    %3264 = stablehlo.multiply %3263, %3247 : tensor<1x1200x1280xf32>
-    %3265 = stablehlo.add %3264, %cst_55 : tensor<1x1200x1280xf32>
-    %3266 = stablehlo.multiply %3265, %3247 : tensor<1x1200x1280xf32>
-    %3267 = stablehlo.add %3266, %cst_56 : tensor<1x1200x1280xf32>
-    %3268 = stablehlo.multiply %3246, %3259 : tensor<1x1200x1280xf32>
-    %3269 = stablehlo.divide %3268, %3267 : tensor<1x1200x1280xf32>
-    %3270 = stablehlo.clamp %cst_57, %3269, %cst_58 : tensor<1x1200x1280xf32>
-    %3271 = stablehlo.convert %3270 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %3272 = stablehlo.add %3271, %cst_40 : tensor<1x1200x1280xbf16>
-    %3273 = stablehlo.multiply %3272, %3242 : tensor<1x1200x1280xbf16>
-    %3274 = stablehlo.reshape %3273 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %3275 = stablehlo.dot_general %3274, %arg623, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %3276 = stablehlo.reshape %3275 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3277 = stablehlo.broadcast_in_dim %3276, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3278 = stablehlo.broadcast_in_dim %arg148, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %3279 = stablehlo.add %3277, %3278 : tensor<1x1200x320xbf16>
-    %3280 = stablehlo.reshape %3279 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3281 = stablehlo.reshape %3280 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3282 = stablehlo.add %3281, %3184 : tensor<1x1200x320xbf16>
-    %3283 = stablehlo.convert %3282 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3284 = stablehlo.convert %3283 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3285 = stablehlo.reduce(%3284 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3286 = stablehlo.reshape %3285 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3287 = stablehlo.broadcast_in_dim %3286, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3288 = stablehlo.divide %3287, %2987 : tensor<1x1200x1xf64>
-    %3289 = stablehlo.broadcast_in_dim %3284, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3290 = stablehlo.broadcast_in_dim %3288, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3291 = stablehlo.subtract %3289, %3290 : tensor<1x1200x320xf64>
-    %3292 = stablehlo.multiply %3291, %3291 : tensor<1x1200x320xf64>
-    %3293 = stablehlo.reduce(%3292 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3294 = stablehlo.reshape %3293 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3295 = stablehlo.broadcast_in_dim %3294, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3296 = stablehlo.divide %3295, %2987 : tensor<1x1200x1xf64>
-    %3297 = stablehlo.convert %3296 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3298 = stablehlo.reduce(%3283 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3299 = stablehlo.reshape %3298 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3300 = stablehlo.broadcast_in_dim %3299, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3301 = stablehlo.divide %3300, %3003 : tensor<1x1200x1xf32>
-    %3302 = stablehlo.broadcast_in_dim %3297, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3303 = stablehlo.add %3302, %3006 : tensor<1x1200x1xf32>
-    %3304 = stablehlo.rsqrt %3303 : tensor<1x1200x1xf32>
-    %3305 = stablehlo.broadcast_in_dim %3283, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3306 = stablehlo.broadcast_in_dim %3301, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3307 = stablehlo.subtract %3305, %3306 : tensor<1x1200x320xf32>
-    %3308 = stablehlo.broadcast_in_dim %3307, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3309 = stablehlo.broadcast_in_dim %3304, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3310 = stablehlo.multiply %3308, %3309 : tensor<1x1200x320xf32>
-    %3311 = stablehlo.convert %arg149 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3312 = stablehlo.broadcast_in_dim %3310, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3313 = stablehlo.broadcast_in_dim %3311, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3314 = stablehlo.multiply %3312, %3313 : tensor<1x1200x320xf32>
-    %3315 = stablehlo.convert %arg150 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3316 = stablehlo.broadcast_in_dim %3314, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3317 = stablehlo.broadcast_in_dim %3315, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3318 = stablehlo.add %3316, %3317 : tensor<1x1200x320xf32>
-    %3319 = stablehlo.convert %3318 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3320 = stablehlo.reshape %3319 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3321 = stablehlo.convert %3320 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3322 = stablehlo.dot_general %3321, %arg624, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3323 = stablehlo.broadcast_in_dim %3322, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3324 = stablehlo.multiply %3323, %3065 : tensor<1200x320xf32>
-    %3325 = stablehlo.broadcast_in_dim %3324, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3326 = stablehlo.broadcast_in_dim %arg625, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3327 = stablehlo.add %3325, %3326 : tensor<1200x320xf32>
-    %3328 = stablehlo.convert %3327 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3329 = stablehlo.reshape %3328 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3330 = stablehlo.reshape %3329 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3331 = stablehlo.transpose %3330, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3332 = stablehlo.transpose %3319, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %3333 = stablehlo.reshape %3332 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %3334 = stablehlo.convolution(%3333, %arg151) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %3335 = stablehlo.reshape %arg152 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %3336 = stablehlo.broadcast_in_dim %3334, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %3337 = stablehlo.broadcast_in_dim %3335, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %3338 = stablehlo.add %3336, %3337 : tensor<1x320x15x20xbf16>
-    %3339 = stablehlo.reshape %3338 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %3340 = stablehlo.transpose %3339, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %3341 = stablehlo.convert %3340 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %3342 = stablehlo.convert %3341 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %3343 = stablehlo.reduce(%3342 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3344 = stablehlo.reshape %3343 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3345 = stablehlo.broadcast_in_dim %3344, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3346 = stablehlo.divide %3345, %3088 : tensor<1x300x1xf64>
-    %3347 = stablehlo.broadcast_in_dim %3342, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %3348 = stablehlo.broadcast_in_dim %3346, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %3349 = stablehlo.subtract %3347, %3348 : tensor<1x300x320xf64>
-    %3350 = stablehlo.multiply %3349, %3349 : tensor<1x300x320xf64>
-    %3351 = stablehlo.reduce(%3350 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3352 = stablehlo.reshape %3351 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3353 = stablehlo.broadcast_in_dim %3352, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3354 = stablehlo.divide %3353, %3088 : tensor<1x300x1xf64>
-    %3355 = stablehlo.convert %3354 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %3356 = stablehlo.reduce(%3341 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %3357 = stablehlo.reshape %3356 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %3358 = stablehlo.broadcast_in_dim %3357, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3359 = stablehlo.divide %3358, %3102 : tensor<1x300x1xf32>
-    %3360 = stablehlo.broadcast_in_dim %3355, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3361 = stablehlo.add %3360, %136 : tensor<1x300x1xf32>
-    %3362 = stablehlo.rsqrt %3361 : tensor<1x300x1xf32>
-    %3363 = stablehlo.broadcast_in_dim %3341, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3364 = stablehlo.broadcast_in_dim %3359, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3365 = stablehlo.subtract %3363, %3364 : tensor<1x300x320xf32>
-    %3366 = stablehlo.broadcast_in_dim %3365, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3367 = stablehlo.broadcast_in_dim %3362, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3368 = stablehlo.multiply %3366, %3367 : tensor<1x300x320xf32>
-    %3369 = stablehlo.convert %arg153 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3370 = stablehlo.broadcast_in_dim %3368, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3371 = stablehlo.broadcast_in_dim %3369, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3372 = stablehlo.multiply %3370, %3371 : tensor<1x300x320xf32>
-    %3373 = stablehlo.convert %arg154 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3374 = stablehlo.broadcast_in_dim %3372, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3375 = stablehlo.broadcast_in_dim %3373, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3376 = stablehlo.add %3374, %3375 : tensor<1x300x320xf32>
-    %3377 = stablehlo.convert %3376 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %3378 = stablehlo.reshape %3377 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %3379 = stablehlo.convert %3378 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %3380 = stablehlo.dot_general %3379, %arg626, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3381 = stablehlo.broadcast_in_dim %3380, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3382 = stablehlo.multiply %3381, %3126 : tensor<300x320xf32>
-    %3383 = stablehlo.broadcast_in_dim %3382, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3384 = stablehlo.broadcast_in_dim %arg627, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3385 = stablehlo.add %3383, %3384 : tensor<300x320xf32>
-    %3386 = stablehlo.convert %3385 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3387 = stablehlo.reshape %3386 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3388 = stablehlo.reshape %3387 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3389 = stablehlo.transpose %3388, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3390 = stablehlo.dot_general %3379, %arg628, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3391 = stablehlo.broadcast_in_dim %3390, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3392 = stablehlo.multiply %3391, %3126 : tensor<300x320xf32>
-    %3393 = stablehlo.broadcast_in_dim %3392, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3394 = stablehlo.broadcast_in_dim %arg629, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3395 = stablehlo.add %3393, %3394 : tensor<300x320xf32>
-    %3396 = stablehlo.convert %3395 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3397 = stablehlo.reshape %3396 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3398 = stablehlo.reshape %3397 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3399 = stablehlo.transpose %3398, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3400 = stablehlo.transpose %3389, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %3401 = stablehlo.reshape %3331 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3402 = stablehlo.reshape %3400 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3403 = stablehlo.broadcast_in_dim %3402, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3404 = stablehlo.dot_general %3401, %3403, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3405 = stablehlo.reshape %3404 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3406 = stablehlo.broadcast_in_dim %3405, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3407 = stablehlo.divide %3406, %3152 : tensor<1x5x1200x300xbf16>
-    %3408 = stablehlo.convert %3407 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %3409 = stablehlo.reduce(%3408 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3410 = stablehlo.reshape %3409 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3411 = stablehlo.broadcast_in_dim %3408, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3412 = stablehlo.broadcast_in_dim %3410, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3413 = stablehlo.subtract %3411, %3412 : tensor<1x5x1200x300xf32>
-    %3414 = stablehlo.exponential %3413 : tensor<1x5x1200x300xf32>
-    %3415 = stablehlo.reduce(%3414 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3416 = stablehlo.reshape %3415 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3417 = stablehlo.broadcast_in_dim %3414, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3418 = stablehlo.broadcast_in_dim %3416, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3419 = stablehlo.divide %3417, %3418 : tensor<1x5x1200x300xf32>
-    %3420 = stablehlo.convert %3419 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %3421 = stablehlo.reshape %3420 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3422 = stablehlo.reshape %3399 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3423 = stablehlo.broadcast_in_dim %3422, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3424 = stablehlo.dot_general %3421, %3423, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3425 = stablehlo.reshape %3424 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3426 = stablehlo.transpose %3425, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3427 = stablehlo.reshape %3426 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %3428 = stablehlo.reshape %3427 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3429 = stablehlo.convert %3428 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3430 = stablehlo.dot_general %3429, %arg630, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3431 = stablehlo.broadcast_in_dim %3430, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3432 = stablehlo.multiply %3431, %3065 : tensor<1200x320xf32>
-    %3433 = stablehlo.broadcast_in_dim %3432, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3434 = stablehlo.broadcast_in_dim %arg631, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3435 = stablehlo.add %3433, %3434 : tensor<1200x320xf32>
-    %3436 = stablehlo.convert %3435 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3437 = stablehlo.reshape %3436 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3438 = stablehlo.add %3437, %3282 : tensor<1x1200x320xbf16>
-    %3439 = stablehlo.convert %3438 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3440 = stablehlo.convert %3439 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3441 = stablehlo.reduce(%3440 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3442 = stablehlo.reshape %3441 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3443 = stablehlo.broadcast_in_dim %3442, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3444 = stablehlo.divide %3443, %2987 : tensor<1x1200x1xf64>
-    %3445 = stablehlo.broadcast_in_dim %3440, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3446 = stablehlo.broadcast_in_dim %3444, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3447 = stablehlo.subtract %3445, %3446 : tensor<1x1200x320xf64>
-    %3448 = stablehlo.multiply %3447, %3447 : tensor<1x1200x320xf64>
-    %3449 = stablehlo.reduce(%3448 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3450 = stablehlo.reshape %3449 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3451 = stablehlo.broadcast_in_dim %3450, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3452 = stablehlo.divide %3451, %2987 : tensor<1x1200x1xf64>
-    %3453 = stablehlo.convert %3452 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3454 = stablehlo.reduce(%3439 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3455 = stablehlo.reshape %3454 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3456 = stablehlo.broadcast_in_dim %3455, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3457 = stablehlo.divide %3456, %3003 : tensor<1x1200x1xf32>
-    %3458 = stablehlo.broadcast_in_dim %3453, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3459 = stablehlo.add %3458, %3006 : tensor<1x1200x1xf32>
-    %3460 = stablehlo.rsqrt %3459 : tensor<1x1200x1xf32>
-    %3461 = stablehlo.broadcast_in_dim %3439, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3462 = stablehlo.broadcast_in_dim %3457, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3463 = stablehlo.subtract %3461, %3462 : tensor<1x1200x320xf32>
-    %3464 = stablehlo.broadcast_in_dim %3463, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3465 = stablehlo.broadcast_in_dim %3460, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3466 = stablehlo.multiply %3464, %3465 : tensor<1x1200x320xf32>
-    %3467 = stablehlo.convert %arg155 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3468 = stablehlo.broadcast_in_dim %3466, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3469 = stablehlo.broadcast_in_dim %3467, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3470 = stablehlo.multiply %3468, %3469 : tensor<1x1200x320xf32>
-    %3471 = stablehlo.convert %arg156 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3472 = stablehlo.broadcast_in_dim %3470, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3473 = stablehlo.broadcast_in_dim %3471, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3474 = stablehlo.add %3472, %3473 : tensor<1x1200x320xf32>
-    %3475 = stablehlo.convert %3474 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3476 = stablehlo.reshape %3475 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3477 = stablehlo.convert %3476 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3478 = stablehlo.dot_general %3477, %arg632, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %3479 = stablehlo.broadcast_in_dim %3478, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3480 = stablehlo.multiply %3479, %3226 : tensor<1200x1280xf32>
-    %3481 = stablehlo.broadcast_in_dim %3480, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3482 = stablehlo.broadcast_in_dim %arg633, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %3483 = stablehlo.add %3481, %3482 : tensor<1200x1280xf32>
-    %3484 = stablehlo.convert %3483 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %3485 = stablehlo.reshape %3484 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %3486 = stablehlo.transpose %3485, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %3487 = stablehlo.reshape %3486 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3488 = stablehlo.convolution(%3487, %arg157) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3489 = stablehlo.reshape %arg158 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %3490 = stablehlo.broadcast_in_dim %3488, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3491 = stablehlo.broadcast_in_dim %3489, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3492 = stablehlo.add %3490, %3491 : tensor<1x1280x30x40xbf16>
-    %3493 = stablehlo.reshape %3492 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %3494 = stablehlo.transpose %3493, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %3495 = stablehlo.multiply %3494, %cst_42 : tensor<1x1200x1280xbf16>
-    %3496 = stablehlo.multiply %3494, %3243 : tensor<1x1200x1280xbf16>
-    %3497 = stablehlo.convert %3496 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %3498 = stablehlo.clamp %cst_43, %3497, %cst_44 : tensor<1x1200x1280xf32>
-    %3499 = stablehlo.multiply %3498, %3498 : tensor<1x1200x1280xf32>
-    %3500 = stablehlo.multiply %cst_45, %3499 : tensor<1x1200x1280xf32>
-    %3501 = stablehlo.add %3500, %cst_46 : tensor<1x1200x1280xf32>
-    %3502 = stablehlo.multiply %3501, %3499 : tensor<1x1200x1280xf32>
-    %3503 = stablehlo.add %3502, %cst_47 : tensor<1x1200x1280xf32>
-    %3504 = stablehlo.multiply %3503, %3499 : tensor<1x1200x1280xf32>
-    %3505 = stablehlo.add %3504, %cst_48 : tensor<1x1200x1280xf32>
-    %3506 = stablehlo.multiply %3505, %3499 : tensor<1x1200x1280xf32>
-    %3507 = stablehlo.add %3506, %cst_49 : tensor<1x1200x1280xf32>
-    %3508 = stablehlo.multiply %3507, %3499 : tensor<1x1200x1280xf32>
-    %3509 = stablehlo.add %3508, %cst_50 : tensor<1x1200x1280xf32>
-    %3510 = stablehlo.multiply %3509, %3499 : tensor<1x1200x1280xf32>
-    %3511 = stablehlo.add %3510, %cst_51 : tensor<1x1200x1280xf32>
-    %3512 = stablehlo.multiply %cst_52, %3499 : tensor<1x1200x1280xf32>
-    %3513 = stablehlo.add %3512, %cst_53 : tensor<1x1200x1280xf32>
-    %3514 = stablehlo.multiply %3513, %3499 : tensor<1x1200x1280xf32>
-    %3515 = stablehlo.add %3514, %cst_54 : tensor<1x1200x1280xf32>
-    %3516 = stablehlo.multiply %3515, %3499 : tensor<1x1200x1280xf32>
-    %3517 = stablehlo.add %3516, %cst_55 : tensor<1x1200x1280xf32>
-    %3518 = stablehlo.multiply %3517, %3499 : tensor<1x1200x1280xf32>
-    %3519 = stablehlo.add %3518, %cst_56 : tensor<1x1200x1280xf32>
-    %3520 = stablehlo.multiply %3498, %3511 : tensor<1x1200x1280xf32>
-    %3521 = stablehlo.divide %3520, %3519 : tensor<1x1200x1280xf32>
-    %3522 = stablehlo.clamp %cst_57, %3521, %cst_58 : tensor<1x1200x1280xf32>
-    %3523 = stablehlo.convert %3522 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %3524 = stablehlo.add %3523, %cst_40 : tensor<1x1200x1280xbf16>
-    %3525 = stablehlo.multiply %3524, %3495 : tensor<1x1200x1280xbf16>
-    %3526 = stablehlo.reshape %3525 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %3527 = stablehlo.dot_general %3526, %arg634, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %3528 = stablehlo.reshape %3527 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3529 = stablehlo.broadcast_in_dim %3528, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3530 = stablehlo.broadcast_in_dim %arg159, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %3531 = stablehlo.add %3529, %3530 : tensor<1x1200x320xbf16>
-    %3532 = stablehlo.reshape %3531 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3533 = stablehlo.reshape %3532 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3534 = stablehlo.add %3533, %3438 : tensor<1x1200x320xbf16>
-    %3535 = stablehlo.convert %3534 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3536 = stablehlo.convert %3535 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3537 = stablehlo.reduce(%3536 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3538 = stablehlo.reshape %3537 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3539 = stablehlo.broadcast_in_dim %3538, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3540 = stablehlo.divide %3539, %2987 : tensor<1x1200x1xf64>
-    %3541 = stablehlo.broadcast_in_dim %3536, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3542 = stablehlo.broadcast_in_dim %3540, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3543 = stablehlo.subtract %3541, %3542 : tensor<1x1200x320xf64>
-    %3544 = stablehlo.multiply %3543, %3543 : tensor<1x1200x320xf64>
-    %3545 = stablehlo.reduce(%3544 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3546 = stablehlo.reshape %3545 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3547 = stablehlo.broadcast_in_dim %3546, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3548 = stablehlo.divide %3547, %2987 : tensor<1x1200x1xf64>
-    %3549 = stablehlo.convert %3548 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3550 = stablehlo.reduce(%3535 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3551 = stablehlo.reshape %3550 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3552 = stablehlo.broadcast_in_dim %3551, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3553 = stablehlo.divide %3552, %3003 : tensor<1x1200x1xf32>
-    %3554 = stablehlo.broadcast_in_dim %3549, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3555 = stablehlo.add %3554, %3006 : tensor<1x1200x1xf32>
-    %3556 = stablehlo.rsqrt %3555 : tensor<1x1200x1xf32>
-    %3557 = stablehlo.broadcast_in_dim %3535, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3558 = stablehlo.broadcast_in_dim %3553, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3559 = stablehlo.subtract %3557, %3558 : tensor<1x1200x320xf32>
-    %3560 = stablehlo.broadcast_in_dim %3559, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3561 = stablehlo.broadcast_in_dim %3556, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3562 = stablehlo.multiply %3560, %3561 : tensor<1x1200x320xf32>
-    %3563 = stablehlo.convert %arg160 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3564 = stablehlo.broadcast_in_dim %3562, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3565 = stablehlo.broadcast_in_dim %3563, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3566 = stablehlo.multiply %3564, %3565 : tensor<1x1200x320xf32>
-    %3567 = stablehlo.convert %arg161 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3568 = stablehlo.broadcast_in_dim %3566, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3569 = stablehlo.broadcast_in_dim %3567, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3570 = stablehlo.add %3568, %3569 : tensor<1x1200x320xf32>
-    %3571 = stablehlo.convert %3570 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3572 = stablehlo.reshape %3571 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3573 = stablehlo.convert %3572 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3574 = stablehlo.dot_general %3573, %arg635, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3575 = stablehlo.broadcast_in_dim %3574, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3576 = stablehlo.multiply %3575, %3065 : tensor<1200x320xf32>
-    %3577 = stablehlo.broadcast_in_dim %3576, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3578 = stablehlo.broadcast_in_dim %arg636, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3579 = stablehlo.add %3577, %3578 : tensor<1200x320xf32>
-    %3580 = stablehlo.convert %3579 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3581 = stablehlo.reshape %3580 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3582 = stablehlo.reshape %3581 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3583 = stablehlo.transpose %3582, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3584 = stablehlo.transpose %3571, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %3585 = stablehlo.reshape %3584 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %3586 = stablehlo.convolution(%3585, %arg162) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %3587 = stablehlo.reshape %arg163 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %3588 = stablehlo.broadcast_in_dim %3586, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %3589 = stablehlo.broadcast_in_dim %3587, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %3590 = stablehlo.add %3588, %3589 : tensor<1x320x15x20xbf16>
-    %3591 = stablehlo.reshape %3590 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %3592 = stablehlo.transpose %3591, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %3593 = stablehlo.convert %3592 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %3594 = stablehlo.convert %3593 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %3595 = stablehlo.reduce(%3594 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3596 = stablehlo.reshape %3595 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3597 = stablehlo.broadcast_in_dim %3596, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3598 = stablehlo.divide %3597, %3088 : tensor<1x300x1xf64>
-    %3599 = stablehlo.broadcast_in_dim %3594, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %3600 = stablehlo.broadcast_in_dim %3598, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %3601 = stablehlo.subtract %3599, %3600 : tensor<1x300x320xf64>
-    %3602 = stablehlo.multiply %3601, %3601 : tensor<1x300x320xf64>
-    %3603 = stablehlo.reduce(%3602 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3604 = stablehlo.reshape %3603 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3605 = stablehlo.broadcast_in_dim %3604, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3606 = stablehlo.divide %3605, %3088 : tensor<1x300x1xf64>
-    %3607 = stablehlo.convert %3606 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %3608 = stablehlo.reduce(%3593 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %3609 = stablehlo.reshape %3608 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %3610 = stablehlo.broadcast_in_dim %3609, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3611 = stablehlo.divide %3610, %3102 : tensor<1x300x1xf32>
-    %3612 = stablehlo.broadcast_in_dim %3607, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3613 = stablehlo.add %3612, %136 : tensor<1x300x1xf32>
-    %3614 = stablehlo.rsqrt %3613 : tensor<1x300x1xf32>
-    %3615 = stablehlo.broadcast_in_dim %3593, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3616 = stablehlo.broadcast_in_dim %3611, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3617 = stablehlo.subtract %3615, %3616 : tensor<1x300x320xf32>
-    %3618 = stablehlo.broadcast_in_dim %3617, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3619 = stablehlo.broadcast_in_dim %3614, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3620 = stablehlo.multiply %3618, %3619 : tensor<1x300x320xf32>
-    %3621 = stablehlo.convert %arg164 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3622 = stablehlo.broadcast_in_dim %3620, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3623 = stablehlo.broadcast_in_dim %3621, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3624 = stablehlo.multiply %3622, %3623 : tensor<1x300x320xf32>
-    %3625 = stablehlo.convert %arg165 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3626 = stablehlo.broadcast_in_dim %3624, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3627 = stablehlo.broadcast_in_dim %3625, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3628 = stablehlo.add %3626, %3627 : tensor<1x300x320xf32>
-    %3629 = stablehlo.convert %3628 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %3630 = stablehlo.reshape %3629 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %3631 = stablehlo.convert %3630 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %3632 = stablehlo.dot_general %3631, %arg637, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3633 = stablehlo.broadcast_in_dim %3632, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3634 = stablehlo.multiply %3633, %3126 : tensor<300x320xf32>
-    %3635 = stablehlo.broadcast_in_dim %3634, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3636 = stablehlo.broadcast_in_dim %arg638, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3637 = stablehlo.add %3635, %3636 : tensor<300x320xf32>
-    %3638 = stablehlo.convert %3637 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3639 = stablehlo.reshape %3638 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3640 = stablehlo.reshape %3639 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3641 = stablehlo.transpose %3640, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3642 = stablehlo.dot_general %3631, %arg639, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3643 = stablehlo.broadcast_in_dim %3642, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3644 = stablehlo.multiply %3643, %3126 : tensor<300x320xf32>
-    %3645 = stablehlo.broadcast_in_dim %3644, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3646 = stablehlo.broadcast_in_dim %arg640, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3647 = stablehlo.add %3645, %3646 : tensor<300x320xf32>
-    %3648 = stablehlo.convert %3647 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3649 = stablehlo.reshape %3648 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3650 = stablehlo.reshape %3649 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3651 = stablehlo.transpose %3650, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3652 = stablehlo.transpose %3641, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %3653 = stablehlo.reshape %3583 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3654 = stablehlo.reshape %3652 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3655 = stablehlo.broadcast_in_dim %3654, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3656 = stablehlo.dot_general %3653, %3655, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3657 = stablehlo.reshape %3656 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3658 = stablehlo.broadcast_in_dim %3657, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3659 = stablehlo.divide %3658, %3152 : tensor<1x5x1200x300xbf16>
-    %3660 = stablehlo.convert %3659 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %3661 = stablehlo.reduce(%3660 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3662 = stablehlo.reshape %3661 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3663 = stablehlo.broadcast_in_dim %3660, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3664 = stablehlo.broadcast_in_dim %3662, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3665 = stablehlo.subtract %3663, %3664 : tensor<1x5x1200x300xf32>
-    %3666 = stablehlo.exponential %3665 : tensor<1x5x1200x300xf32>
-    %3667 = stablehlo.reduce(%3666 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3668 = stablehlo.reshape %3667 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3669 = stablehlo.broadcast_in_dim %3666, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3670 = stablehlo.broadcast_in_dim %3668, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3671 = stablehlo.divide %3669, %3670 : tensor<1x5x1200x300xf32>
-    %3672 = stablehlo.convert %3671 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %3673 = stablehlo.reshape %3672 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3674 = stablehlo.reshape %3651 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3675 = stablehlo.broadcast_in_dim %3674, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3676 = stablehlo.dot_general %3673, %3675, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3677 = stablehlo.reshape %3676 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3678 = stablehlo.transpose %3677, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3679 = stablehlo.reshape %3678 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %3680 = stablehlo.reshape %3679 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3681 = stablehlo.convert %3680 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3682 = stablehlo.dot_general %3681, %arg641, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3683 = stablehlo.broadcast_in_dim %3682, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3684 = stablehlo.multiply %3683, %3065 : tensor<1200x320xf32>
-    %3685 = stablehlo.broadcast_in_dim %3684, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3686 = stablehlo.broadcast_in_dim %arg642, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3687 = stablehlo.add %3685, %3686 : tensor<1200x320xf32>
-    %3688 = stablehlo.convert %3687 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3689 = stablehlo.reshape %3688 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3690 = stablehlo.add %3689, %3534 : tensor<1x1200x320xbf16>
-    %3691 = stablehlo.convert %3690 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3692 = stablehlo.convert %3691 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3693 = stablehlo.reduce(%3692 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3694 = stablehlo.reshape %3693 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3695 = stablehlo.broadcast_in_dim %3694, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3696 = stablehlo.divide %3695, %2987 : tensor<1x1200x1xf64>
-    %3697 = stablehlo.broadcast_in_dim %3692, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3698 = stablehlo.broadcast_in_dim %3696, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3699 = stablehlo.subtract %3697, %3698 : tensor<1x1200x320xf64>
-    %3700 = stablehlo.multiply %3699, %3699 : tensor<1x1200x320xf64>
-    %3701 = stablehlo.reduce(%3700 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3702 = stablehlo.reshape %3701 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3703 = stablehlo.broadcast_in_dim %3702, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3704 = stablehlo.divide %3703, %2987 : tensor<1x1200x1xf64>
-    %3705 = stablehlo.convert %3704 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3706 = stablehlo.reduce(%3691 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3707 = stablehlo.reshape %3706 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3708 = stablehlo.broadcast_in_dim %3707, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3709 = stablehlo.divide %3708, %3003 : tensor<1x1200x1xf32>
-    %3710 = stablehlo.broadcast_in_dim %3705, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3711 = stablehlo.add %3710, %3006 : tensor<1x1200x1xf32>
-    %3712 = stablehlo.rsqrt %3711 : tensor<1x1200x1xf32>
-    %3713 = stablehlo.broadcast_in_dim %3691, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3714 = stablehlo.broadcast_in_dim %3709, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3715 = stablehlo.subtract %3713, %3714 : tensor<1x1200x320xf32>
-    %3716 = stablehlo.broadcast_in_dim %3715, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3717 = stablehlo.broadcast_in_dim %3712, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3718 = stablehlo.multiply %3716, %3717 : tensor<1x1200x320xf32>
-    %3719 = stablehlo.convert %arg166 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3720 = stablehlo.broadcast_in_dim %3718, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3721 = stablehlo.broadcast_in_dim %3719, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3722 = stablehlo.multiply %3720, %3721 : tensor<1x1200x320xf32>
-    %3723 = stablehlo.convert %arg167 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3724 = stablehlo.broadcast_in_dim %3722, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3725 = stablehlo.broadcast_in_dim %3723, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3726 = stablehlo.add %3724, %3725 : tensor<1x1200x320xf32>
-    %3727 = stablehlo.convert %3726 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3728 = stablehlo.reshape %3727 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3729 = stablehlo.convert %3728 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3730 = stablehlo.dot_general %3729, %arg643, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %3731 = stablehlo.broadcast_in_dim %3730, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3732 = stablehlo.multiply %3731, %3226 : tensor<1200x1280xf32>
-    %3733 = stablehlo.broadcast_in_dim %3732, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3734 = stablehlo.broadcast_in_dim %arg644, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %3735 = stablehlo.add %3733, %3734 : tensor<1200x1280xf32>
-    %3736 = stablehlo.convert %3735 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %3737 = stablehlo.reshape %3736 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %3738 = stablehlo.transpose %3737, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %3739 = stablehlo.reshape %3738 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3740 = stablehlo.convolution(%3739, %arg168) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3741 = stablehlo.reshape %arg169 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %3742 = stablehlo.broadcast_in_dim %3740, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3743 = stablehlo.broadcast_in_dim %3741, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3744 = stablehlo.add %3742, %3743 : tensor<1x1280x30x40xbf16>
-    %3745 = stablehlo.reshape %3744 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %3746 = stablehlo.transpose %3745, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %3747 = stablehlo.multiply %3746, %cst_42 : tensor<1x1200x1280xbf16>
-    %3748 = stablehlo.multiply %3746, %3243 : tensor<1x1200x1280xbf16>
-    %3749 = stablehlo.convert %3748 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %3750 = stablehlo.clamp %cst_43, %3749, %cst_44 : tensor<1x1200x1280xf32>
-    %3751 = stablehlo.multiply %3750, %3750 : tensor<1x1200x1280xf32>
-    %3752 = stablehlo.multiply %cst_45, %3751 : tensor<1x1200x1280xf32>
-    %3753 = stablehlo.add %3752, %cst_46 : tensor<1x1200x1280xf32>
-    %3754 = stablehlo.multiply %3753, %3751 : tensor<1x1200x1280xf32>
-    %3755 = stablehlo.add %3754, %cst_47 : tensor<1x1200x1280xf32>
-    %3756 = stablehlo.multiply %3755, %3751 : tensor<1x1200x1280xf32>
-    %3757 = stablehlo.add %3756, %cst_48 : tensor<1x1200x1280xf32>
-    %3758 = stablehlo.multiply %3757, %3751 : tensor<1x1200x1280xf32>
-    %3759 = stablehlo.add %3758, %cst_49 : tensor<1x1200x1280xf32>
-    %3760 = stablehlo.multiply %3759, %3751 : tensor<1x1200x1280xf32>
-    %3761 = stablehlo.add %3760, %cst_50 : tensor<1x1200x1280xf32>
-    %3762 = stablehlo.multiply %3761, %3751 : tensor<1x1200x1280xf32>
-    %3763 = stablehlo.add %3762, %cst_51 : tensor<1x1200x1280xf32>
-    %3764 = stablehlo.multiply %cst_52, %3751 : tensor<1x1200x1280xf32>
-    %3765 = stablehlo.add %3764, %cst_53 : tensor<1x1200x1280xf32>
-    %3766 = stablehlo.multiply %3765, %3751 : tensor<1x1200x1280xf32>
-    %3767 = stablehlo.add %3766, %cst_54 : tensor<1x1200x1280xf32>
-    %3768 = stablehlo.multiply %3767, %3751 : tensor<1x1200x1280xf32>
-    %3769 = stablehlo.add %3768, %cst_55 : tensor<1x1200x1280xf32>
-    %3770 = stablehlo.multiply %3769, %3751 : tensor<1x1200x1280xf32>
-    %3771 = stablehlo.add %3770, %cst_56 : tensor<1x1200x1280xf32>
-    %3772 = stablehlo.multiply %3750, %3763 : tensor<1x1200x1280xf32>
-    %3773 = stablehlo.divide %3772, %3771 : tensor<1x1200x1280xf32>
-    %3774 = stablehlo.clamp %cst_57, %3773, %cst_58 : tensor<1x1200x1280xf32>
-    %3775 = stablehlo.convert %3774 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %3776 = stablehlo.add %3775, %cst_40 : tensor<1x1200x1280xbf16>
-    %3777 = stablehlo.multiply %3776, %3747 : tensor<1x1200x1280xbf16>
-    %3778 = stablehlo.reshape %3777 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %3779 = stablehlo.dot_general %3778, %arg645, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %3780 = stablehlo.reshape %3779 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3781 = stablehlo.broadcast_in_dim %3780, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3782 = stablehlo.broadcast_in_dim %arg170, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %3783 = stablehlo.add %3781, %3782 : tensor<1x1200x320xbf16>
-    %3784 = stablehlo.reshape %3783 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3785 = stablehlo.reshape %3784 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3786 = stablehlo.add %3785, %3690 : tensor<1x1200x320xbf16>
-    %3787 = stablehlo.convert %3786 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3788 = stablehlo.convert %3787 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3789 = stablehlo.reduce(%3788 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3790 = stablehlo.reshape %3789 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3791 = stablehlo.broadcast_in_dim %3790, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3792 = stablehlo.divide %3791, %2987 : tensor<1x1200x1xf64>
-    %3793 = stablehlo.broadcast_in_dim %3788, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3794 = stablehlo.broadcast_in_dim %3792, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3795 = stablehlo.subtract %3793, %3794 : tensor<1x1200x320xf64>
-    %3796 = stablehlo.multiply %3795, %3795 : tensor<1x1200x320xf64>
-    %3797 = stablehlo.reduce(%3796 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3798 = stablehlo.reshape %3797 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3799 = stablehlo.broadcast_in_dim %3798, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3800 = stablehlo.divide %3799, %2987 : tensor<1x1200x1xf64>
-    %3801 = stablehlo.convert %3800 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3802 = stablehlo.reduce(%3787 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3803 = stablehlo.reshape %3802 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3804 = stablehlo.broadcast_in_dim %3803, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3805 = stablehlo.divide %3804, %3003 : tensor<1x1200x1xf32>
-    %3806 = stablehlo.broadcast_in_dim %3801, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3807 = stablehlo.add %3806, %3006 : tensor<1x1200x1xf32>
-    %3808 = stablehlo.rsqrt %3807 : tensor<1x1200x1xf32>
-    %3809 = stablehlo.broadcast_in_dim %3787, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3810 = stablehlo.broadcast_in_dim %3805, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3811 = stablehlo.subtract %3809, %3810 : tensor<1x1200x320xf32>
-    %3812 = stablehlo.broadcast_in_dim %3811, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3813 = stablehlo.broadcast_in_dim %3808, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3814 = stablehlo.multiply %3812, %3813 : tensor<1x1200x320xf32>
-    %3815 = stablehlo.convert %arg171 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3816 = stablehlo.broadcast_in_dim %3814, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3817 = stablehlo.broadcast_in_dim %3815, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3818 = stablehlo.multiply %3816, %3817 : tensor<1x1200x320xf32>
-    %3819 = stablehlo.convert %arg172 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3820 = stablehlo.broadcast_in_dim %3818, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3821 = stablehlo.broadcast_in_dim %3819, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3822 = stablehlo.add %3820, %3821 : tensor<1x1200x320xf32>
-    %3823 = stablehlo.convert %3822 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3824 = stablehlo.reshape %3823 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3825 = stablehlo.convert %3824 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3826 = stablehlo.dot_general %3825, %arg646, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3827 = stablehlo.broadcast_in_dim %3826, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3828 = stablehlo.multiply %3827, %3065 : tensor<1200x320xf32>
-    %3829 = stablehlo.broadcast_in_dim %3828, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3830 = stablehlo.broadcast_in_dim %arg647, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3831 = stablehlo.add %3829, %3830 : tensor<1200x320xf32>
-    %3832 = stablehlo.convert %3831 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3833 = stablehlo.reshape %3832 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3834 = stablehlo.reshape %3833 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3835 = stablehlo.transpose %3834, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3836 = stablehlo.transpose %3823, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %3837 = stablehlo.reshape %3836 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %3838 = stablehlo.convolution(%3837, %arg173) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %3839 = stablehlo.reshape %arg174 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %3840 = stablehlo.broadcast_in_dim %3838, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %3841 = stablehlo.broadcast_in_dim %3839, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %3842 = stablehlo.add %3840, %3841 : tensor<1x320x15x20xbf16>
-    %3843 = stablehlo.reshape %3842 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %3844 = stablehlo.transpose %3843, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %3845 = stablehlo.convert %3844 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %3846 = stablehlo.convert %3845 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %3847 = stablehlo.reduce(%3846 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3848 = stablehlo.reshape %3847 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3849 = stablehlo.broadcast_in_dim %3848, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3850 = stablehlo.divide %3849, %3088 : tensor<1x300x1xf64>
-    %3851 = stablehlo.broadcast_in_dim %3846, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %3852 = stablehlo.broadcast_in_dim %3850, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %3853 = stablehlo.subtract %3851, %3852 : tensor<1x300x320xf64>
-    %3854 = stablehlo.multiply %3853, %3853 : tensor<1x300x320xf64>
-    %3855 = stablehlo.reduce(%3854 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %3856 = stablehlo.reshape %3855 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %3857 = stablehlo.broadcast_in_dim %3856, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %3858 = stablehlo.divide %3857, %3088 : tensor<1x300x1xf64>
-    %3859 = stablehlo.convert %3858 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %3860 = stablehlo.reduce(%3845 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %3861 = stablehlo.reshape %3860 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %3862 = stablehlo.broadcast_in_dim %3861, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3863 = stablehlo.divide %3862, %3102 : tensor<1x300x1xf32>
-    %3864 = stablehlo.broadcast_in_dim %3859, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %3865 = stablehlo.add %3864, %136 : tensor<1x300x1xf32>
-    %3866 = stablehlo.rsqrt %3865 : tensor<1x300x1xf32>
-    %3867 = stablehlo.broadcast_in_dim %3845, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3868 = stablehlo.broadcast_in_dim %3863, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3869 = stablehlo.subtract %3867, %3868 : tensor<1x300x320xf32>
-    %3870 = stablehlo.broadcast_in_dim %3869, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3871 = stablehlo.broadcast_in_dim %3866, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %3872 = stablehlo.multiply %3870, %3871 : tensor<1x300x320xf32>
-    %3873 = stablehlo.convert %arg175 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3874 = stablehlo.broadcast_in_dim %3872, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3875 = stablehlo.broadcast_in_dim %3873, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3876 = stablehlo.multiply %3874, %3875 : tensor<1x300x320xf32>
-    %3877 = stablehlo.convert %arg176 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3878 = stablehlo.broadcast_in_dim %3876, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %3879 = stablehlo.broadcast_in_dim %3877, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %3880 = stablehlo.add %3878, %3879 : tensor<1x300x320xf32>
-    %3881 = stablehlo.convert %3880 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %3882 = stablehlo.reshape %3881 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %3883 = stablehlo.convert %3882 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %3884 = stablehlo.dot_general %3883, %arg648, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3885 = stablehlo.broadcast_in_dim %3884, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3886 = stablehlo.multiply %3885, %3126 : tensor<300x320xf32>
-    %3887 = stablehlo.broadcast_in_dim %3886, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3888 = stablehlo.broadcast_in_dim %arg649, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3889 = stablehlo.add %3887, %3888 : tensor<300x320xf32>
-    %3890 = stablehlo.convert %3889 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3891 = stablehlo.reshape %3890 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3892 = stablehlo.reshape %3891 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3893 = stablehlo.transpose %3892, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3894 = stablehlo.dot_general %3883, %arg650, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %3895 = stablehlo.broadcast_in_dim %3894, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3896 = stablehlo.multiply %3895, %3126 : tensor<300x320xf32>
-    %3897 = stablehlo.broadcast_in_dim %3896, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %3898 = stablehlo.broadcast_in_dim %arg651, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %3899 = stablehlo.add %3897, %3898 : tensor<300x320xf32>
-    %3900 = stablehlo.convert %3899 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %3901 = stablehlo.reshape %3900 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %3902 = stablehlo.reshape %3901 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %3903 = stablehlo.transpose %3902, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %3904 = stablehlo.transpose %3893, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %3905 = stablehlo.reshape %3835 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3906 = stablehlo.reshape %3904 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3907 = stablehlo.broadcast_in_dim %3906, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %3908 = stablehlo.dot_general %3905, %3907, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3909 = stablehlo.reshape %3908 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3910 = stablehlo.broadcast_in_dim %3909, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %3911 = stablehlo.divide %3910, %3152 : tensor<1x5x1200x300xbf16>
-    %3912 = stablehlo.convert %3911 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %3913 = stablehlo.reduce(%3912 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3914 = stablehlo.reshape %3913 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3915 = stablehlo.broadcast_in_dim %3912, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3916 = stablehlo.broadcast_in_dim %3914, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3917 = stablehlo.subtract %3915, %3916 : tensor<1x5x1200x300xf32>
-    %3918 = stablehlo.exponential %3917 : tensor<1x5x1200x300xf32>
-    %3919 = stablehlo.reduce(%3918 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %3920 = stablehlo.reshape %3919 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %3921 = stablehlo.broadcast_in_dim %3918, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %3922 = stablehlo.broadcast_in_dim %3920, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %3923 = stablehlo.divide %3921, %3922 : tensor<1x5x1200x300xf32>
-    %3924 = stablehlo.convert %3923 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %3925 = stablehlo.reshape %3924 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %3926 = stablehlo.reshape %3903 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3927 = stablehlo.broadcast_in_dim %3926, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %3928 = stablehlo.dot_general %3925, %3927, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %3929 = stablehlo.reshape %3928 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %3930 = stablehlo.transpose %3929, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %3931 = stablehlo.reshape %3930 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %3932 = stablehlo.reshape %3931 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3933 = stablehlo.convert %3932 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3934 = stablehlo.dot_general %3933, %arg652, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %3935 = stablehlo.broadcast_in_dim %3934, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3936 = stablehlo.multiply %3935, %3065 : tensor<1200x320xf32>
-    %3937 = stablehlo.broadcast_in_dim %3936, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %3938 = stablehlo.broadcast_in_dim %arg653, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %3939 = stablehlo.add %3937, %3938 : tensor<1200x320xf32>
-    %3940 = stablehlo.convert %3939 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %3941 = stablehlo.reshape %3940 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %3942 = stablehlo.add %3941, %3786 : tensor<1x1200x320xbf16>
-    %3943 = stablehlo.convert %3942 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %3944 = stablehlo.convert %3943 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %3945 = stablehlo.reduce(%3944 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3946 = stablehlo.reshape %3945 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3947 = stablehlo.broadcast_in_dim %3946, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3948 = stablehlo.divide %3947, %2987 : tensor<1x1200x1xf64>
-    %3949 = stablehlo.broadcast_in_dim %3944, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %3950 = stablehlo.broadcast_in_dim %3948, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %3951 = stablehlo.subtract %3949, %3950 : tensor<1x1200x320xf64>
-    %3952 = stablehlo.multiply %3951, %3951 : tensor<1x1200x320xf64>
-    %3953 = stablehlo.reduce(%3952 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %3954 = stablehlo.reshape %3953 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %3955 = stablehlo.broadcast_in_dim %3954, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %3956 = stablehlo.divide %3955, %2987 : tensor<1x1200x1xf64>
-    %3957 = stablehlo.convert %3956 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %3958 = stablehlo.reduce(%3943 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %3959 = stablehlo.reshape %3958 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %3960 = stablehlo.broadcast_in_dim %3959, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3961 = stablehlo.divide %3960, %3003 : tensor<1x1200x1xf32>
-    %3962 = stablehlo.broadcast_in_dim %3957, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %3963 = stablehlo.add %3962, %3006 : tensor<1x1200x1xf32>
-    %3964 = stablehlo.rsqrt %3963 : tensor<1x1200x1xf32>
-    %3965 = stablehlo.broadcast_in_dim %3943, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3966 = stablehlo.broadcast_in_dim %3961, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3967 = stablehlo.subtract %3965, %3966 : tensor<1x1200x320xf32>
-    %3968 = stablehlo.broadcast_in_dim %3967, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3969 = stablehlo.broadcast_in_dim %3964, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %3970 = stablehlo.multiply %3968, %3969 : tensor<1x1200x320xf32>
-    %3971 = stablehlo.convert %arg177 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3972 = stablehlo.broadcast_in_dim %3970, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3973 = stablehlo.broadcast_in_dim %3971, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3974 = stablehlo.multiply %3972, %3973 : tensor<1x1200x320xf32>
-    %3975 = stablehlo.convert %arg178 : (tensor<320xbf16>) -> tensor<320xf32>
-    %3976 = stablehlo.broadcast_in_dim %3974, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %3977 = stablehlo.broadcast_in_dim %3975, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %3978 = stablehlo.add %3976, %3977 : tensor<1x1200x320xf32>
-    %3979 = stablehlo.convert %3978 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %3980 = stablehlo.reshape %3979 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %3981 = stablehlo.convert %3980 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %3982 = stablehlo.dot_general %3981, %arg654, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %3983 = stablehlo.broadcast_in_dim %3982, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3984 = stablehlo.multiply %3983, %3226 : tensor<1200x1280xf32>
-    %3985 = stablehlo.broadcast_in_dim %3984, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %3986 = stablehlo.broadcast_in_dim %arg655, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %3987 = stablehlo.add %3985, %3986 : tensor<1200x1280xf32>
-    %3988 = stablehlo.convert %3987 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %3989 = stablehlo.reshape %3988 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %3990 = stablehlo.transpose %3989, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %3991 = stablehlo.reshape %3990 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3992 = stablehlo.convolution(%3991, %arg179) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3993 = stablehlo.reshape %arg180 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %3994 = stablehlo.broadcast_in_dim %3992, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3995 = stablehlo.broadcast_in_dim %3993, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %3996 = stablehlo.add %3994, %3995 : tensor<1x1280x30x40xbf16>
-    %3997 = stablehlo.reshape %3996 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %3998 = stablehlo.transpose %3997, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %3999 = stablehlo.multiply %3998, %cst_42 : tensor<1x1200x1280xbf16>
-    %4000 = stablehlo.multiply %3998, %3243 : tensor<1x1200x1280xbf16>
-    %4001 = stablehlo.convert %4000 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %4002 = stablehlo.clamp %cst_43, %4001, %cst_44 : tensor<1x1200x1280xf32>
-    %4003 = stablehlo.multiply %4002, %4002 : tensor<1x1200x1280xf32>
-    %4004 = stablehlo.multiply %cst_45, %4003 : tensor<1x1200x1280xf32>
-    %4005 = stablehlo.add %4004, %cst_46 : tensor<1x1200x1280xf32>
-    %4006 = stablehlo.multiply %4005, %4003 : tensor<1x1200x1280xf32>
-    %4007 = stablehlo.add %4006, %cst_47 : tensor<1x1200x1280xf32>
-    %4008 = stablehlo.multiply %4007, %4003 : tensor<1x1200x1280xf32>
-    %4009 = stablehlo.add %4008, %cst_48 : tensor<1x1200x1280xf32>
-    %4010 = stablehlo.multiply %4009, %4003 : tensor<1x1200x1280xf32>
-    %4011 = stablehlo.add %4010, %cst_49 : tensor<1x1200x1280xf32>
-    %4012 = stablehlo.multiply %4011, %4003 : tensor<1x1200x1280xf32>
-    %4013 = stablehlo.add %4012, %cst_50 : tensor<1x1200x1280xf32>
-    %4014 = stablehlo.multiply %4013, %4003 : tensor<1x1200x1280xf32>
-    %4015 = stablehlo.add %4014, %cst_51 : tensor<1x1200x1280xf32>
-    %4016 = stablehlo.multiply %cst_52, %4003 : tensor<1x1200x1280xf32>
-    %4017 = stablehlo.add %4016, %cst_53 : tensor<1x1200x1280xf32>
-    %4018 = stablehlo.multiply %4017, %4003 : tensor<1x1200x1280xf32>
-    %4019 = stablehlo.add %4018, %cst_54 : tensor<1x1200x1280xf32>
-    %4020 = stablehlo.multiply %4019, %4003 : tensor<1x1200x1280xf32>
-    %4021 = stablehlo.add %4020, %cst_55 : tensor<1x1200x1280xf32>
-    %4022 = stablehlo.multiply %4021, %4003 : tensor<1x1200x1280xf32>
-    %4023 = stablehlo.add %4022, %cst_56 : tensor<1x1200x1280xf32>
-    %4024 = stablehlo.multiply %4002, %4015 : tensor<1x1200x1280xf32>
-    %4025 = stablehlo.divide %4024, %4023 : tensor<1x1200x1280xf32>
-    %4026 = stablehlo.clamp %cst_57, %4025, %cst_58 : tensor<1x1200x1280xf32>
-    %4027 = stablehlo.convert %4026 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %4028 = stablehlo.add %4027, %cst_40 : tensor<1x1200x1280xbf16>
-    %4029 = stablehlo.multiply %4028, %3999 : tensor<1x1200x1280xbf16>
-    %4030 = stablehlo.reshape %4029 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %4031 = stablehlo.dot_general %4030, %arg656, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %4032 = stablehlo.reshape %4031 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4033 = stablehlo.broadcast_in_dim %4032, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4034 = stablehlo.broadcast_in_dim %arg181, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %4035 = stablehlo.add %4033, %4034 : tensor<1x1200x320xbf16>
-    %4036 = stablehlo.reshape %4035 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4037 = stablehlo.reshape %4036 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4038 = stablehlo.add %4037, %3942 : tensor<1x1200x320xbf16>
-    %4039 = stablehlo.convert %4038 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4040 = stablehlo.convert %4039 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4041 = stablehlo.reduce(%4040 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4042 = stablehlo.reshape %4041 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4043 = stablehlo.broadcast_in_dim %4042, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4044 = stablehlo.divide %4043, %2987 : tensor<1x1200x1xf64>
-    %4045 = stablehlo.broadcast_in_dim %4040, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4046 = stablehlo.broadcast_in_dim %4044, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4047 = stablehlo.subtract %4045, %4046 : tensor<1x1200x320xf64>
-    %4048 = stablehlo.multiply %4047, %4047 : tensor<1x1200x320xf64>
-    %4049 = stablehlo.reduce(%4048 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4050 = stablehlo.reshape %4049 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4051 = stablehlo.broadcast_in_dim %4050, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4052 = stablehlo.divide %4051, %2987 : tensor<1x1200x1xf64>
-    %4053 = stablehlo.convert %4052 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4054 = stablehlo.reduce(%4039 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4055 = stablehlo.reshape %4054 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4056 = stablehlo.broadcast_in_dim %4055, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4057 = stablehlo.divide %4056, %3003 : tensor<1x1200x1xf32>
-    %4058 = stablehlo.broadcast_in_dim %4053, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4059 = stablehlo.add %4058, %3006 : tensor<1x1200x1xf32>
-    %4060 = stablehlo.rsqrt %4059 : tensor<1x1200x1xf32>
-    %4061 = stablehlo.broadcast_in_dim %4039, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4062 = stablehlo.broadcast_in_dim %4057, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4063 = stablehlo.subtract %4061, %4062 : tensor<1x1200x320xf32>
-    %4064 = stablehlo.broadcast_in_dim %4063, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4065 = stablehlo.broadcast_in_dim %4060, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4066 = stablehlo.multiply %4064, %4065 : tensor<1x1200x320xf32>
-    %4067 = stablehlo.convert %arg182 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4068 = stablehlo.broadcast_in_dim %4066, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4069 = stablehlo.broadcast_in_dim %4067, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4070 = stablehlo.multiply %4068, %4069 : tensor<1x1200x320xf32>
-    %4071 = stablehlo.convert %arg183 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4072 = stablehlo.broadcast_in_dim %4070, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4073 = stablehlo.broadcast_in_dim %4071, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4074 = stablehlo.add %4072, %4073 : tensor<1x1200x320xf32>
-    %4075 = stablehlo.convert %4074 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4076 = stablehlo.reshape %4075 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4077 = stablehlo.convert %4076 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4078 = stablehlo.dot_general %4077, %arg657, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4079 = stablehlo.broadcast_in_dim %4078, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4080 = stablehlo.multiply %4079, %3065 : tensor<1200x320xf32>
-    %4081 = stablehlo.broadcast_in_dim %4080, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4082 = stablehlo.broadcast_in_dim %arg658, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4083 = stablehlo.add %4081, %4082 : tensor<1200x320xf32>
-    %4084 = stablehlo.convert %4083 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4085 = stablehlo.reshape %4084 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4086 = stablehlo.reshape %4085 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4087 = stablehlo.transpose %4086, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4088 = stablehlo.transpose %4075, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %4089 = stablehlo.reshape %4088 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %4090 = stablehlo.convolution(%4089, %arg184) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %4091 = stablehlo.reshape %arg185 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %4092 = stablehlo.broadcast_in_dim %4090, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %4093 = stablehlo.broadcast_in_dim %4091, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %4094 = stablehlo.add %4092, %4093 : tensor<1x320x15x20xbf16>
-    %4095 = stablehlo.reshape %4094 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %4096 = stablehlo.transpose %4095, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %4097 = stablehlo.convert %4096 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %4098 = stablehlo.convert %4097 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %4099 = stablehlo.reduce(%4098 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4100 = stablehlo.reshape %4099 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4101 = stablehlo.broadcast_in_dim %4100, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4102 = stablehlo.divide %4101, %3088 : tensor<1x300x1xf64>
-    %4103 = stablehlo.broadcast_in_dim %4098, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %4104 = stablehlo.broadcast_in_dim %4102, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %4105 = stablehlo.subtract %4103, %4104 : tensor<1x300x320xf64>
-    %4106 = stablehlo.multiply %4105, %4105 : tensor<1x300x320xf64>
-    %4107 = stablehlo.reduce(%4106 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4108 = stablehlo.reshape %4107 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4109 = stablehlo.broadcast_in_dim %4108, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4110 = stablehlo.divide %4109, %3088 : tensor<1x300x1xf64>
-    %4111 = stablehlo.convert %4110 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %4112 = stablehlo.reduce(%4097 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %4113 = stablehlo.reshape %4112 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %4114 = stablehlo.broadcast_in_dim %4113, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4115 = stablehlo.divide %4114, %3102 : tensor<1x300x1xf32>
-    %4116 = stablehlo.broadcast_in_dim %4111, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4117 = stablehlo.add %4116, %136 : tensor<1x300x1xf32>
-    %4118 = stablehlo.rsqrt %4117 : tensor<1x300x1xf32>
-    %4119 = stablehlo.broadcast_in_dim %4097, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4120 = stablehlo.broadcast_in_dim %4115, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4121 = stablehlo.subtract %4119, %4120 : tensor<1x300x320xf32>
-    %4122 = stablehlo.broadcast_in_dim %4121, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4123 = stablehlo.broadcast_in_dim %4118, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4124 = stablehlo.multiply %4122, %4123 : tensor<1x300x320xf32>
-    %4125 = stablehlo.convert %arg186 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4126 = stablehlo.broadcast_in_dim %4124, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4127 = stablehlo.broadcast_in_dim %4125, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4128 = stablehlo.multiply %4126, %4127 : tensor<1x300x320xf32>
-    %4129 = stablehlo.convert %arg187 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4130 = stablehlo.broadcast_in_dim %4128, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4131 = stablehlo.broadcast_in_dim %4129, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4132 = stablehlo.add %4130, %4131 : tensor<1x300x320xf32>
-    %4133 = stablehlo.convert %4132 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %4134 = stablehlo.reshape %4133 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %4135 = stablehlo.convert %4134 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %4136 = stablehlo.dot_general %4135, %arg659, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4137 = stablehlo.broadcast_in_dim %4136, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4138 = stablehlo.multiply %4137, %3126 : tensor<300x320xf32>
-    %4139 = stablehlo.broadcast_in_dim %4138, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4140 = stablehlo.broadcast_in_dim %arg660, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4141 = stablehlo.add %4139, %4140 : tensor<300x320xf32>
-    %4142 = stablehlo.convert %4141 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4143 = stablehlo.reshape %4142 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4144 = stablehlo.reshape %4143 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4145 = stablehlo.transpose %4144, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4146 = stablehlo.dot_general %4135, %arg661, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4147 = stablehlo.broadcast_in_dim %4146, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4148 = stablehlo.multiply %4147, %3126 : tensor<300x320xf32>
-    %4149 = stablehlo.broadcast_in_dim %4148, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4150 = stablehlo.broadcast_in_dim %arg662, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4151 = stablehlo.add %4149, %4150 : tensor<300x320xf32>
-    %4152 = stablehlo.convert %4151 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4153 = stablehlo.reshape %4152 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4154 = stablehlo.reshape %4153 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4155 = stablehlo.transpose %4154, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4156 = stablehlo.transpose %4145, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %4157 = stablehlo.reshape %4087 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4158 = stablehlo.reshape %4156 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4159 = stablehlo.broadcast_in_dim %4158, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4160 = stablehlo.dot_general %4157, %4159, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4161 = stablehlo.reshape %4160 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4162 = stablehlo.broadcast_in_dim %4161, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4163 = stablehlo.divide %4162, %3152 : tensor<1x5x1200x300xbf16>
-    %4164 = stablehlo.convert %4163 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %4165 = stablehlo.reduce(%4164 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4166 = stablehlo.reshape %4165 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4167 = stablehlo.broadcast_in_dim %4164, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4168 = stablehlo.broadcast_in_dim %4166, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4169 = stablehlo.subtract %4167, %4168 : tensor<1x5x1200x300xf32>
-    %4170 = stablehlo.exponential %4169 : tensor<1x5x1200x300xf32>
-    %4171 = stablehlo.reduce(%4170 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4172 = stablehlo.reshape %4171 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4173 = stablehlo.broadcast_in_dim %4170, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4174 = stablehlo.broadcast_in_dim %4172, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4175 = stablehlo.divide %4173, %4174 : tensor<1x5x1200x300xf32>
-    %4176 = stablehlo.convert %4175 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %4177 = stablehlo.reshape %4176 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4178 = stablehlo.reshape %4155 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4179 = stablehlo.broadcast_in_dim %4178, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4180 = stablehlo.dot_general %4177, %4179, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4181 = stablehlo.reshape %4180 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4182 = stablehlo.transpose %4181, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4183 = stablehlo.reshape %4182 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %4184 = stablehlo.reshape %4183 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4185 = stablehlo.convert %4184 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4186 = stablehlo.dot_general %4185, %arg663, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4187 = stablehlo.broadcast_in_dim %4186, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4188 = stablehlo.multiply %4187, %3065 : tensor<1200x320xf32>
-    %4189 = stablehlo.broadcast_in_dim %4188, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4190 = stablehlo.broadcast_in_dim %arg664, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4191 = stablehlo.add %4189, %4190 : tensor<1200x320xf32>
-    %4192 = stablehlo.convert %4191 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4193 = stablehlo.reshape %4192 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4194 = stablehlo.add %4193, %4038 : tensor<1x1200x320xbf16>
-    %4195 = stablehlo.convert %4194 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4196 = stablehlo.convert %4195 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4197 = stablehlo.reduce(%4196 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4198 = stablehlo.reshape %4197 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4199 = stablehlo.broadcast_in_dim %4198, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4200 = stablehlo.divide %4199, %2987 : tensor<1x1200x1xf64>
-    %4201 = stablehlo.broadcast_in_dim %4196, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4202 = stablehlo.broadcast_in_dim %4200, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4203 = stablehlo.subtract %4201, %4202 : tensor<1x1200x320xf64>
-    %4204 = stablehlo.multiply %4203, %4203 : tensor<1x1200x320xf64>
-    %4205 = stablehlo.reduce(%4204 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4206 = stablehlo.reshape %4205 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4207 = stablehlo.broadcast_in_dim %4206, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4208 = stablehlo.divide %4207, %2987 : tensor<1x1200x1xf64>
-    %4209 = stablehlo.convert %4208 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4210 = stablehlo.reduce(%4195 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4211 = stablehlo.reshape %4210 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4212 = stablehlo.broadcast_in_dim %4211, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4213 = stablehlo.divide %4212, %3003 : tensor<1x1200x1xf32>
-    %4214 = stablehlo.broadcast_in_dim %4209, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4215 = stablehlo.add %4214, %3006 : tensor<1x1200x1xf32>
-    %4216 = stablehlo.rsqrt %4215 : tensor<1x1200x1xf32>
-    %4217 = stablehlo.broadcast_in_dim %4195, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4218 = stablehlo.broadcast_in_dim %4213, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4219 = stablehlo.subtract %4217, %4218 : tensor<1x1200x320xf32>
-    %4220 = stablehlo.broadcast_in_dim %4219, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4221 = stablehlo.broadcast_in_dim %4216, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4222 = stablehlo.multiply %4220, %4221 : tensor<1x1200x320xf32>
-    %4223 = stablehlo.convert %arg188 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4224 = stablehlo.broadcast_in_dim %4222, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4225 = stablehlo.broadcast_in_dim %4223, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4226 = stablehlo.multiply %4224, %4225 : tensor<1x1200x320xf32>
-    %4227 = stablehlo.convert %arg189 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4228 = stablehlo.broadcast_in_dim %4226, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4229 = stablehlo.broadcast_in_dim %4227, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4230 = stablehlo.add %4228, %4229 : tensor<1x1200x320xf32>
-    %4231 = stablehlo.convert %4230 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4232 = stablehlo.reshape %4231 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4233 = stablehlo.convert %4232 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4234 = stablehlo.dot_general %4233, %arg665, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %4235 = stablehlo.broadcast_in_dim %4234, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4236 = stablehlo.multiply %4235, %3226 : tensor<1200x1280xf32>
-    %4237 = stablehlo.broadcast_in_dim %4236, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4238 = stablehlo.broadcast_in_dim %arg666, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %4239 = stablehlo.add %4237, %4238 : tensor<1200x1280xf32>
-    %4240 = stablehlo.convert %4239 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %4241 = stablehlo.reshape %4240 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %4242 = stablehlo.transpose %4241, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %4243 = stablehlo.reshape %4242 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4244 = stablehlo.convolution(%4243, %arg190) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4245 = stablehlo.reshape %arg191 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %4246 = stablehlo.broadcast_in_dim %4244, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4247 = stablehlo.broadcast_in_dim %4245, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4248 = stablehlo.add %4246, %4247 : tensor<1x1280x30x40xbf16>
-    %4249 = stablehlo.reshape %4248 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %4250 = stablehlo.transpose %4249, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %4251 = stablehlo.multiply %4250, %cst_42 : tensor<1x1200x1280xbf16>
-    %4252 = stablehlo.multiply %4250, %3243 : tensor<1x1200x1280xbf16>
-    %4253 = stablehlo.convert %4252 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %4254 = stablehlo.clamp %cst_43, %4253, %cst_44 : tensor<1x1200x1280xf32>
-    %4255 = stablehlo.multiply %4254, %4254 : tensor<1x1200x1280xf32>
-    %4256 = stablehlo.multiply %cst_45, %4255 : tensor<1x1200x1280xf32>
-    %4257 = stablehlo.add %4256, %cst_46 : tensor<1x1200x1280xf32>
-    %4258 = stablehlo.multiply %4257, %4255 : tensor<1x1200x1280xf32>
-    %4259 = stablehlo.add %4258, %cst_47 : tensor<1x1200x1280xf32>
-    %4260 = stablehlo.multiply %4259, %4255 : tensor<1x1200x1280xf32>
-    %4261 = stablehlo.add %4260, %cst_48 : tensor<1x1200x1280xf32>
-    %4262 = stablehlo.multiply %4261, %4255 : tensor<1x1200x1280xf32>
-    %4263 = stablehlo.add %4262, %cst_49 : tensor<1x1200x1280xf32>
-    %4264 = stablehlo.multiply %4263, %4255 : tensor<1x1200x1280xf32>
-    %4265 = stablehlo.add %4264, %cst_50 : tensor<1x1200x1280xf32>
-    %4266 = stablehlo.multiply %4265, %4255 : tensor<1x1200x1280xf32>
-    %4267 = stablehlo.add %4266, %cst_51 : tensor<1x1200x1280xf32>
-    %4268 = stablehlo.multiply %cst_52, %4255 : tensor<1x1200x1280xf32>
-    %4269 = stablehlo.add %4268, %cst_53 : tensor<1x1200x1280xf32>
-    %4270 = stablehlo.multiply %4269, %4255 : tensor<1x1200x1280xf32>
-    %4271 = stablehlo.add %4270, %cst_54 : tensor<1x1200x1280xf32>
-    %4272 = stablehlo.multiply %4271, %4255 : tensor<1x1200x1280xf32>
-    %4273 = stablehlo.add %4272, %cst_55 : tensor<1x1200x1280xf32>
-    %4274 = stablehlo.multiply %4273, %4255 : tensor<1x1200x1280xf32>
-    %4275 = stablehlo.add %4274, %cst_56 : tensor<1x1200x1280xf32>
-    %4276 = stablehlo.multiply %4254, %4267 : tensor<1x1200x1280xf32>
-    %4277 = stablehlo.divide %4276, %4275 : tensor<1x1200x1280xf32>
-    %4278 = stablehlo.clamp %cst_57, %4277, %cst_58 : tensor<1x1200x1280xf32>
-    %4279 = stablehlo.convert %4278 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %4280 = stablehlo.add %4279, %cst_40 : tensor<1x1200x1280xbf16>
-    %4281 = stablehlo.multiply %4280, %4251 : tensor<1x1200x1280xbf16>
-    %4282 = stablehlo.reshape %4281 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %4283 = stablehlo.dot_general %4282, %arg667, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %4284 = stablehlo.reshape %4283 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4285 = stablehlo.broadcast_in_dim %4284, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4286 = stablehlo.broadcast_in_dim %arg192, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %4287 = stablehlo.add %4285, %4286 : tensor<1x1200x320xbf16>
-    %4288 = stablehlo.reshape %4287 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4289 = stablehlo.reshape %4288 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4290 = stablehlo.add %4289, %4194 : tensor<1x1200x320xbf16>
-    %4291 = stablehlo.convert %4290 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4292 = stablehlo.convert %4291 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4293 = stablehlo.reduce(%4292 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4294 = stablehlo.reshape %4293 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4295 = stablehlo.broadcast_in_dim %4294, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4296 = stablehlo.divide %4295, %2987 : tensor<1x1200x1xf64>
-    %4297 = stablehlo.broadcast_in_dim %4292, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4298 = stablehlo.broadcast_in_dim %4296, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4299 = stablehlo.subtract %4297, %4298 : tensor<1x1200x320xf64>
-    %4300 = stablehlo.multiply %4299, %4299 : tensor<1x1200x320xf64>
-    %4301 = stablehlo.reduce(%4300 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4302 = stablehlo.reshape %4301 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4303 = stablehlo.broadcast_in_dim %4302, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4304 = stablehlo.divide %4303, %2987 : tensor<1x1200x1xf64>
-    %4305 = stablehlo.convert %4304 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4306 = stablehlo.reduce(%4291 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4307 = stablehlo.reshape %4306 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4308 = stablehlo.broadcast_in_dim %4307, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4309 = stablehlo.divide %4308, %3003 : tensor<1x1200x1xf32>
-    %4310 = stablehlo.broadcast_in_dim %4305, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4311 = stablehlo.add %4310, %3006 : tensor<1x1200x1xf32>
-    %4312 = stablehlo.rsqrt %4311 : tensor<1x1200x1xf32>
-    %4313 = stablehlo.broadcast_in_dim %4291, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4314 = stablehlo.broadcast_in_dim %4309, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4315 = stablehlo.subtract %4313, %4314 : tensor<1x1200x320xf32>
-    %4316 = stablehlo.broadcast_in_dim %4315, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4317 = stablehlo.broadcast_in_dim %4312, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4318 = stablehlo.multiply %4316, %4317 : tensor<1x1200x320xf32>
-    %4319 = stablehlo.convert %arg193 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4320 = stablehlo.broadcast_in_dim %4318, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4321 = stablehlo.broadcast_in_dim %4319, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4322 = stablehlo.multiply %4320, %4321 : tensor<1x1200x320xf32>
-    %4323 = stablehlo.convert %arg194 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4324 = stablehlo.broadcast_in_dim %4322, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4325 = stablehlo.broadcast_in_dim %4323, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4326 = stablehlo.add %4324, %4325 : tensor<1x1200x320xf32>
-    %4327 = stablehlo.convert %4326 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4328 = stablehlo.reshape %4327 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4329 = stablehlo.convert %4328 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4330 = stablehlo.dot_general %4329, %arg668, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4331 = stablehlo.broadcast_in_dim %4330, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4332 = stablehlo.multiply %4331, %3065 : tensor<1200x320xf32>
-    %4333 = stablehlo.broadcast_in_dim %4332, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4334 = stablehlo.broadcast_in_dim %arg669, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4335 = stablehlo.add %4333, %4334 : tensor<1200x320xf32>
-    %4336 = stablehlo.convert %4335 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4337 = stablehlo.reshape %4336 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4338 = stablehlo.reshape %4337 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4339 = stablehlo.transpose %4338, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4340 = stablehlo.transpose %4327, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %4341 = stablehlo.reshape %4340 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %4342 = stablehlo.convolution(%4341, %arg195) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %4343 = stablehlo.reshape %arg196 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %4344 = stablehlo.broadcast_in_dim %4342, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %4345 = stablehlo.broadcast_in_dim %4343, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %4346 = stablehlo.add %4344, %4345 : tensor<1x320x15x20xbf16>
-    %4347 = stablehlo.reshape %4346 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %4348 = stablehlo.transpose %4347, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %4349 = stablehlo.convert %4348 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %4350 = stablehlo.convert %4349 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %4351 = stablehlo.reduce(%4350 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4352 = stablehlo.reshape %4351 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4353 = stablehlo.broadcast_in_dim %4352, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4354 = stablehlo.divide %4353, %3088 : tensor<1x300x1xf64>
-    %4355 = stablehlo.broadcast_in_dim %4350, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %4356 = stablehlo.broadcast_in_dim %4354, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %4357 = stablehlo.subtract %4355, %4356 : tensor<1x300x320xf64>
-    %4358 = stablehlo.multiply %4357, %4357 : tensor<1x300x320xf64>
-    %4359 = stablehlo.reduce(%4358 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4360 = stablehlo.reshape %4359 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4361 = stablehlo.broadcast_in_dim %4360, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4362 = stablehlo.divide %4361, %3088 : tensor<1x300x1xf64>
-    %4363 = stablehlo.convert %4362 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %4364 = stablehlo.reduce(%4349 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %4365 = stablehlo.reshape %4364 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %4366 = stablehlo.broadcast_in_dim %4365, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4367 = stablehlo.divide %4366, %3102 : tensor<1x300x1xf32>
-    %4368 = stablehlo.broadcast_in_dim %4363, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4369 = stablehlo.add %4368, %136 : tensor<1x300x1xf32>
-    %4370 = stablehlo.rsqrt %4369 : tensor<1x300x1xf32>
-    %4371 = stablehlo.broadcast_in_dim %4349, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4372 = stablehlo.broadcast_in_dim %4367, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4373 = stablehlo.subtract %4371, %4372 : tensor<1x300x320xf32>
-    %4374 = stablehlo.broadcast_in_dim %4373, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4375 = stablehlo.broadcast_in_dim %4370, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4376 = stablehlo.multiply %4374, %4375 : tensor<1x300x320xf32>
-    %4377 = stablehlo.convert %arg197 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4378 = stablehlo.broadcast_in_dim %4376, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4379 = stablehlo.broadcast_in_dim %4377, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4380 = stablehlo.multiply %4378, %4379 : tensor<1x300x320xf32>
-    %4381 = stablehlo.convert %arg198 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4382 = stablehlo.broadcast_in_dim %4380, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4383 = stablehlo.broadcast_in_dim %4381, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4384 = stablehlo.add %4382, %4383 : tensor<1x300x320xf32>
-    %4385 = stablehlo.convert %4384 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %4386 = stablehlo.reshape %4385 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %4387 = stablehlo.convert %4386 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %4388 = stablehlo.dot_general %4387, %arg670, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4389 = stablehlo.broadcast_in_dim %4388, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4390 = stablehlo.multiply %4389, %3126 : tensor<300x320xf32>
-    %4391 = stablehlo.broadcast_in_dim %4390, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4392 = stablehlo.broadcast_in_dim %arg671, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4393 = stablehlo.add %4391, %4392 : tensor<300x320xf32>
-    %4394 = stablehlo.convert %4393 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4395 = stablehlo.reshape %4394 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4396 = stablehlo.reshape %4395 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4397 = stablehlo.transpose %4396, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4398 = stablehlo.dot_general %4387, %arg672, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4399 = stablehlo.broadcast_in_dim %4398, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4400 = stablehlo.multiply %4399, %3126 : tensor<300x320xf32>
-    %4401 = stablehlo.broadcast_in_dim %4400, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4402 = stablehlo.broadcast_in_dim %arg673, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4403 = stablehlo.add %4401, %4402 : tensor<300x320xf32>
-    %4404 = stablehlo.convert %4403 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4405 = stablehlo.reshape %4404 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4406 = stablehlo.reshape %4405 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4407 = stablehlo.transpose %4406, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4408 = stablehlo.transpose %4397, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %4409 = stablehlo.reshape %4339 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4410 = stablehlo.reshape %4408 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4411 = stablehlo.broadcast_in_dim %4410, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4412 = stablehlo.dot_general %4409, %4411, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4413 = stablehlo.reshape %4412 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4414 = stablehlo.broadcast_in_dim %4413, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4415 = stablehlo.divide %4414, %3152 : tensor<1x5x1200x300xbf16>
-    %4416 = stablehlo.convert %4415 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %4417 = stablehlo.reduce(%4416 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4418 = stablehlo.reshape %4417 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4419 = stablehlo.broadcast_in_dim %4416, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4420 = stablehlo.broadcast_in_dim %4418, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4421 = stablehlo.subtract %4419, %4420 : tensor<1x5x1200x300xf32>
-    %4422 = stablehlo.exponential %4421 : tensor<1x5x1200x300xf32>
-    %4423 = stablehlo.reduce(%4422 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4424 = stablehlo.reshape %4423 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4425 = stablehlo.broadcast_in_dim %4422, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4426 = stablehlo.broadcast_in_dim %4424, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4427 = stablehlo.divide %4425, %4426 : tensor<1x5x1200x300xf32>
-    %4428 = stablehlo.convert %4427 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %4429 = stablehlo.reshape %4428 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4430 = stablehlo.reshape %4407 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4431 = stablehlo.broadcast_in_dim %4430, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4432 = stablehlo.dot_general %4429, %4431, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4433 = stablehlo.reshape %4432 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4434 = stablehlo.transpose %4433, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4435 = stablehlo.reshape %4434 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %4436 = stablehlo.reshape %4435 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4437 = stablehlo.convert %4436 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4438 = stablehlo.dot_general %4437, %arg674, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4439 = stablehlo.broadcast_in_dim %4438, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4440 = stablehlo.multiply %4439, %3065 : tensor<1200x320xf32>
-    %4441 = stablehlo.broadcast_in_dim %4440, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4442 = stablehlo.broadcast_in_dim %arg675, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4443 = stablehlo.add %4441, %4442 : tensor<1200x320xf32>
-    %4444 = stablehlo.convert %4443 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4445 = stablehlo.reshape %4444 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4446 = stablehlo.add %4445, %4290 : tensor<1x1200x320xbf16>
-    %4447 = stablehlo.convert %4446 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4448 = stablehlo.convert %4447 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4449 = stablehlo.reduce(%4448 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4450 = stablehlo.reshape %4449 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4451 = stablehlo.broadcast_in_dim %4450, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4452 = stablehlo.divide %4451, %2987 : tensor<1x1200x1xf64>
-    %4453 = stablehlo.broadcast_in_dim %4448, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4454 = stablehlo.broadcast_in_dim %4452, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4455 = stablehlo.subtract %4453, %4454 : tensor<1x1200x320xf64>
-    %4456 = stablehlo.multiply %4455, %4455 : tensor<1x1200x320xf64>
-    %4457 = stablehlo.reduce(%4456 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4458 = stablehlo.reshape %4457 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4459 = stablehlo.broadcast_in_dim %4458, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4460 = stablehlo.divide %4459, %2987 : tensor<1x1200x1xf64>
-    %4461 = stablehlo.convert %4460 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4462 = stablehlo.reduce(%4447 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4463 = stablehlo.reshape %4462 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4464 = stablehlo.broadcast_in_dim %4463, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4465 = stablehlo.divide %4464, %3003 : tensor<1x1200x1xf32>
-    %4466 = stablehlo.broadcast_in_dim %4461, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4467 = stablehlo.add %4466, %3006 : tensor<1x1200x1xf32>
-    %4468 = stablehlo.rsqrt %4467 : tensor<1x1200x1xf32>
-    %4469 = stablehlo.broadcast_in_dim %4447, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4470 = stablehlo.broadcast_in_dim %4465, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4471 = stablehlo.subtract %4469, %4470 : tensor<1x1200x320xf32>
-    %4472 = stablehlo.broadcast_in_dim %4471, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4473 = stablehlo.broadcast_in_dim %4468, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4474 = stablehlo.multiply %4472, %4473 : tensor<1x1200x320xf32>
-    %4475 = stablehlo.convert %arg199 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4476 = stablehlo.broadcast_in_dim %4474, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4477 = stablehlo.broadcast_in_dim %4475, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4478 = stablehlo.multiply %4476, %4477 : tensor<1x1200x320xf32>
-    %4479 = stablehlo.convert %arg200 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4480 = stablehlo.broadcast_in_dim %4478, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4481 = stablehlo.broadcast_in_dim %4479, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4482 = stablehlo.add %4480, %4481 : tensor<1x1200x320xf32>
-    %4483 = stablehlo.convert %4482 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4484 = stablehlo.reshape %4483 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4485 = stablehlo.convert %4484 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4486 = stablehlo.dot_general %4485, %arg676, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %4487 = stablehlo.broadcast_in_dim %4486, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4488 = stablehlo.multiply %4487, %3226 : tensor<1200x1280xf32>
-    %4489 = stablehlo.broadcast_in_dim %4488, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4490 = stablehlo.broadcast_in_dim %arg677, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %4491 = stablehlo.add %4489, %4490 : tensor<1200x1280xf32>
-    %4492 = stablehlo.convert %4491 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %4493 = stablehlo.reshape %4492 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %4494 = stablehlo.transpose %4493, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %4495 = stablehlo.reshape %4494 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4496 = stablehlo.convolution(%4495, %arg201) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4497 = stablehlo.reshape %arg202 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %4498 = stablehlo.broadcast_in_dim %4496, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4499 = stablehlo.broadcast_in_dim %4497, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4500 = stablehlo.add %4498, %4499 : tensor<1x1280x30x40xbf16>
-    %4501 = stablehlo.reshape %4500 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %4502 = stablehlo.transpose %4501, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %4503 = stablehlo.multiply %4502, %cst_42 : tensor<1x1200x1280xbf16>
-    %4504 = stablehlo.multiply %4502, %3243 : tensor<1x1200x1280xbf16>
-    %4505 = stablehlo.convert %4504 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %4506 = stablehlo.clamp %cst_43, %4505, %cst_44 : tensor<1x1200x1280xf32>
-    %4507 = stablehlo.multiply %4506, %4506 : tensor<1x1200x1280xf32>
-    %4508 = stablehlo.multiply %cst_45, %4507 : tensor<1x1200x1280xf32>
-    %4509 = stablehlo.add %4508, %cst_46 : tensor<1x1200x1280xf32>
-    %4510 = stablehlo.multiply %4509, %4507 : tensor<1x1200x1280xf32>
-    %4511 = stablehlo.add %4510, %cst_47 : tensor<1x1200x1280xf32>
-    %4512 = stablehlo.multiply %4511, %4507 : tensor<1x1200x1280xf32>
-    %4513 = stablehlo.add %4512, %cst_48 : tensor<1x1200x1280xf32>
-    %4514 = stablehlo.multiply %4513, %4507 : tensor<1x1200x1280xf32>
-    %4515 = stablehlo.add %4514, %cst_49 : tensor<1x1200x1280xf32>
-    %4516 = stablehlo.multiply %4515, %4507 : tensor<1x1200x1280xf32>
-    %4517 = stablehlo.add %4516, %cst_50 : tensor<1x1200x1280xf32>
-    %4518 = stablehlo.multiply %4517, %4507 : tensor<1x1200x1280xf32>
-    %4519 = stablehlo.add %4518, %cst_51 : tensor<1x1200x1280xf32>
-    %4520 = stablehlo.multiply %cst_52, %4507 : tensor<1x1200x1280xf32>
-    %4521 = stablehlo.add %4520, %cst_53 : tensor<1x1200x1280xf32>
-    %4522 = stablehlo.multiply %4521, %4507 : tensor<1x1200x1280xf32>
-    %4523 = stablehlo.add %4522, %cst_54 : tensor<1x1200x1280xf32>
-    %4524 = stablehlo.multiply %4523, %4507 : tensor<1x1200x1280xf32>
-    %4525 = stablehlo.add %4524, %cst_55 : tensor<1x1200x1280xf32>
-    %4526 = stablehlo.multiply %4525, %4507 : tensor<1x1200x1280xf32>
-    %4527 = stablehlo.add %4526, %cst_56 : tensor<1x1200x1280xf32>
-    %4528 = stablehlo.multiply %4506, %4519 : tensor<1x1200x1280xf32>
-    %4529 = stablehlo.divide %4528, %4527 : tensor<1x1200x1280xf32>
-    %4530 = stablehlo.clamp %cst_57, %4529, %cst_58 : tensor<1x1200x1280xf32>
-    %4531 = stablehlo.convert %4530 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %4532 = stablehlo.add %4531, %cst_40 : tensor<1x1200x1280xbf16>
-    %4533 = stablehlo.multiply %4532, %4503 : tensor<1x1200x1280xbf16>
-    %4534 = stablehlo.reshape %4533 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %4535 = stablehlo.dot_general %4534, %arg678, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %4536 = stablehlo.reshape %4535 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4537 = stablehlo.broadcast_in_dim %4536, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4538 = stablehlo.broadcast_in_dim %arg203, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %4539 = stablehlo.add %4537, %4538 : tensor<1x1200x320xbf16>
-    %4540 = stablehlo.reshape %4539 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4541 = stablehlo.reshape %4540 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4542 = stablehlo.add %4541, %4446 : tensor<1x1200x320xbf16>
-    %4543 = stablehlo.convert %4542 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4544 = stablehlo.convert %4543 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4545 = stablehlo.reduce(%4544 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4546 = stablehlo.reshape %4545 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4547 = stablehlo.broadcast_in_dim %4546, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4548 = stablehlo.divide %4547, %2987 : tensor<1x1200x1xf64>
-    %4549 = stablehlo.broadcast_in_dim %4544, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4550 = stablehlo.broadcast_in_dim %4548, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4551 = stablehlo.subtract %4549, %4550 : tensor<1x1200x320xf64>
-    %4552 = stablehlo.multiply %4551, %4551 : tensor<1x1200x320xf64>
-    %4553 = stablehlo.reduce(%4552 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4554 = stablehlo.reshape %4553 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4555 = stablehlo.broadcast_in_dim %4554, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4556 = stablehlo.divide %4555, %2987 : tensor<1x1200x1xf64>
-    %4557 = stablehlo.convert %4556 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4558 = stablehlo.reduce(%4543 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4559 = stablehlo.reshape %4558 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4560 = stablehlo.broadcast_in_dim %4559, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4561 = stablehlo.divide %4560, %3003 : tensor<1x1200x1xf32>
-    %4562 = stablehlo.broadcast_in_dim %4557, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4563 = stablehlo.add %4562, %3006 : tensor<1x1200x1xf32>
-    %4564 = stablehlo.rsqrt %4563 : tensor<1x1200x1xf32>
-    %4565 = stablehlo.broadcast_in_dim %4543, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4566 = stablehlo.broadcast_in_dim %4561, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4567 = stablehlo.subtract %4565, %4566 : tensor<1x1200x320xf32>
-    %4568 = stablehlo.broadcast_in_dim %4567, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4569 = stablehlo.broadcast_in_dim %4564, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4570 = stablehlo.multiply %4568, %4569 : tensor<1x1200x320xf32>
-    %4571 = stablehlo.convert %arg204 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4572 = stablehlo.broadcast_in_dim %4570, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4573 = stablehlo.broadcast_in_dim %4571, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4574 = stablehlo.multiply %4572, %4573 : tensor<1x1200x320xf32>
-    %4575 = stablehlo.convert %arg205 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4576 = stablehlo.broadcast_in_dim %4574, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4577 = stablehlo.broadcast_in_dim %4575, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4578 = stablehlo.add %4576, %4577 : tensor<1x1200x320xf32>
-    %4579 = stablehlo.convert %4578 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4580 = stablehlo.reshape %4579 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4581 = stablehlo.convert %4580 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4582 = stablehlo.dot_general %4581, %arg679, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4583 = stablehlo.broadcast_in_dim %4582, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4584 = stablehlo.multiply %4583, %3065 : tensor<1200x320xf32>
-    %4585 = stablehlo.broadcast_in_dim %4584, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4586 = stablehlo.broadcast_in_dim %arg680, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4587 = stablehlo.add %4585, %4586 : tensor<1200x320xf32>
-    %4588 = stablehlo.convert %4587 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4589 = stablehlo.reshape %4588 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4590 = stablehlo.reshape %4589 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4591 = stablehlo.transpose %4590, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4592 = stablehlo.transpose %4579, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %4593 = stablehlo.reshape %4592 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %4594 = stablehlo.convolution(%4593, %arg206) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %4595 = stablehlo.reshape %arg207 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %4596 = stablehlo.broadcast_in_dim %4594, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %4597 = stablehlo.broadcast_in_dim %4595, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %4598 = stablehlo.add %4596, %4597 : tensor<1x320x15x20xbf16>
-    %4599 = stablehlo.reshape %4598 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %4600 = stablehlo.transpose %4599, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %4601 = stablehlo.convert %4600 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %4602 = stablehlo.convert %4601 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %4603 = stablehlo.reduce(%4602 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4604 = stablehlo.reshape %4603 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4605 = stablehlo.broadcast_in_dim %4604, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4606 = stablehlo.divide %4605, %3088 : tensor<1x300x1xf64>
-    %4607 = stablehlo.broadcast_in_dim %4602, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %4608 = stablehlo.broadcast_in_dim %4606, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %4609 = stablehlo.subtract %4607, %4608 : tensor<1x300x320xf64>
-    %4610 = stablehlo.multiply %4609, %4609 : tensor<1x300x320xf64>
-    %4611 = stablehlo.reduce(%4610 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4612 = stablehlo.reshape %4611 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4613 = stablehlo.broadcast_in_dim %4612, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4614 = stablehlo.divide %4613, %3088 : tensor<1x300x1xf64>
-    %4615 = stablehlo.convert %4614 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %4616 = stablehlo.reduce(%4601 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %4617 = stablehlo.reshape %4616 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %4618 = stablehlo.broadcast_in_dim %4617, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4619 = stablehlo.divide %4618, %3102 : tensor<1x300x1xf32>
-    %4620 = stablehlo.broadcast_in_dim %4615, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4621 = stablehlo.add %4620, %136 : tensor<1x300x1xf32>
-    %4622 = stablehlo.rsqrt %4621 : tensor<1x300x1xf32>
-    %4623 = stablehlo.broadcast_in_dim %4601, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4624 = stablehlo.broadcast_in_dim %4619, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4625 = stablehlo.subtract %4623, %4624 : tensor<1x300x320xf32>
-    %4626 = stablehlo.broadcast_in_dim %4625, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4627 = stablehlo.broadcast_in_dim %4622, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4628 = stablehlo.multiply %4626, %4627 : tensor<1x300x320xf32>
-    %4629 = stablehlo.convert %arg208 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4630 = stablehlo.broadcast_in_dim %4628, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4631 = stablehlo.broadcast_in_dim %4629, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4632 = stablehlo.multiply %4630, %4631 : tensor<1x300x320xf32>
-    %4633 = stablehlo.convert %arg209 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4634 = stablehlo.broadcast_in_dim %4632, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4635 = stablehlo.broadcast_in_dim %4633, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4636 = stablehlo.add %4634, %4635 : tensor<1x300x320xf32>
-    %4637 = stablehlo.convert %4636 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %4638 = stablehlo.reshape %4637 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %4639 = stablehlo.convert %4638 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %4640 = stablehlo.dot_general %4639, %arg681, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4641 = stablehlo.broadcast_in_dim %4640, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4642 = stablehlo.multiply %4641, %3126 : tensor<300x320xf32>
-    %4643 = stablehlo.broadcast_in_dim %4642, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4644 = stablehlo.broadcast_in_dim %arg682, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4645 = stablehlo.add %4643, %4644 : tensor<300x320xf32>
-    %4646 = stablehlo.convert %4645 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4647 = stablehlo.reshape %4646 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4648 = stablehlo.reshape %4647 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4649 = stablehlo.transpose %4648, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4650 = stablehlo.dot_general %4639, %arg683, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4651 = stablehlo.broadcast_in_dim %4650, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4652 = stablehlo.multiply %4651, %3126 : tensor<300x320xf32>
-    %4653 = stablehlo.broadcast_in_dim %4652, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4654 = stablehlo.broadcast_in_dim %arg684, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4655 = stablehlo.add %4653, %4654 : tensor<300x320xf32>
-    %4656 = stablehlo.convert %4655 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4657 = stablehlo.reshape %4656 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4658 = stablehlo.reshape %4657 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4659 = stablehlo.transpose %4658, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4660 = stablehlo.transpose %4649, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %4661 = stablehlo.reshape %4591 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4662 = stablehlo.reshape %4660 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4663 = stablehlo.broadcast_in_dim %4662, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4664 = stablehlo.dot_general %4661, %4663, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4665 = stablehlo.reshape %4664 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4666 = stablehlo.broadcast_in_dim %4665, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4667 = stablehlo.divide %4666, %3152 : tensor<1x5x1200x300xbf16>
-    %4668 = stablehlo.convert %4667 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %4669 = stablehlo.reduce(%4668 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4670 = stablehlo.reshape %4669 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4671 = stablehlo.broadcast_in_dim %4668, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4672 = stablehlo.broadcast_in_dim %4670, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4673 = stablehlo.subtract %4671, %4672 : tensor<1x5x1200x300xf32>
-    %4674 = stablehlo.exponential %4673 : tensor<1x5x1200x300xf32>
-    %4675 = stablehlo.reduce(%4674 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4676 = stablehlo.reshape %4675 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4677 = stablehlo.broadcast_in_dim %4674, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4678 = stablehlo.broadcast_in_dim %4676, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4679 = stablehlo.divide %4677, %4678 : tensor<1x5x1200x300xf32>
-    %4680 = stablehlo.convert %4679 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %4681 = stablehlo.reshape %4680 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4682 = stablehlo.reshape %4659 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4683 = stablehlo.broadcast_in_dim %4682, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4684 = stablehlo.dot_general %4681, %4683, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4685 = stablehlo.reshape %4684 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4686 = stablehlo.transpose %4685, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4687 = stablehlo.reshape %4686 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %4688 = stablehlo.reshape %4687 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4689 = stablehlo.convert %4688 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4690 = stablehlo.dot_general %4689, %arg685, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4691 = stablehlo.broadcast_in_dim %4690, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4692 = stablehlo.multiply %4691, %3065 : tensor<1200x320xf32>
-    %4693 = stablehlo.broadcast_in_dim %4692, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4694 = stablehlo.broadcast_in_dim %arg686, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4695 = stablehlo.add %4693, %4694 : tensor<1200x320xf32>
-    %4696 = stablehlo.convert %4695 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4697 = stablehlo.reshape %4696 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4698 = stablehlo.add %4697, %4542 : tensor<1x1200x320xbf16>
-    %4699 = stablehlo.convert %4698 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4700 = stablehlo.convert %4699 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4701 = stablehlo.reduce(%4700 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4702 = stablehlo.reshape %4701 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4703 = stablehlo.broadcast_in_dim %4702, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4704 = stablehlo.divide %4703, %2987 : tensor<1x1200x1xf64>
-    %4705 = stablehlo.broadcast_in_dim %4700, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4706 = stablehlo.broadcast_in_dim %4704, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4707 = stablehlo.subtract %4705, %4706 : tensor<1x1200x320xf64>
-    %4708 = stablehlo.multiply %4707, %4707 : tensor<1x1200x320xf64>
-    %4709 = stablehlo.reduce(%4708 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4710 = stablehlo.reshape %4709 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4711 = stablehlo.broadcast_in_dim %4710, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4712 = stablehlo.divide %4711, %2987 : tensor<1x1200x1xf64>
-    %4713 = stablehlo.convert %4712 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4714 = stablehlo.reduce(%4699 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4715 = stablehlo.reshape %4714 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4716 = stablehlo.broadcast_in_dim %4715, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4717 = stablehlo.divide %4716, %3003 : tensor<1x1200x1xf32>
-    %4718 = stablehlo.broadcast_in_dim %4713, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4719 = stablehlo.add %4718, %3006 : tensor<1x1200x1xf32>
-    %4720 = stablehlo.rsqrt %4719 : tensor<1x1200x1xf32>
-    %4721 = stablehlo.broadcast_in_dim %4699, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4722 = stablehlo.broadcast_in_dim %4717, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4723 = stablehlo.subtract %4721, %4722 : tensor<1x1200x320xf32>
-    %4724 = stablehlo.broadcast_in_dim %4723, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4725 = stablehlo.broadcast_in_dim %4720, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4726 = stablehlo.multiply %4724, %4725 : tensor<1x1200x320xf32>
-    %4727 = stablehlo.convert %arg210 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4728 = stablehlo.broadcast_in_dim %4726, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4729 = stablehlo.broadcast_in_dim %4727, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4730 = stablehlo.multiply %4728, %4729 : tensor<1x1200x320xf32>
-    %4731 = stablehlo.convert %arg211 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4732 = stablehlo.broadcast_in_dim %4730, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4733 = stablehlo.broadcast_in_dim %4731, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4734 = stablehlo.add %4732, %4733 : tensor<1x1200x320xf32>
-    %4735 = stablehlo.convert %4734 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4736 = stablehlo.reshape %4735 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4737 = stablehlo.convert %4736 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4738 = stablehlo.dot_general %4737, %arg687, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %4739 = stablehlo.broadcast_in_dim %4738, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4740 = stablehlo.multiply %4739, %3226 : tensor<1200x1280xf32>
-    %4741 = stablehlo.broadcast_in_dim %4740, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4742 = stablehlo.broadcast_in_dim %arg688, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %4743 = stablehlo.add %4741, %4742 : tensor<1200x1280xf32>
-    %4744 = stablehlo.convert %4743 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %4745 = stablehlo.reshape %4744 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %4746 = stablehlo.transpose %4745, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %4747 = stablehlo.reshape %4746 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4748 = stablehlo.convolution(%4747, %arg212) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4749 = stablehlo.reshape %arg213 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %4750 = stablehlo.broadcast_in_dim %4748, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4751 = stablehlo.broadcast_in_dim %4749, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %4752 = stablehlo.add %4750, %4751 : tensor<1x1280x30x40xbf16>
-    %4753 = stablehlo.reshape %4752 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %4754 = stablehlo.transpose %4753, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %4755 = stablehlo.multiply %4754, %cst_42 : tensor<1x1200x1280xbf16>
-    %4756 = stablehlo.multiply %4754, %3243 : tensor<1x1200x1280xbf16>
-    %4757 = stablehlo.convert %4756 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %4758 = stablehlo.clamp %cst_43, %4757, %cst_44 : tensor<1x1200x1280xf32>
-    %4759 = stablehlo.multiply %4758, %4758 : tensor<1x1200x1280xf32>
-    %4760 = stablehlo.multiply %cst_45, %4759 : tensor<1x1200x1280xf32>
-    %4761 = stablehlo.add %4760, %cst_46 : tensor<1x1200x1280xf32>
-    %4762 = stablehlo.multiply %4761, %4759 : tensor<1x1200x1280xf32>
-    %4763 = stablehlo.add %4762, %cst_47 : tensor<1x1200x1280xf32>
-    %4764 = stablehlo.multiply %4763, %4759 : tensor<1x1200x1280xf32>
-    %4765 = stablehlo.add %4764, %cst_48 : tensor<1x1200x1280xf32>
-    %4766 = stablehlo.multiply %4765, %4759 : tensor<1x1200x1280xf32>
-    %4767 = stablehlo.add %4766, %cst_49 : tensor<1x1200x1280xf32>
-    %4768 = stablehlo.multiply %4767, %4759 : tensor<1x1200x1280xf32>
-    %4769 = stablehlo.add %4768, %cst_50 : tensor<1x1200x1280xf32>
-    %4770 = stablehlo.multiply %4769, %4759 : tensor<1x1200x1280xf32>
-    %4771 = stablehlo.add %4770, %cst_51 : tensor<1x1200x1280xf32>
-    %4772 = stablehlo.multiply %cst_52, %4759 : tensor<1x1200x1280xf32>
-    %4773 = stablehlo.add %4772, %cst_53 : tensor<1x1200x1280xf32>
-    %4774 = stablehlo.multiply %4773, %4759 : tensor<1x1200x1280xf32>
-    %4775 = stablehlo.add %4774, %cst_54 : tensor<1x1200x1280xf32>
-    %4776 = stablehlo.multiply %4775, %4759 : tensor<1x1200x1280xf32>
-    %4777 = stablehlo.add %4776, %cst_55 : tensor<1x1200x1280xf32>
-    %4778 = stablehlo.multiply %4777, %4759 : tensor<1x1200x1280xf32>
-    %4779 = stablehlo.add %4778, %cst_56 : tensor<1x1200x1280xf32>
-    %4780 = stablehlo.multiply %4758, %4771 : tensor<1x1200x1280xf32>
-    %4781 = stablehlo.divide %4780, %4779 : tensor<1x1200x1280xf32>
-    %4782 = stablehlo.clamp %cst_57, %4781, %cst_58 : tensor<1x1200x1280xf32>
-    %4783 = stablehlo.convert %4782 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %4784 = stablehlo.add %4783, %cst_40 : tensor<1x1200x1280xbf16>
-    %4785 = stablehlo.multiply %4784, %4755 : tensor<1x1200x1280xbf16>
-    %4786 = stablehlo.reshape %4785 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %4787 = stablehlo.dot_general %4786, %arg689, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %4788 = stablehlo.reshape %4787 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4789 = stablehlo.broadcast_in_dim %4788, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4790 = stablehlo.broadcast_in_dim %arg214, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %4791 = stablehlo.add %4789, %4790 : tensor<1x1200x320xbf16>
-    %4792 = stablehlo.reshape %4791 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4793 = stablehlo.reshape %4792 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4794 = stablehlo.add %4793, %4698 : tensor<1x1200x320xbf16>
-    %4795 = stablehlo.convert %4794 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4796 = stablehlo.convert %4795 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4797 = stablehlo.reduce(%4796 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4798 = stablehlo.reshape %4797 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4799 = stablehlo.broadcast_in_dim %4798, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4800 = stablehlo.divide %4799, %2987 : tensor<1x1200x1xf64>
-    %4801 = stablehlo.broadcast_in_dim %4796, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4802 = stablehlo.broadcast_in_dim %4800, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4803 = stablehlo.subtract %4801, %4802 : tensor<1x1200x320xf64>
-    %4804 = stablehlo.multiply %4803, %4803 : tensor<1x1200x320xf64>
-    %4805 = stablehlo.reduce(%4804 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4806 = stablehlo.reshape %4805 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4807 = stablehlo.broadcast_in_dim %4806, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4808 = stablehlo.divide %4807, %2987 : tensor<1x1200x1xf64>
-    %4809 = stablehlo.convert %4808 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4810 = stablehlo.reduce(%4795 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4811 = stablehlo.reshape %4810 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4812 = stablehlo.broadcast_in_dim %4811, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4813 = stablehlo.divide %4812, %3003 : tensor<1x1200x1xf32>
-    %4814 = stablehlo.broadcast_in_dim %4809, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4815 = stablehlo.add %4814, %3006 : tensor<1x1200x1xf32>
-    %4816 = stablehlo.rsqrt %4815 : tensor<1x1200x1xf32>
-    %4817 = stablehlo.broadcast_in_dim %4795, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4818 = stablehlo.broadcast_in_dim %4813, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4819 = stablehlo.subtract %4817, %4818 : tensor<1x1200x320xf32>
-    %4820 = stablehlo.broadcast_in_dim %4819, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4821 = stablehlo.broadcast_in_dim %4816, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4822 = stablehlo.multiply %4820, %4821 : tensor<1x1200x320xf32>
-    %4823 = stablehlo.convert %arg215 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4824 = stablehlo.broadcast_in_dim %4822, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4825 = stablehlo.broadcast_in_dim %4823, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4826 = stablehlo.multiply %4824, %4825 : tensor<1x1200x320xf32>
-    %4827 = stablehlo.convert %arg216 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4828 = stablehlo.broadcast_in_dim %4826, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4829 = stablehlo.broadcast_in_dim %4827, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4830 = stablehlo.add %4828, %4829 : tensor<1x1200x320xf32>
-    %4831 = stablehlo.convert %4830 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4832 = stablehlo.reshape %4831 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4833 = stablehlo.convert %4832 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4834 = stablehlo.dot_general %4833, %arg690, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4835 = stablehlo.broadcast_in_dim %4834, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4836 = stablehlo.multiply %4835, %3065 : tensor<1200x320xf32>
-    %4837 = stablehlo.broadcast_in_dim %4836, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4838 = stablehlo.broadcast_in_dim %arg691, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4839 = stablehlo.add %4837, %4838 : tensor<1200x320xf32>
-    %4840 = stablehlo.convert %4839 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4841 = stablehlo.reshape %4840 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4842 = stablehlo.reshape %4841 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4843 = stablehlo.transpose %4842, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4844 = stablehlo.transpose %4831, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %4845 = stablehlo.reshape %4844 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %4846 = stablehlo.convolution(%4845, %arg217) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %4847 = stablehlo.reshape %arg218 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %4848 = stablehlo.broadcast_in_dim %4846, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %4849 = stablehlo.broadcast_in_dim %4847, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %4850 = stablehlo.add %4848, %4849 : tensor<1x320x15x20xbf16>
-    %4851 = stablehlo.reshape %4850 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %4852 = stablehlo.transpose %4851, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %4853 = stablehlo.convert %4852 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %4854 = stablehlo.convert %4853 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %4855 = stablehlo.reduce(%4854 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4856 = stablehlo.reshape %4855 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4857 = stablehlo.broadcast_in_dim %4856, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4858 = stablehlo.divide %4857, %3088 : tensor<1x300x1xf64>
-    %4859 = stablehlo.broadcast_in_dim %4854, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %4860 = stablehlo.broadcast_in_dim %4858, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %4861 = stablehlo.subtract %4859, %4860 : tensor<1x300x320xf64>
-    %4862 = stablehlo.multiply %4861, %4861 : tensor<1x300x320xf64>
-    %4863 = stablehlo.reduce(%4862 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %4864 = stablehlo.reshape %4863 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %4865 = stablehlo.broadcast_in_dim %4864, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %4866 = stablehlo.divide %4865, %3088 : tensor<1x300x1xf64>
-    %4867 = stablehlo.convert %4866 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %4868 = stablehlo.reduce(%4853 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %4869 = stablehlo.reshape %4868 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %4870 = stablehlo.broadcast_in_dim %4869, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4871 = stablehlo.divide %4870, %3102 : tensor<1x300x1xf32>
-    %4872 = stablehlo.broadcast_in_dim %4867, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %4873 = stablehlo.add %4872, %136 : tensor<1x300x1xf32>
-    %4874 = stablehlo.rsqrt %4873 : tensor<1x300x1xf32>
-    %4875 = stablehlo.broadcast_in_dim %4853, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4876 = stablehlo.broadcast_in_dim %4871, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4877 = stablehlo.subtract %4875, %4876 : tensor<1x300x320xf32>
-    %4878 = stablehlo.broadcast_in_dim %4877, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4879 = stablehlo.broadcast_in_dim %4874, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %4880 = stablehlo.multiply %4878, %4879 : tensor<1x300x320xf32>
-    %4881 = stablehlo.convert %arg219 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4882 = stablehlo.broadcast_in_dim %4880, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4883 = stablehlo.broadcast_in_dim %4881, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4884 = stablehlo.multiply %4882, %4883 : tensor<1x300x320xf32>
-    %4885 = stablehlo.convert %arg220 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4886 = stablehlo.broadcast_in_dim %4884, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %4887 = stablehlo.broadcast_in_dim %4885, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %4888 = stablehlo.add %4886, %4887 : tensor<1x300x320xf32>
-    %4889 = stablehlo.convert %4888 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %4890 = stablehlo.reshape %4889 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %4891 = stablehlo.convert %4890 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %4892 = stablehlo.dot_general %4891, %arg692, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4893 = stablehlo.broadcast_in_dim %4892, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4894 = stablehlo.multiply %4893, %3126 : tensor<300x320xf32>
-    %4895 = stablehlo.broadcast_in_dim %4894, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4896 = stablehlo.broadcast_in_dim %arg693, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4897 = stablehlo.add %4895, %4896 : tensor<300x320xf32>
-    %4898 = stablehlo.convert %4897 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4899 = stablehlo.reshape %4898 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4900 = stablehlo.reshape %4899 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4901 = stablehlo.transpose %4900, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4902 = stablehlo.dot_general %4891, %arg694, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %4903 = stablehlo.broadcast_in_dim %4902, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4904 = stablehlo.multiply %4903, %3126 : tensor<300x320xf32>
-    %4905 = stablehlo.broadcast_in_dim %4904, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %4906 = stablehlo.broadcast_in_dim %arg695, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %4907 = stablehlo.add %4905, %4906 : tensor<300x320xf32>
-    %4908 = stablehlo.convert %4907 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %4909 = stablehlo.reshape %4908 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %4910 = stablehlo.reshape %4909 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %4911 = stablehlo.transpose %4910, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %4912 = stablehlo.transpose %4901, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %4913 = stablehlo.reshape %4843 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4914 = stablehlo.reshape %4912 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4915 = stablehlo.broadcast_in_dim %4914, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %4916 = stablehlo.dot_general %4913, %4915, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4917 = stablehlo.reshape %4916 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4918 = stablehlo.broadcast_in_dim %4917, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %4919 = stablehlo.divide %4918, %3152 : tensor<1x5x1200x300xbf16>
-    %4920 = stablehlo.convert %4919 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %4921 = stablehlo.reduce(%4920 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4922 = stablehlo.reshape %4921 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4923 = stablehlo.broadcast_in_dim %4920, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4924 = stablehlo.broadcast_in_dim %4922, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4925 = stablehlo.subtract %4923, %4924 : tensor<1x5x1200x300xf32>
-    %4926 = stablehlo.exponential %4925 : tensor<1x5x1200x300xf32>
-    %4927 = stablehlo.reduce(%4926 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %4928 = stablehlo.reshape %4927 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %4929 = stablehlo.broadcast_in_dim %4926, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %4930 = stablehlo.broadcast_in_dim %4928, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %4931 = stablehlo.divide %4929, %4930 : tensor<1x5x1200x300xf32>
-    %4932 = stablehlo.convert %4931 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %4933 = stablehlo.reshape %4932 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %4934 = stablehlo.reshape %4911 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4935 = stablehlo.broadcast_in_dim %4934, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %4936 = stablehlo.dot_general %4933, %4935, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %4937 = stablehlo.reshape %4936 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %4938 = stablehlo.transpose %4937, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %4939 = stablehlo.reshape %4938 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %4940 = stablehlo.reshape %4939 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4941 = stablehlo.convert %4940 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4942 = stablehlo.dot_general %4941, %arg696, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %4943 = stablehlo.broadcast_in_dim %4942, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4944 = stablehlo.multiply %4943, %3065 : tensor<1200x320xf32>
-    %4945 = stablehlo.broadcast_in_dim %4944, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %4946 = stablehlo.broadcast_in_dim %arg697, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %4947 = stablehlo.add %4945, %4946 : tensor<1200x320xf32>
-    %4948 = stablehlo.convert %4947 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %4949 = stablehlo.reshape %4948 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %4950 = stablehlo.add %4949, %4794 : tensor<1x1200x320xbf16>
-    %4951 = stablehlo.convert %4950 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %4952 = stablehlo.convert %4951 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %4953 = stablehlo.reduce(%4952 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4954 = stablehlo.reshape %4953 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4955 = stablehlo.broadcast_in_dim %4954, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4956 = stablehlo.divide %4955, %2987 : tensor<1x1200x1xf64>
-    %4957 = stablehlo.broadcast_in_dim %4952, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %4958 = stablehlo.broadcast_in_dim %4956, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %4959 = stablehlo.subtract %4957, %4958 : tensor<1x1200x320xf64>
-    %4960 = stablehlo.multiply %4959, %4959 : tensor<1x1200x320xf64>
-    %4961 = stablehlo.reduce(%4960 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %4962 = stablehlo.reshape %4961 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %4963 = stablehlo.broadcast_in_dim %4962, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %4964 = stablehlo.divide %4963, %2987 : tensor<1x1200x1xf64>
-    %4965 = stablehlo.convert %4964 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %4966 = stablehlo.reduce(%4951 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %4967 = stablehlo.reshape %4966 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %4968 = stablehlo.broadcast_in_dim %4967, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4969 = stablehlo.divide %4968, %3003 : tensor<1x1200x1xf32>
-    %4970 = stablehlo.broadcast_in_dim %4965, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %4971 = stablehlo.add %4970, %3006 : tensor<1x1200x1xf32>
-    %4972 = stablehlo.rsqrt %4971 : tensor<1x1200x1xf32>
-    %4973 = stablehlo.broadcast_in_dim %4951, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4974 = stablehlo.broadcast_in_dim %4969, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4975 = stablehlo.subtract %4973, %4974 : tensor<1x1200x320xf32>
-    %4976 = stablehlo.broadcast_in_dim %4975, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4977 = stablehlo.broadcast_in_dim %4972, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %4978 = stablehlo.multiply %4976, %4977 : tensor<1x1200x320xf32>
-    %4979 = stablehlo.convert %arg221 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4980 = stablehlo.broadcast_in_dim %4978, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4981 = stablehlo.broadcast_in_dim %4979, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4982 = stablehlo.multiply %4980, %4981 : tensor<1x1200x320xf32>
-    %4983 = stablehlo.convert %arg222 : (tensor<320xbf16>) -> tensor<320xf32>
-    %4984 = stablehlo.broadcast_in_dim %4982, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %4985 = stablehlo.broadcast_in_dim %4983, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %4986 = stablehlo.add %4984, %4985 : tensor<1x1200x320xf32>
-    %4987 = stablehlo.convert %4986 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %4988 = stablehlo.reshape %4987 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %4989 = stablehlo.convert %4988 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %4990 = stablehlo.dot_general %4989, %arg698, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %4991 = stablehlo.broadcast_in_dim %4990, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4992 = stablehlo.multiply %4991, %3226 : tensor<1200x1280xf32>
-    %4993 = stablehlo.broadcast_in_dim %4992, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %4994 = stablehlo.broadcast_in_dim %arg699, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %4995 = stablehlo.add %4993, %4994 : tensor<1200x1280xf32>
-    %4996 = stablehlo.convert %4995 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %4997 = stablehlo.reshape %4996 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %4998 = stablehlo.transpose %4997, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %4999 = stablehlo.reshape %4998 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5000 = stablehlo.convolution(%4999, %arg223) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5001 = stablehlo.reshape %arg224 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %5002 = stablehlo.broadcast_in_dim %5000, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5003 = stablehlo.broadcast_in_dim %5001, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5004 = stablehlo.add %5002, %5003 : tensor<1x1280x30x40xbf16>
-    %5005 = stablehlo.reshape %5004 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %5006 = stablehlo.transpose %5005, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %5007 = stablehlo.multiply %5006, %cst_42 : tensor<1x1200x1280xbf16>
-    %5008 = stablehlo.multiply %5006, %3243 : tensor<1x1200x1280xbf16>
-    %5009 = stablehlo.convert %5008 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %5010 = stablehlo.clamp %cst_43, %5009, %cst_44 : tensor<1x1200x1280xf32>
-    %5011 = stablehlo.multiply %5010, %5010 : tensor<1x1200x1280xf32>
-    %5012 = stablehlo.multiply %cst_45, %5011 : tensor<1x1200x1280xf32>
-    %5013 = stablehlo.add %5012, %cst_46 : tensor<1x1200x1280xf32>
-    %5014 = stablehlo.multiply %5013, %5011 : tensor<1x1200x1280xf32>
-    %5015 = stablehlo.add %5014, %cst_47 : tensor<1x1200x1280xf32>
-    %5016 = stablehlo.multiply %5015, %5011 : tensor<1x1200x1280xf32>
-    %5017 = stablehlo.add %5016, %cst_48 : tensor<1x1200x1280xf32>
-    %5018 = stablehlo.multiply %5017, %5011 : tensor<1x1200x1280xf32>
-    %5019 = stablehlo.add %5018, %cst_49 : tensor<1x1200x1280xf32>
-    %5020 = stablehlo.multiply %5019, %5011 : tensor<1x1200x1280xf32>
-    %5021 = stablehlo.add %5020, %cst_50 : tensor<1x1200x1280xf32>
-    %5022 = stablehlo.multiply %5021, %5011 : tensor<1x1200x1280xf32>
-    %5023 = stablehlo.add %5022, %cst_51 : tensor<1x1200x1280xf32>
-    %5024 = stablehlo.multiply %cst_52, %5011 : tensor<1x1200x1280xf32>
-    %5025 = stablehlo.add %5024, %cst_53 : tensor<1x1200x1280xf32>
-    %5026 = stablehlo.multiply %5025, %5011 : tensor<1x1200x1280xf32>
-    %5027 = stablehlo.add %5026, %cst_54 : tensor<1x1200x1280xf32>
-    %5028 = stablehlo.multiply %5027, %5011 : tensor<1x1200x1280xf32>
-    %5029 = stablehlo.add %5028, %cst_55 : tensor<1x1200x1280xf32>
-    %5030 = stablehlo.multiply %5029, %5011 : tensor<1x1200x1280xf32>
-    %5031 = stablehlo.add %5030, %cst_56 : tensor<1x1200x1280xf32>
-    %5032 = stablehlo.multiply %5010, %5023 : tensor<1x1200x1280xf32>
-    %5033 = stablehlo.divide %5032, %5031 : tensor<1x1200x1280xf32>
-    %5034 = stablehlo.clamp %cst_57, %5033, %cst_58 : tensor<1x1200x1280xf32>
-    %5035 = stablehlo.convert %5034 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %5036 = stablehlo.add %5035, %cst_40 : tensor<1x1200x1280xbf16>
-    %5037 = stablehlo.multiply %5036, %5007 : tensor<1x1200x1280xbf16>
-    %5038 = stablehlo.reshape %5037 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %5039 = stablehlo.dot_general %5038, %arg700, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %5040 = stablehlo.reshape %5039 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5041 = stablehlo.broadcast_in_dim %5040, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5042 = stablehlo.broadcast_in_dim %arg225, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %5043 = stablehlo.add %5041, %5042 : tensor<1x1200x320xbf16>
-    %5044 = stablehlo.reshape %5043 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5045 = stablehlo.reshape %5044 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5046 = stablehlo.add %5045, %4950 : tensor<1x1200x320xbf16>
-    %5047 = stablehlo.convert %5046 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5048 = stablehlo.convert %5047 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5049 = stablehlo.reduce(%5048 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5050 = stablehlo.reshape %5049 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5051 = stablehlo.broadcast_in_dim %5050, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5052 = stablehlo.divide %5051, %2987 : tensor<1x1200x1xf64>
-    %5053 = stablehlo.broadcast_in_dim %5048, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5054 = stablehlo.broadcast_in_dim %5052, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5055 = stablehlo.subtract %5053, %5054 : tensor<1x1200x320xf64>
-    %5056 = stablehlo.multiply %5055, %5055 : tensor<1x1200x320xf64>
-    %5057 = stablehlo.reduce(%5056 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5058 = stablehlo.reshape %5057 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5059 = stablehlo.broadcast_in_dim %5058, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5060 = stablehlo.divide %5059, %2987 : tensor<1x1200x1xf64>
-    %5061 = stablehlo.convert %5060 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5062 = stablehlo.reduce(%5047 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5063 = stablehlo.reshape %5062 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5064 = stablehlo.broadcast_in_dim %5063, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5065 = stablehlo.divide %5064, %3003 : tensor<1x1200x1xf32>
-    %5066 = stablehlo.broadcast_in_dim %5061, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5067 = stablehlo.add %5066, %3006 : tensor<1x1200x1xf32>
-    %5068 = stablehlo.rsqrt %5067 : tensor<1x1200x1xf32>
-    %5069 = stablehlo.broadcast_in_dim %5047, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5070 = stablehlo.broadcast_in_dim %5065, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5071 = stablehlo.subtract %5069, %5070 : tensor<1x1200x320xf32>
-    %5072 = stablehlo.broadcast_in_dim %5071, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5073 = stablehlo.broadcast_in_dim %5068, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5074 = stablehlo.multiply %5072, %5073 : tensor<1x1200x320xf32>
-    %5075 = stablehlo.convert %arg226 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5076 = stablehlo.broadcast_in_dim %5074, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5077 = stablehlo.broadcast_in_dim %5075, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5078 = stablehlo.multiply %5076, %5077 : tensor<1x1200x320xf32>
-    %5079 = stablehlo.convert %arg227 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5080 = stablehlo.broadcast_in_dim %5078, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5081 = stablehlo.broadcast_in_dim %5079, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5082 = stablehlo.add %5080, %5081 : tensor<1x1200x320xf32>
-    %5083 = stablehlo.convert %5082 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5084 = stablehlo.reshape %5083 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5085 = stablehlo.convert %5084 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5086 = stablehlo.dot_general %5085, %arg701, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5087 = stablehlo.broadcast_in_dim %5086, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5088 = stablehlo.multiply %5087, %3065 : tensor<1200x320xf32>
-    %5089 = stablehlo.broadcast_in_dim %5088, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5090 = stablehlo.broadcast_in_dim %arg702, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5091 = stablehlo.add %5089, %5090 : tensor<1200x320xf32>
-    %5092 = stablehlo.convert %5091 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5093 = stablehlo.reshape %5092 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5094 = stablehlo.reshape %5093 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5095 = stablehlo.transpose %5094, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5096 = stablehlo.transpose %5083, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %5097 = stablehlo.reshape %5096 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %5098 = stablehlo.convolution(%5097, %arg228) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %5099 = stablehlo.reshape %arg229 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %5100 = stablehlo.broadcast_in_dim %5098, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %5101 = stablehlo.broadcast_in_dim %5099, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %5102 = stablehlo.add %5100, %5101 : tensor<1x320x15x20xbf16>
-    %5103 = stablehlo.reshape %5102 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %5104 = stablehlo.transpose %5103, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %5105 = stablehlo.convert %5104 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %5106 = stablehlo.convert %5105 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %5107 = stablehlo.reduce(%5106 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5108 = stablehlo.reshape %5107 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5109 = stablehlo.broadcast_in_dim %5108, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5110 = stablehlo.divide %5109, %3088 : tensor<1x300x1xf64>
-    %5111 = stablehlo.broadcast_in_dim %5106, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %5112 = stablehlo.broadcast_in_dim %5110, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %5113 = stablehlo.subtract %5111, %5112 : tensor<1x300x320xf64>
-    %5114 = stablehlo.multiply %5113, %5113 : tensor<1x300x320xf64>
-    %5115 = stablehlo.reduce(%5114 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5116 = stablehlo.reshape %5115 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5117 = stablehlo.broadcast_in_dim %5116, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5118 = stablehlo.divide %5117, %3088 : tensor<1x300x1xf64>
-    %5119 = stablehlo.convert %5118 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %5120 = stablehlo.reduce(%5105 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %5121 = stablehlo.reshape %5120 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %5122 = stablehlo.broadcast_in_dim %5121, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5123 = stablehlo.divide %5122, %3102 : tensor<1x300x1xf32>
-    %5124 = stablehlo.broadcast_in_dim %5119, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5125 = stablehlo.add %5124, %136 : tensor<1x300x1xf32>
-    %5126 = stablehlo.rsqrt %5125 : tensor<1x300x1xf32>
-    %5127 = stablehlo.broadcast_in_dim %5105, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5128 = stablehlo.broadcast_in_dim %5123, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5129 = stablehlo.subtract %5127, %5128 : tensor<1x300x320xf32>
-    %5130 = stablehlo.broadcast_in_dim %5129, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5131 = stablehlo.broadcast_in_dim %5126, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5132 = stablehlo.multiply %5130, %5131 : tensor<1x300x320xf32>
-    %5133 = stablehlo.convert %arg230 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5134 = stablehlo.broadcast_in_dim %5132, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5135 = stablehlo.broadcast_in_dim %5133, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5136 = stablehlo.multiply %5134, %5135 : tensor<1x300x320xf32>
-    %5137 = stablehlo.convert %arg231 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5138 = stablehlo.broadcast_in_dim %5136, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5139 = stablehlo.broadcast_in_dim %5137, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5140 = stablehlo.add %5138, %5139 : tensor<1x300x320xf32>
-    %5141 = stablehlo.convert %5140 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %5142 = stablehlo.reshape %5141 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %5143 = stablehlo.convert %5142 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %5144 = stablehlo.dot_general %5143, %arg703, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5145 = stablehlo.broadcast_in_dim %5144, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5146 = stablehlo.multiply %5145, %3126 : tensor<300x320xf32>
-    %5147 = stablehlo.broadcast_in_dim %5146, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5148 = stablehlo.broadcast_in_dim %arg704, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5149 = stablehlo.add %5147, %5148 : tensor<300x320xf32>
-    %5150 = stablehlo.convert %5149 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5151 = stablehlo.reshape %5150 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5152 = stablehlo.reshape %5151 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5153 = stablehlo.transpose %5152, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5154 = stablehlo.dot_general %5143, %arg705, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5155 = stablehlo.broadcast_in_dim %5154, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5156 = stablehlo.multiply %5155, %3126 : tensor<300x320xf32>
-    %5157 = stablehlo.broadcast_in_dim %5156, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5158 = stablehlo.broadcast_in_dim %arg706, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5159 = stablehlo.add %5157, %5158 : tensor<300x320xf32>
-    %5160 = stablehlo.convert %5159 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5161 = stablehlo.reshape %5160 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5162 = stablehlo.reshape %5161 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5163 = stablehlo.transpose %5162, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5164 = stablehlo.transpose %5153, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %5165 = stablehlo.reshape %5095 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5166 = stablehlo.reshape %5164 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5167 = stablehlo.broadcast_in_dim %5166, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5168 = stablehlo.dot_general %5165, %5167, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5169 = stablehlo.reshape %5168 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5170 = stablehlo.broadcast_in_dim %5169, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5171 = stablehlo.divide %5170, %3152 : tensor<1x5x1200x300xbf16>
-    %5172 = stablehlo.convert %5171 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %5173 = stablehlo.reduce(%5172 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5174 = stablehlo.reshape %5173 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5175 = stablehlo.broadcast_in_dim %5172, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5176 = stablehlo.broadcast_in_dim %5174, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5177 = stablehlo.subtract %5175, %5176 : tensor<1x5x1200x300xf32>
-    %5178 = stablehlo.exponential %5177 : tensor<1x5x1200x300xf32>
-    %5179 = stablehlo.reduce(%5178 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5180 = stablehlo.reshape %5179 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5181 = stablehlo.broadcast_in_dim %5178, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5182 = stablehlo.broadcast_in_dim %5180, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5183 = stablehlo.divide %5181, %5182 : tensor<1x5x1200x300xf32>
-    %5184 = stablehlo.convert %5183 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %5185 = stablehlo.reshape %5184 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5186 = stablehlo.reshape %5163 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5187 = stablehlo.broadcast_in_dim %5186, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5188 = stablehlo.dot_general %5185, %5187, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5189 = stablehlo.reshape %5188 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5190 = stablehlo.transpose %5189, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5191 = stablehlo.reshape %5190 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %5192 = stablehlo.reshape %5191 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5193 = stablehlo.convert %5192 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5194 = stablehlo.dot_general %5193, %arg707, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5195 = stablehlo.broadcast_in_dim %5194, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5196 = stablehlo.multiply %5195, %3065 : tensor<1200x320xf32>
-    %5197 = stablehlo.broadcast_in_dim %5196, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5198 = stablehlo.broadcast_in_dim %arg708, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5199 = stablehlo.add %5197, %5198 : tensor<1200x320xf32>
-    %5200 = stablehlo.convert %5199 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5201 = stablehlo.reshape %5200 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5202 = stablehlo.add %5201, %5046 : tensor<1x1200x320xbf16>
-    %5203 = stablehlo.convert %5202 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5204 = stablehlo.convert %5203 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5205 = stablehlo.reduce(%5204 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5206 = stablehlo.reshape %5205 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5207 = stablehlo.broadcast_in_dim %5206, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5208 = stablehlo.divide %5207, %2987 : tensor<1x1200x1xf64>
-    %5209 = stablehlo.broadcast_in_dim %5204, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5210 = stablehlo.broadcast_in_dim %5208, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5211 = stablehlo.subtract %5209, %5210 : tensor<1x1200x320xf64>
-    %5212 = stablehlo.multiply %5211, %5211 : tensor<1x1200x320xf64>
-    %5213 = stablehlo.reduce(%5212 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5214 = stablehlo.reshape %5213 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5215 = stablehlo.broadcast_in_dim %5214, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5216 = stablehlo.divide %5215, %2987 : tensor<1x1200x1xf64>
-    %5217 = stablehlo.convert %5216 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5218 = stablehlo.reduce(%5203 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5219 = stablehlo.reshape %5218 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5220 = stablehlo.broadcast_in_dim %5219, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5221 = stablehlo.divide %5220, %3003 : tensor<1x1200x1xf32>
-    %5222 = stablehlo.broadcast_in_dim %5217, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5223 = stablehlo.add %5222, %3006 : tensor<1x1200x1xf32>
-    %5224 = stablehlo.rsqrt %5223 : tensor<1x1200x1xf32>
-    %5225 = stablehlo.broadcast_in_dim %5203, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5226 = stablehlo.broadcast_in_dim %5221, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5227 = stablehlo.subtract %5225, %5226 : tensor<1x1200x320xf32>
-    %5228 = stablehlo.broadcast_in_dim %5227, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5229 = stablehlo.broadcast_in_dim %5224, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5230 = stablehlo.multiply %5228, %5229 : tensor<1x1200x320xf32>
-    %5231 = stablehlo.convert %arg232 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5232 = stablehlo.broadcast_in_dim %5230, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5233 = stablehlo.broadcast_in_dim %5231, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5234 = stablehlo.multiply %5232, %5233 : tensor<1x1200x320xf32>
-    %5235 = stablehlo.convert %arg233 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5236 = stablehlo.broadcast_in_dim %5234, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5237 = stablehlo.broadcast_in_dim %5235, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5238 = stablehlo.add %5236, %5237 : tensor<1x1200x320xf32>
-    %5239 = stablehlo.convert %5238 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5240 = stablehlo.reshape %5239 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5241 = stablehlo.convert %5240 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5242 = stablehlo.dot_general %5241, %arg709, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %5243 = stablehlo.broadcast_in_dim %5242, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %5244 = stablehlo.multiply %5243, %3226 : tensor<1200x1280xf32>
-    %5245 = stablehlo.broadcast_in_dim %5244, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %5246 = stablehlo.broadcast_in_dim %arg710, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %5247 = stablehlo.add %5245, %5246 : tensor<1200x1280xf32>
-    %5248 = stablehlo.convert %5247 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %5249 = stablehlo.reshape %5248 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %5250 = stablehlo.transpose %5249, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %5251 = stablehlo.reshape %5250 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5252 = stablehlo.convolution(%5251, %arg234) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5253 = stablehlo.reshape %arg235 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %5254 = stablehlo.broadcast_in_dim %5252, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5255 = stablehlo.broadcast_in_dim %5253, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5256 = stablehlo.add %5254, %5255 : tensor<1x1280x30x40xbf16>
-    %5257 = stablehlo.reshape %5256 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %5258 = stablehlo.transpose %5257, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %5259 = stablehlo.multiply %5258, %cst_42 : tensor<1x1200x1280xbf16>
-    %5260 = stablehlo.multiply %5258, %3243 : tensor<1x1200x1280xbf16>
-    %5261 = stablehlo.convert %5260 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %5262 = stablehlo.clamp %cst_43, %5261, %cst_44 : tensor<1x1200x1280xf32>
-    %5263 = stablehlo.multiply %5262, %5262 : tensor<1x1200x1280xf32>
-    %5264 = stablehlo.multiply %cst_45, %5263 : tensor<1x1200x1280xf32>
-    %5265 = stablehlo.add %5264, %cst_46 : tensor<1x1200x1280xf32>
-    %5266 = stablehlo.multiply %5265, %5263 : tensor<1x1200x1280xf32>
-    %5267 = stablehlo.add %5266, %cst_47 : tensor<1x1200x1280xf32>
-    %5268 = stablehlo.multiply %5267, %5263 : tensor<1x1200x1280xf32>
-    %5269 = stablehlo.add %5268, %cst_48 : tensor<1x1200x1280xf32>
-    %5270 = stablehlo.multiply %5269, %5263 : tensor<1x1200x1280xf32>
-    %5271 = stablehlo.add %5270, %cst_49 : tensor<1x1200x1280xf32>
-    %5272 = stablehlo.multiply %5271, %5263 : tensor<1x1200x1280xf32>
-    %5273 = stablehlo.add %5272, %cst_50 : tensor<1x1200x1280xf32>
-    %5274 = stablehlo.multiply %5273, %5263 : tensor<1x1200x1280xf32>
-    %5275 = stablehlo.add %5274, %cst_51 : tensor<1x1200x1280xf32>
-    %5276 = stablehlo.multiply %cst_52, %5263 : tensor<1x1200x1280xf32>
-    %5277 = stablehlo.add %5276, %cst_53 : tensor<1x1200x1280xf32>
-    %5278 = stablehlo.multiply %5277, %5263 : tensor<1x1200x1280xf32>
-    %5279 = stablehlo.add %5278, %cst_54 : tensor<1x1200x1280xf32>
-    %5280 = stablehlo.multiply %5279, %5263 : tensor<1x1200x1280xf32>
-    %5281 = stablehlo.add %5280, %cst_55 : tensor<1x1200x1280xf32>
-    %5282 = stablehlo.multiply %5281, %5263 : tensor<1x1200x1280xf32>
-    %5283 = stablehlo.add %5282, %cst_56 : tensor<1x1200x1280xf32>
-    %5284 = stablehlo.multiply %5262, %5275 : tensor<1x1200x1280xf32>
-    %5285 = stablehlo.divide %5284, %5283 : tensor<1x1200x1280xf32>
-    %5286 = stablehlo.clamp %cst_57, %5285, %cst_58 : tensor<1x1200x1280xf32>
-    %5287 = stablehlo.convert %5286 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %5288 = stablehlo.add %5287, %cst_40 : tensor<1x1200x1280xbf16>
-    %5289 = stablehlo.multiply %5288, %5259 : tensor<1x1200x1280xbf16>
-    %5290 = stablehlo.reshape %5289 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %5291 = stablehlo.dot_general %5290, %arg711, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %5292 = stablehlo.reshape %5291 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5293 = stablehlo.broadcast_in_dim %5292, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5294 = stablehlo.broadcast_in_dim %arg236, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %5295 = stablehlo.add %5293, %5294 : tensor<1x1200x320xbf16>
-    %5296 = stablehlo.reshape %5295 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5297 = stablehlo.reshape %5296 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5298 = stablehlo.add %5297, %5202 : tensor<1x1200x320xbf16>
-    %5299 = stablehlo.convert %5298 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5300 = stablehlo.convert %5299 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5301 = stablehlo.reduce(%5300 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5302 = stablehlo.reshape %5301 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5303 = stablehlo.broadcast_in_dim %5302, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5304 = stablehlo.divide %5303, %2987 : tensor<1x1200x1xf64>
-    %5305 = stablehlo.broadcast_in_dim %5300, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5306 = stablehlo.broadcast_in_dim %5304, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5307 = stablehlo.subtract %5305, %5306 : tensor<1x1200x320xf64>
-    %5308 = stablehlo.multiply %5307, %5307 : tensor<1x1200x320xf64>
-    %5309 = stablehlo.reduce(%5308 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5310 = stablehlo.reshape %5309 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5311 = stablehlo.broadcast_in_dim %5310, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5312 = stablehlo.divide %5311, %2987 : tensor<1x1200x1xf64>
-    %5313 = stablehlo.convert %5312 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5314 = stablehlo.reduce(%5299 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5315 = stablehlo.reshape %5314 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5316 = stablehlo.broadcast_in_dim %5315, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5317 = stablehlo.divide %5316, %3003 : tensor<1x1200x1xf32>
-    %5318 = stablehlo.broadcast_in_dim %5313, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5319 = stablehlo.add %5318, %3006 : tensor<1x1200x1xf32>
-    %5320 = stablehlo.rsqrt %5319 : tensor<1x1200x1xf32>
-    %5321 = stablehlo.broadcast_in_dim %5299, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5322 = stablehlo.broadcast_in_dim %5317, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5323 = stablehlo.subtract %5321, %5322 : tensor<1x1200x320xf32>
-    %5324 = stablehlo.broadcast_in_dim %5323, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5325 = stablehlo.broadcast_in_dim %5320, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5326 = stablehlo.multiply %5324, %5325 : tensor<1x1200x320xf32>
-    %5327 = stablehlo.convert %arg237 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5328 = stablehlo.broadcast_in_dim %5326, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5329 = stablehlo.broadcast_in_dim %5327, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5330 = stablehlo.multiply %5328, %5329 : tensor<1x1200x320xf32>
-    %5331 = stablehlo.convert %arg238 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5332 = stablehlo.broadcast_in_dim %5330, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5333 = stablehlo.broadcast_in_dim %5331, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5334 = stablehlo.add %5332, %5333 : tensor<1x1200x320xf32>
-    %5335 = stablehlo.convert %5334 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5336 = stablehlo.reshape %5335 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5337 = stablehlo.convert %5336 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5338 = stablehlo.dot_general %5337, %arg712, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5339 = stablehlo.broadcast_in_dim %5338, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5340 = stablehlo.multiply %5339, %3065 : tensor<1200x320xf32>
-    %5341 = stablehlo.broadcast_in_dim %5340, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5342 = stablehlo.broadcast_in_dim %arg713, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5343 = stablehlo.add %5341, %5342 : tensor<1200x320xf32>
-    %5344 = stablehlo.convert %5343 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5345 = stablehlo.reshape %5344 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5346 = stablehlo.reshape %5345 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5347 = stablehlo.transpose %5346, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5348 = stablehlo.transpose %5335, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %5349 = stablehlo.reshape %5348 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %5350 = stablehlo.convolution(%5349, %arg239) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %5351 = stablehlo.reshape %arg240 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %5352 = stablehlo.broadcast_in_dim %5350, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %5353 = stablehlo.broadcast_in_dim %5351, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %5354 = stablehlo.add %5352, %5353 : tensor<1x320x15x20xbf16>
-    %5355 = stablehlo.reshape %5354 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %5356 = stablehlo.transpose %5355, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %5357 = stablehlo.convert %5356 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %5358 = stablehlo.convert %5357 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %5359 = stablehlo.reduce(%5358 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5360 = stablehlo.reshape %5359 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5361 = stablehlo.broadcast_in_dim %5360, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5362 = stablehlo.divide %5361, %3088 : tensor<1x300x1xf64>
-    %5363 = stablehlo.broadcast_in_dim %5358, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %5364 = stablehlo.broadcast_in_dim %5362, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %5365 = stablehlo.subtract %5363, %5364 : tensor<1x300x320xf64>
-    %5366 = stablehlo.multiply %5365, %5365 : tensor<1x300x320xf64>
-    %5367 = stablehlo.reduce(%5366 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5368 = stablehlo.reshape %5367 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5369 = stablehlo.broadcast_in_dim %5368, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5370 = stablehlo.divide %5369, %3088 : tensor<1x300x1xf64>
-    %5371 = stablehlo.convert %5370 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %5372 = stablehlo.reduce(%5357 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %5373 = stablehlo.reshape %5372 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %5374 = stablehlo.broadcast_in_dim %5373, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5375 = stablehlo.divide %5374, %3102 : tensor<1x300x1xf32>
-    %5376 = stablehlo.broadcast_in_dim %5371, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5377 = stablehlo.add %5376, %136 : tensor<1x300x1xf32>
-    %5378 = stablehlo.rsqrt %5377 : tensor<1x300x1xf32>
-    %5379 = stablehlo.broadcast_in_dim %5357, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5380 = stablehlo.broadcast_in_dim %5375, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5381 = stablehlo.subtract %5379, %5380 : tensor<1x300x320xf32>
-    %5382 = stablehlo.broadcast_in_dim %5381, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5383 = stablehlo.broadcast_in_dim %5378, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5384 = stablehlo.multiply %5382, %5383 : tensor<1x300x320xf32>
-    %5385 = stablehlo.convert %arg241 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5386 = stablehlo.broadcast_in_dim %5384, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5387 = stablehlo.broadcast_in_dim %5385, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5388 = stablehlo.multiply %5386, %5387 : tensor<1x300x320xf32>
-    %5389 = stablehlo.convert %arg242 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5390 = stablehlo.broadcast_in_dim %5388, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5391 = stablehlo.broadcast_in_dim %5389, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5392 = stablehlo.add %5390, %5391 : tensor<1x300x320xf32>
-    %5393 = stablehlo.convert %5392 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %5394 = stablehlo.reshape %5393 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %5395 = stablehlo.convert %5394 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %5396 = stablehlo.dot_general %5395, %arg714, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5397 = stablehlo.broadcast_in_dim %5396, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5398 = stablehlo.multiply %5397, %3126 : tensor<300x320xf32>
-    %5399 = stablehlo.broadcast_in_dim %5398, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5400 = stablehlo.broadcast_in_dim %arg715, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5401 = stablehlo.add %5399, %5400 : tensor<300x320xf32>
-    %5402 = stablehlo.convert %5401 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5403 = stablehlo.reshape %5402 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5404 = stablehlo.reshape %5403 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5405 = stablehlo.transpose %5404, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5406 = stablehlo.dot_general %5395, %arg716, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5407 = stablehlo.broadcast_in_dim %5406, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5408 = stablehlo.multiply %5407, %3126 : tensor<300x320xf32>
-    %5409 = stablehlo.broadcast_in_dim %5408, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5410 = stablehlo.broadcast_in_dim %arg717, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5411 = stablehlo.add %5409, %5410 : tensor<300x320xf32>
-    %5412 = stablehlo.convert %5411 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5413 = stablehlo.reshape %5412 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5414 = stablehlo.reshape %5413 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5415 = stablehlo.transpose %5414, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5416 = stablehlo.transpose %5405, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %5417 = stablehlo.reshape %5347 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5418 = stablehlo.reshape %5416 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5419 = stablehlo.broadcast_in_dim %5418, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5420 = stablehlo.dot_general %5417, %5419, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5421 = stablehlo.reshape %5420 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5422 = stablehlo.broadcast_in_dim %5421, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5423 = stablehlo.divide %5422, %3152 : tensor<1x5x1200x300xbf16>
-    %5424 = stablehlo.convert %5423 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %5425 = stablehlo.reduce(%5424 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5426 = stablehlo.reshape %5425 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5427 = stablehlo.broadcast_in_dim %5424, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5428 = stablehlo.broadcast_in_dim %5426, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5429 = stablehlo.subtract %5427, %5428 : tensor<1x5x1200x300xf32>
-    %5430 = stablehlo.exponential %5429 : tensor<1x5x1200x300xf32>
-    %5431 = stablehlo.reduce(%5430 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5432 = stablehlo.reshape %5431 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5433 = stablehlo.broadcast_in_dim %5430, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5434 = stablehlo.broadcast_in_dim %5432, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5435 = stablehlo.divide %5433, %5434 : tensor<1x5x1200x300xf32>
-    %5436 = stablehlo.convert %5435 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %5437 = stablehlo.reshape %5436 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5438 = stablehlo.reshape %5415 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5439 = stablehlo.broadcast_in_dim %5438, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5440 = stablehlo.dot_general %5437, %5439, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5441 = stablehlo.reshape %5440 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5442 = stablehlo.transpose %5441, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5443 = stablehlo.reshape %5442 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %5444 = stablehlo.reshape %5443 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5445 = stablehlo.convert %5444 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5446 = stablehlo.dot_general %5445, %arg718, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5447 = stablehlo.broadcast_in_dim %5446, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5448 = stablehlo.multiply %5447, %3065 : tensor<1200x320xf32>
-    %5449 = stablehlo.broadcast_in_dim %5448, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5450 = stablehlo.broadcast_in_dim %arg719, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5451 = stablehlo.add %5449, %5450 : tensor<1200x320xf32>
-    %5452 = stablehlo.convert %5451 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5453 = stablehlo.reshape %5452 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5454 = stablehlo.add %5453, %5298 : tensor<1x1200x320xbf16>
-    %5455 = stablehlo.convert %5454 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5456 = stablehlo.convert %5455 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5457 = stablehlo.reduce(%5456 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5458 = stablehlo.reshape %5457 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5459 = stablehlo.broadcast_in_dim %5458, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5460 = stablehlo.divide %5459, %2987 : tensor<1x1200x1xf64>
-    %5461 = stablehlo.broadcast_in_dim %5456, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5462 = stablehlo.broadcast_in_dim %5460, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5463 = stablehlo.subtract %5461, %5462 : tensor<1x1200x320xf64>
-    %5464 = stablehlo.multiply %5463, %5463 : tensor<1x1200x320xf64>
-    %5465 = stablehlo.reduce(%5464 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5466 = stablehlo.reshape %5465 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5467 = stablehlo.broadcast_in_dim %5466, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5468 = stablehlo.divide %5467, %2987 : tensor<1x1200x1xf64>
-    %5469 = stablehlo.convert %5468 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5470 = stablehlo.reduce(%5455 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5471 = stablehlo.reshape %5470 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5472 = stablehlo.broadcast_in_dim %5471, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5473 = stablehlo.divide %5472, %3003 : tensor<1x1200x1xf32>
-    %5474 = stablehlo.broadcast_in_dim %5469, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5475 = stablehlo.add %5474, %3006 : tensor<1x1200x1xf32>
-    %5476 = stablehlo.rsqrt %5475 : tensor<1x1200x1xf32>
-    %5477 = stablehlo.broadcast_in_dim %5455, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5478 = stablehlo.broadcast_in_dim %5473, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5479 = stablehlo.subtract %5477, %5478 : tensor<1x1200x320xf32>
-    %5480 = stablehlo.broadcast_in_dim %5479, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5481 = stablehlo.broadcast_in_dim %5476, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5482 = stablehlo.multiply %5480, %5481 : tensor<1x1200x320xf32>
-    %5483 = stablehlo.convert %arg243 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5484 = stablehlo.broadcast_in_dim %5482, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5485 = stablehlo.broadcast_in_dim %5483, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5486 = stablehlo.multiply %5484, %5485 : tensor<1x1200x320xf32>
-    %5487 = stablehlo.convert %arg244 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5488 = stablehlo.broadcast_in_dim %5486, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5489 = stablehlo.broadcast_in_dim %5487, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5490 = stablehlo.add %5488, %5489 : tensor<1x1200x320xf32>
-    %5491 = stablehlo.convert %5490 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5492 = stablehlo.reshape %5491 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5493 = stablehlo.convert %5492 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5494 = stablehlo.dot_general %5493, %arg720, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %5495 = stablehlo.broadcast_in_dim %5494, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %5496 = stablehlo.multiply %5495, %3226 : tensor<1200x1280xf32>
-    %5497 = stablehlo.broadcast_in_dim %5496, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %5498 = stablehlo.broadcast_in_dim %arg721, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %5499 = stablehlo.add %5497, %5498 : tensor<1200x1280xf32>
-    %5500 = stablehlo.convert %5499 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %5501 = stablehlo.reshape %5500 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %5502 = stablehlo.transpose %5501, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %5503 = stablehlo.reshape %5502 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5504 = stablehlo.convolution(%5503, %arg245) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5505 = stablehlo.reshape %arg246 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %5506 = stablehlo.broadcast_in_dim %5504, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5507 = stablehlo.broadcast_in_dim %5505, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5508 = stablehlo.add %5506, %5507 : tensor<1x1280x30x40xbf16>
-    %5509 = stablehlo.reshape %5508 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %5510 = stablehlo.transpose %5509, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %5511 = stablehlo.multiply %5510, %cst_42 : tensor<1x1200x1280xbf16>
-    %5512 = stablehlo.multiply %5510, %3243 : tensor<1x1200x1280xbf16>
-    %5513 = stablehlo.convert %5512 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %5514 = stablehlo.clamp %cst_43, %5513, %cst_44 : tensor<1x1200x1280xf32>
-    %5515 = stablehlo.multiply %5514, %5514 : tensor<1x1200x1280xf32>
-    %5516 = stablehlo.multiply %cst_45, %5515 : tensor<1x1200x1280xf32>
-    %5517 = stablehlo.add %5516, %cst_46 : tensor<1x1200x1280xf32>
-    %5518 = stablehlo.multiply %5517, %5515 : tensor<1x1200x1280xf32>
-    %5519 = stablehlo.add %5518, %cst_47 : tensor<1x1200x1280xf32>
-    %5520 = stablehlo.multiply %5519, %5515 : tensor<1x1200x1280xf32>
-    %5521 = stablehlo.add %5520, %cst_48 : tensor<1x1200x1280xf32>
-    %5522 = stablehlo.multiply %5521, %5515 : tensor<1x1200x1280xf32>
-    %5523 = stablehlo.add %5522, %cst_49 : tensor<1x1200x1280xf32>
-    %5524 = stablehlo.multiply %5523, %5515 : tensor<1x1200x1280xf32>
-    %5525 = stablehlo.add %5524, %cst_50 : tensor<1x1200x1280xf32>
-    %5526 = stablehlo.multiply %5525, %5515 : tensor<1x1200x1280xf32>
-    %5527 = stablehlo.add %5526, %cst_51 : tensor<1x1200x1280xf32>
-    %5528 = stablehlo.multiply %cst_52, %5515 : tensor<1x1200x1280xf32>
-    %5529 = stablehlo.add %5528, %cst_53 : tensor<1x1200x1280xf32>
-    %5530 = stablehlo.multiply %5529, %5515 : tensor<1x1200x1280xf32>
-    %5531 = stablehlo.add %5530, %cst_54 : tensor<1x1200x1280xf32>
-    %5532 = stablehlo.multiply %5531, %5515 : tensor<1x1200x1280xf32>
-    %5533 = stablehlo.add %5532, %cst_55 : tensor<1x1200x1280xf32>
-    %5534 = stablehlo.multiply %5533, %5515 : tensor<1x1200x1280xf32>
-    %5535 = stablehlo.add %5534, %cst_56 : tensor<1x1200x1280xf32>
-    %5536 = stablehlo.multiply %5514, %5527 : tensor<1x1200x1280xf32>
-    %5537 = stablehlo.divide %5536, %5535 : tensor<1x1200x1280xf32>
-    %5538 = stablehlo.clamp %cst_57, %5537, %cst_58 : tensor<1x1200x1280xf32>
-    %5539 = stablehlo.convert %5538 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %5540 = stablehlo.add %5539, %cst_40 : tensor<1x1200x1280xbf16>
-    %5541 = stablehlo.multiply %5540, %5511 : tensor<1x1200x1280xbf16>
-    %5542 = stablehlo.reshape %5541 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %5543 = stablehlo.dot_general %5542, %arg722, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %5544 = stablehlo.reshape %5543 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5545 = stablehlo.broadcast_in_dim %5544, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5546 = stablehlo.broadcast_in_dim %arg247, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %5547 = stablehlo.add %5545, %5546 : tensor<1x1200x320xbf16>
-    %5548 = stablehlo.reshape %5547 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5549 = stablehlo.reshape %5548 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5550 = stablehlo.add %5549, %5454 : tensor<1x1200x320xbf16>
-    %5551 = stablehlo.convert %5550 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5552 = stablehlo.convert %5551 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5553 = stablehlo.reduce(%5552 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5554 = stablehlo.reshape %5553 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5555 = stablehlo.broadcast_in_dim %5554, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5556 = stablehlo.divide %5555, %2987 : tensor<1x1200x1xf64>
-    %5557 = stablehlo.broadcast_in_dim %5552, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5558 = stablehlo.broadcast_in_dim %5556, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5559 = stablehlo.subtract %5557, %5558 : tensor<1x1200x320xf64>
-    %5560 = stablehlo.multiply %5559, %5559 : tensor<1x1200x320xf64>
-    %5561 = stablehlo.reduce(%5560 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5562 = stablehlo.reshape %5561 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5563 = stablehlo.broadcast_in_dim %5562, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5564 = stablehlo.divide %5563, %2987 : tensor<1x1200x1xf64>
-    %5565 = stablehlo.convert %5564 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5566 = stablehlo.reduce(%5551 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5567 = stablehlo.reshape %5566 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5568 = stablehlo.broadcast_in_dim %5567, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5569 = stablehlo.divide %5568, %3003 : tensor<1x1200x1xf32>
-    %5570 = stablehlo.broadcast_in_dim %5565, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5571 = stablehlo.add %5570, %3006 : tensor<1x1200x1xf32>
-    %5572 = stablehlo.rsqrt %5571 : tensor<1x1200x1xf32>
-    %5573 = stablehlo.broadcast_in_dim %5551, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5574 = stablehlo.broadcast_in_dim %5569, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5575 = stablehlo.subtract %5573, %5574 : tensor<1x1200x320xf32>
-    %5576 = stablehlo.broadcast_in_dim %5575, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5577 = stablehlo.broadcast_in_dim %5572, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5578 = stablehlo.multiply %5576, %5577 : tensor<1x1200x320xf32>
-    %5579 = stablehlo.convert %arg248 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5580 = stablehlo.broadcast_in_dim %5578, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5581 = stablehlo.broadcast_in_dim %5579, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5582 = stablehlo.multiply %5580, %5581 : tensor<1x1200x320xf32>
-    %5583 = stablehlo.convert %arg249 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5584 = stablehlo.broadcast_in_dim %5582, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5585 = stablehlo.broadcast_in_dim %5583, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5586 = stablehlo.add %5584, %5585 : tensor<1x1200x320xf32>
-    %5587 = stablehlo.convert %5586 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5588 = stablehlo.reshape %5587 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5589 = stablehlo.convert %5588 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5590 = stablehlo.dot_general %5589, %arg723, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5591 = stablehlo.broadcast_in_dim %5590, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5592 = stablehlo.multiply %5591, %3065 : tensor<1200x320xf32>
-    %5593 = stablehlo.broadcast_in_dim %5592, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5594 = stablehlo.broadcast_in_dim %arg724, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5595 = stablehlo.add %5593, %5594 : tensor<1200x320xf32>
-    %5596 = stablehlo.convert %5595 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5597 = stablehlo.reshape %5596 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5598 = stablehlo.reshape %5597 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5599 = stablehlo.transpose %5598, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5600 = stablehlo.transpose %5587, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %5601 = stablehlo.reshape %5600 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %5602 = stablehlo.convolution(%5601, %arg250) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %5603 = stablehlo.reshape %arg251 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %5604 = stablehlo.broadcast_in_dim %5602, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %5605 = stablehlo.broadcast_in_dim %5603, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %5606 = stablehlo.add %5604, %5605 : tensor<1x320x15x20xbf16>
-    %5607 = stablehlo.reshape %5606 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %5608 = stablehlo.transpose %5607, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %5609 = stablehlo.convert %5608 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %5610 = stablehlo.convert %5609 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %5611 = stablehlo.reduce(%5610 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5612 = stablehlo.reshape %5611 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5613 = stablehlo.broadcast_in_dim %5612, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5614 = stablehlo.divide %5613, %3088 : tensor<1x300x1xf64>
-    %5615 = stablehlo.broadcast_in_dim %5610, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %5616 = stablehlo.broadcast_in_dim %5614, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %5617 = stablehlo.subtract %5615, %5616 : tensor<1x300x320xf64>
-    %5618 = stablehlo.multiply %5617, %5617 : tensor<1x300x320xf64>
-    %5619 = stablehlo.reduce(%5618 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5620 = stablehlo.reshape %5619 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5621 = stablehlo.broadcast_in_dim %5620, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5622 = stablehlo.divide %5621, %3088 : tensor<1x300x1xf64>
-    %5623 = stablehlo.convert %5622 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %5624 = stablehlo.reduce(%5609 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %5625 = stablehlo.reshape %5624 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %5626 = stablehlo.broadcast_in_dim %5625, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5627 = stablehlo.divide %5626, %3102 : tensor<1x300x1xf32>
-    %5628 = stablehlo.broadcast_in_dim %5623, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5629 = stablehlo.add %5628, %136 : tensor<1x300x1xf32>
-    %5630 = stablehlo.rsqrt %5629 : tensor<1x300x1xf32>
-    %5631 = stablehlo.broadcast_in_dim %5609, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5632 = stablehlo.broadcast_in_dim %5627, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5633 = stablehlo.subtract %5631, %5632 : tensor<1x300x320xf32>
-    %5634 = stablehlo.broadcast_in_dim %5633, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5635 = stablehlo.broadcast_in_dim %5630, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5636 = stablehlo.multiply %5634, %5635 : tensor<1x300x320xf32>
-    %5637 = stablehlo.convert %arg252 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5638 = stablehlo.broadcast_in_dim %5636, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5639 = stablehlo.broadcast_in_dim %5637, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5640 = stablehlo.multiply %5638, %5639 : tensor<1x300x320xf32>
-    %5641 = stablehlo.convert %arg253 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5642 = stablehlo.broadcast_in_dim %5640, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5643 = stablehlo.broadcast_in_dim %5641, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5644 = stablehlo.add %5642, %5643 : tensor<1x300x320xf32>
-    %5645 = stablehlo.convert %5644 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %5646 = stablehlo.reshape %5645 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %5647 = stablehlo.convert %5646 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %5648 = stablehlo.dot_general %5647, %arg725, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5649 = stablehlo.broadcast_in_dim %5648, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5650 = stablehlo.multiply %5649, %3126 : tensor<300x320xf32>
-    %5651 = stablehlo.broadcast_in_dim %5650, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5652 = stablehlo.broadcast_in_dim %arg726, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5653 = stablehlo.add %5651, %5652 : tensor<300x320xf32>
-    %5654 = stablehlo.convert %5653 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5655 = stablehlo.reshape %5654 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5656 = stablehlo.reshape %5655 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5657 = stablehlo.transpose %5656, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5658 = stablehlo.dot_general %5647, %arg727, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5659 = stablehlo.broadcast_in_dim %5658, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5660 = stablehlo.multiply %5659, %3126 : tensor<300x320xf32>
-    %5661 = stablehlo.broadcast_in_dim %5660, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5662 = stablehlo.broadcast_in_dim %arg728, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5663 = stablehlo.add %5661, %5662 : tensor<300x320xf32>
-    %5664 = stablehlo.convert %5663 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5665 = stablehlo.reshape %5664 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5666 = stablehlo.reshape %5665 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5667 = stablehlo.transpose %5666, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5668 = stablehlo.transpose %5657, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %5669 = stablehlo.reshape %5599 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5670 = stablehlo.reshape %5668 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5671 = stablehlo.broadcast_in_dim %5670, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5672 = stablehlo.dot_general %5669, %5671, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5673 = stablehlo.reshape %5672 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5674 = stablehlo.broadcast_in_dim %5673, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5675 = stablehlo.divide %5674, %3152 : tensor<1x5x1200x300xbf16>
-    %5676 = stablehlo.convert %5675 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %5677 = stablehlo.reduce(%5676 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5678 = stablehlo.reshape %5677 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5679 = stablehlo.broadcast_in_dim %5676, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5680 = stablehlo.broadcast_in_dim %5678, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5681 = stablehlo.subtract %5679, %5680 : tensor<1x5x1200x300xf32>
-    %5682 = stablehlo.exponential %5681 : tensor<1x5x1200x300xf32>
-    %5683 = stablehlo.reduce(%5682 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5684 = stablehlo.reshape %5683 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5685 = stablehlo.broadcast_in_dim %5682, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5686 = stablehlo.broadcast_in_dim %5684, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5687 = stablehlo.divide %5685, %5686 : tensor<1x5x1200x300xf32>
-    %5688 = stablehlo.convert %5687 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %5689 = stablehlo.reshape %5688 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5690 = stablehlo.reshape %5667 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5691 = stablehlo.broadcast_in_dim %5690, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5692 = stablehlo.dot_general %5689, %5691, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5693 = stablehlo.reshape %5692 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5694 = stablehlo.transpose %5693, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5695 = stablehlo.reshape %5694 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %5696 = stablehlo.reshape %5695 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5697 = stablehlo.convert %5696 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5698 = stablehlo.dot_general %5697, %arg729, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5699 = stablehlo.broadcast_in_dim %5698, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5700 = stablehlo.multiply %5699, %3065 : tensor<1200x320xf32>
-    %5701 = stablehlo.broadcast_in_dim %5700, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5702 = stablehlo.broadcast_in_dim %arg730, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5703 = stablehlo.add %5701, %5702 : tensor<1200x320xf32>
-    %5704 = stablehlo.convert %5703 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5705 = stablehlo.reshape %5704 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5706 = stablehlo.add %5705, %5550 : tensor<1x1200x320xbf16>
-    %5707 = stablehlo.convert %5706 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5708 = stablehlo.convert %5707 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5709 = stablehlo.reduce(%5708 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5710 = stablehlo.reshape %5709 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5711 = stablehlo.broadcast_in_dim %5710, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5712 = stablehlo.divide %5711, %2987 : tensor<1x1200x1xf64>
-    %5713 = stablehlo.broadcast_in_dim %5708, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5714 = stablehlo.broadcast_in_dim %5712, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5715 = stablehlo.subtract %5713, %5714 : tensor<1x1200x320xf64>
-    %5716 = stablehlo.multiply %5715, %5715 : tensor<1x1200x320xf64>
-    %5717 = stablehlo.reduce(%5716 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5718 = stablehlo.reshape %5717 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5719 = stablehlo.broadcast_in_dim %5718, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5720 = stablehlo.divide %5719, %2987 : tensor<1x1200x1xf64>
-    %5721 = stablehlo.convert %5720 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5722 = stablehlo.reduce(%5707 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5723 = stablehlo.reshape %5722 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5724 = stablehlo.broadcast_in_dim %5723, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5725 = stablehlo.divide %5724, %3003 : tensor<1x1200x1xf32>
-    %5726 = stablehlo.broadcast_in_dim %5721, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5727 = stablehlo.add %5726, %3006 : tensor<1x1200x1xf32>
-    %5728 = stablehlo.rsqrt %5727 : tensor<1x1200x1xf32>
-    %5729 = stablehlo.broadcast_in_dim %5707, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5730 = stablehlo.broadcast_in_dim %5725, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5731 = stablehlo.subtract %5729, %5730 : tensor<1x1200x320xf32>
-    %5732 = stablehlo.broadcast_in_dim %5731, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5733 = stablehlo.broadcast_in_dim %5728, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5734 = stablehlo.multiply %5732, %5733 : tensor<1x1200x320xf32>
-    %5735 = stablehlo.convert %arg254 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5736 = stablehlo.broadcast_in_dim %5734, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5737 = stablehlo.broadcast_in_dim %5735, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5738 = stablehlo.multiply %5736, %5737 : tensor<1x1200x320xf32>
-    %5739 = stablehlo.convert %arg255 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5740 = stablehlo.broadcast_in_dim %5738, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5741 = stablehlo.broadcast_in_dim %5739, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5742 = stablehlo.add %5740, %5741 : tensor<1x1200x320xf32>
-    %5743 = stablehlo.convert %5742 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5744 = stablehlo.reshape %5743 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5745 = stablehlo.convert %5744 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5746 = stablehlo.dot_general %5745, %arg731, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %5747 = stablehlo.broadcast_in_dim %5746, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %5748 = stablehlo.multiply %5747, %3226 : tensor<1200x1280xf32>
-    %5749 = stablehlo.broadcast_in_dim %5748, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %5750 = stablehlo.broadcast_in_dim %arg732, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %5751 = stablehlo.add %5749, %5750 : tensor<1200x1280xf32>
-    %5752 = stablehlo.convert %5751 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %5753 = stablehlo.reshape %5752 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %5754 = stablehlo.transpose %5753, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %5755 = stablehlo.reshape %5754 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5756 = stablehlo.convolution(%5755, %arg256) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5757 = stablehlo.reshape %arg257 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %5758 = stablehlo.broadcast_in_dim %5756, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5759 = stablehlo.broadcast_in_dim %5757, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %5760 = stablehlo.add %5758, %5759 : tensor<1x1280x30x40xbf16>
-    %5761 = stablehlo.reshape %5760 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %5762 = stablehlo.transpose %5761, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %5763 = stablehlo.multiply %5762, %cst_42 : tensor<1x1200x1280xbf16>
-    %5764 = stablehlo.multiply %5762, %3243 : tensor<1x1200x1280xbf16>
-    %5765 = stablehlo.convert %5764 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %5766 = stablehlo.clamp %cst_43, %5765, %cst_44 : tensor<1x1200x1280xf32>
-    %5767 = stablehlo.multiply %5766, %5766 : tensor<1x1200x1280xf32>
-    %5768 = stablehlo.multiply %cst_45, %5767 : tensor<1x1200x1280xf32>
-    %5769 = stablehlo.add %5768, %cst_46 : tensor<1x1200x1280xf32>
-    %5770 = stablehlo.multiply %5769, %5767 : tensor<1x1200x1280xf32>
-    %5771 = stablehlo.add %5770, %cst_47 : tensor<1x1200x1280xf32>
-    %5772 = stablehlo.multiply %5771, %5767 : tensor<1x1200x1280xf32>
-    %5773 = stablehlo.add %5772, %cst_48 : tensor<1x1200x1280xf32>
-    %5774 = stablehlo.multiply %5773, %5767 : tensor<1x1200x1280xf32>
-    %5775 = stablehlo.add %5774, %cst_49 : tensor<1x1200x1280xf32>
-    %5776 = stablehlo.multiply %5775, %5767 : tensor<1x1200x1280xf32>
-    %5777 = stablehlo.add %5776, %cst_50 : tensor<1x1200x1280xf32>
-    %5778 = stablehlo.multiply %5777, %5767 : tensor<1x1200x1280xf32>
-    %5779 = stablehlo.add %5778, %cst_51 : tensor<1x1200x1280xf32>
-    %5780 = stablehlo.multiply %cst_52, %5767 : tensor<1x1200x1280xf32>
-    %5781 = stablehlo.add %5780, %cst_53 : tensor<1x1200x1280xf32>
-    %5782 = stablehlo.multiply %5781, %5767 : tensor<1x1200x1280xf32>
-    %5783 = stablehlo.add %5782, %cst_54 : tensor<1x1200x1280xf32>
-    %5784 = stablehlo.multiply %5783, %5767 : tensor<1x1200x1280xf32>
-    %5785 = stablehlo.add %5784, %cst_55 : tensor<1x1200x1280xf32>
-    %5786 = stablehlo.multiply %5785, %5767 : tensor<1x1200x1280xf32>
-    %5787 = stablehlo.add %5786, %cst_56 : tensor<1x1200x1280xf32>
-    %5788 = stablehlo.multiply %5766, %5779 : tensor<1x1200x1280xf32>
-    %5789 = stablehlo.divide %5788, %5787 : tensor<1x1200x1280xf32>
-    %5790 = stablehlo.clamp %cst_57, %5789, %cst_58 : tensor<1x1200x1280xf32>
-    %5791 = stablehlo.convert %5790 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %5792 = stablehlo.add %5791, %cst_40 : tensor<1x1200x1280xbf16>
-    %5793 = stablehlo.multiply %5792, %5763 : tensor<1x1200x1280xbf16>
-    %5794 = stablehlo.reshape %5793 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %5795 = stablehlo.dot_general %5794, %arg733, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %5796 = stablehlo.reshape %5795 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5797 = stablehlo.broadcast_in_dim %5796, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5798 = stablehlo.broadcast_in_dim %arg258, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %5799 = stablehlo.add %5797, %5798 : tensor<1x1200x320xbf16>
-    %5800 = stablehlo.reshape %5799 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5801 = stablehlo.reshape %5800 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5802 = stablehlo.add %5801, %5706 : tensor<1x1200x320xbf16>
-    %5803 = stablehlo.convert %5802 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5804 = stablehlo.convert %5803 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5805 = stablehlo.reduce(%5804 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5806 = stablehlo.reshape %5805 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5807 = stablehlo.broadcast_in_dim %5806, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5808 = stablehlo.divide %5807, %2987 : tensor<1x1200x1xf64>
-    %5809 = stablehlo.broadcast_in_dim %5804, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5810 = stablehlo.broadcast_in_dim %5808, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5811 = stablehlo.subtract %5809, %5810 : tensor<1x1200x320xf64>
-    %5812 = stablehlo.multiply %5811, %5811 : tensor<1x1200x320xf64>
-    %5813 = stablehlo.reduce(%5812 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5814 = stablehlo.reshape %5813 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5815 = stablehlo.broadcast_in_dim %5814, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5816 = stablehlo.divide %5815, %2987 : tensor<1x1200x1xf64>
-    %5817 = stablehlo.convert %5816 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5818 = stablehlo.reduce(%5803 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5819 = stablehlo.reshape %5818 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5820 = stablehlo.broadcast_in_dim %5819, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5821 = stablehlo.divide %5820, %3003 : tensor<1x1200x1xf32>
-    %5822 = stablehlo.broadcast_in_dim %5817, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5823 = stablehlo.add %5822, %3006 : tensor<1x1200x1xf32>
-    %5824 = stablehlo.rsqrt %5823 : tensor<1x1200x1xf32>
-    %5825 = stablehlo.broadcast_in_dim %5803, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5826 = stablehlo.broadcast_in_dim %5821, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5827 = stablehlo.subtract %5825, %5826 : tensor<1x1200x320xf32>
-    %5828 = stablehlo.broadcast_in_dim %5827, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5829 = stablehlo.broadcast_in_dim %5824, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5830 = stablehlo.multiply %5828, %5829 : tensor<1x1200x320xf32>
-    %5831 = stablehlo.convert %arg259 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5832 = stablehlo.broadcast_in_dim %5830, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5833 = stablehlo.broadcast_in_dim %5831, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5834 = stablehlo.multiply %5832, %5833 : tensor<1x1200x320xf32>
-    %5835 = stablehlo.convert %arg260 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5836 = stablehlo.broadcast_in_dim %5834, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5837 = stablehlo.broadcast_in_dim %5835, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5838 = stablehlo.add %5836, %5837 : tensor<1x1200x320xf32>
-    %5839 = stablehlo.convert %5838 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5840 = stablehlo.reshape %5839 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5841 = stablehlo.convert %5840 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5842 = stablehlo.dot_general %5841, %arg734, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5843 = stablehlo.broadcast_in_dim %5842, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5844 = stablehlo.multiply %5843, %3065 : tensor<1200x320xf32>
-    %5845 = stablehlo.broadcast_in_dim %5844, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5846 = stablehlo.broadcast_in_dim %arg735, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5847 = stablehlo.add %5845, %5846 : tensor<1200x320xf32>
-    %5848 = stablehlo.convert %5847 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5849 = stablehlo.reshape %5848 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5850 = stablehlo.reshape %5849 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5851 = stablehlo.transpose %5850, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5852 = stablehlo.transpose %5839, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %5853 = stablehlo.reshape %5852 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %5854 = stablehlo.convolution(%5853, %arg261) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %5855 = stablehlo.reshape %arg262 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %5856 = stablehlo.broadcast_in_dim %5854, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %5857 = stablehlo.broadcast_in_dim %5855, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %5858 = stablehlo.add %5856, %5857 : tensor<1x320x15x20xbf16>
-    %5859 = stablehlo.reshape %5858 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %5860 = stablehlo.transpose %5859, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %5861 = stablehlo.convert %5860 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %5862 = stablehlo.convert %5861 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %5863 = stablehlo.reduce(%5862 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5864 = stablehlo.reshape %5863 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5865 = stablehlo.broadcast_in_dim %5864, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5866 = stablehlo.divide %5865, %3088 : tensor<1x300x1xf64>
-    %5867 = stablehlo.broadcast_in_dim %5862, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %5868 = stablehlo.broadcast_in_dim %5866, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %5869 = stablehlo.subtract %5867, %5868 : tensor<1x300x320xf64>
-    %5870 = stablehlo.multiply %5869, %5869 : tensor<1x300x320xf64>
-    %5871 = stablehlo.reduce(%5870 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %5872 = stablehlo.reshape %5871 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %5873 = stablehlo.broadcast_in_dim %5872, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %5874 = stablehlo.divide %5873, %3088 : tensor<1x300x1xf64>
-    %5875 = stablehlo.convert %5874 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %5876 = stablehlo.reduce(%5861 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %5877 = stablehlo.reshape %5876 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %5878 = stablehlo.broadcast_in_dim %5877, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5879 = stablehlo.divide %5878, %3102 : tensor<1x300x1xf32>
-    %5880 = stablehlo.broadcast_in_dim %5875, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %5881 = stablehlo.add %5880, %136 : tensor<1x300x1xf32>
-    %5882 = stablehlo.rsqrt %5881 : tensor<1x300x1xf32>
-    %5883 = stablehlo.broadcast_in_dim %5861, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5884 = stablehlo.broadcast_in_dim %5879, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5885 = stablehlo.subtract %5883, %5884 : tensor<1x300x320xf32>
-    %5886 = stablehlo.broadcast_in_dim %5885, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5887 = stablehlo.broadcast_in_dim %5882, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %5888 = stablehlo.multiply %5886, %5887 : tensor<1x300x320xf32>
-    %5889 = stablehlo.convert %arg263 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5890 = stablehlo.broadcast_in_dim %5888, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5891 = stablehlo.broadcast_in_dim %5889, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5892 = stablehlo.multiply %5890, %5891 : tensor<1x300x320xf32>
-    %5893 = stablehlo.convert %arg264 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5894 = stablehlo.broadcast_in_dim %5892, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %5895 = stablehlo.broadcast_in_dim %5893, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %5896 = stablehlo.add %5894, %5895 : tensor<1x300x320xf32>
-    %5897 = stablehlo.convert %5896 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %5898 = stablehlo.reshape %5897 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %5899 = stablehlo.convert %5898 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %5900 = stablehlo.dot_general %5899, %arg736, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5901 = stablehlo.broadcast_in_dim %5900, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5902 = stablehlo.multiply %5901, %3126 : tensor<300x320xf32>
-    %5903 = stablehlo.broadcast_in_dim %5902, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5904 = stablehlo.broadcast_in_dim %arg737, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5905 = stablehlo.add %5903, %5904 : tensor<300x320xf32>
-    %5906 = stablehlo.convert %5905 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5907 = stablehlo.reshape %5906 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5908 = stablehlo.reshape %5907 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5909 = stablehlo.transpose %5908, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5910 = stablehlo.dot_general %5899, %arg738, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %5911 = stablehlo.broadcast_in_dim %5910, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5912 = stablehlo.multiply %5911, %3126 : tensor<300x320xf32>
-    %5913 = stablehlo.broadcast_in_dim %5912, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %5914 = stablehlo.broadcast_in_dim %arg739, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %5915 = stablehlo.add %5913, %5914 : tensor<300x320xf32>
-    %5916 = stablehlo.convert %5915 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %5917 = stablehlo.reshape %5916 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %5918 = stablehlo.reshape %5917 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %5919 = stablehlo.transpose %5918, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %5920 = stablehlo.transpose %5909, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %5921 = stablehlo.reshape %5851 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5922 = stablehlo.reshape %5920 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5923 = stablehlo.broadcast_in_dim %5922, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %5924 = stablehlo.dot_general %5921, %5923, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5925 = stablehlo.reshape %5924 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5926 = stablehlo.broadcast_in_dim %5925, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %5927 = stablehlo.divide %5926, %3152 : tensor<1x5x1200x300xbf16>
-    %5928 = stablehlo.convert %5927 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %5929 = stablehlo.reduce(%5928 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5930 = stablehlo.reshape %5929 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5931 = stablehlo.broadcast_in_dim %5928, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5932 = stablehlo.broadcast_in_dim %5930, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5933 = stablehlo.subtract %5931, %5932 : tensor<1x5x1200x300xf32>
-    %5934 = stablehlo.exponential %5933 : tensor<1x5x1200x300xf32>
-    %5935 = stablehlo.reduce(%5934 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %5936 = stablehlo.reshape %5935 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %5937 = stablehlo.broadcast_in_dim %5934, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %5938 = stablehlo.broadcast_in_dim %5936, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %5939 = stablehlo.divide %5937, %5938 : tensor<1x5x1200x300xf32>
-    %5940 = stablehlo.convert %5939 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %5941 = stablehlo.reshape %5940 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %5942 = stablehlo.reshape %5919 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5943 = stablehlo.broadcast_in_dim %5942, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %5944 = stablehlo.dot_general %5941, %5943, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %5945 = stablehlo.reshape %5944 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %5946 = stablehlo.transpose %5945, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %5947 = stablehlo.reshape %5946 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %5948 = stablehlo.reshape %5947 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5949 = stablehlo.convert %5948 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5950 = stablehlo.dot_general %5949, %arg740, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %5951 = stablehlo.broadcast_in_dim %5950, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5952 = stablehlo.multiply %5951, %3065 : tensor<1200x320xf32>
-    %5953 = stablehlo.broadcast_in_dim %5952, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %5954 = stablehlo.broadcast_in_dim %arg741, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %5955 = stablehlo.add %5953, %5954 : tensor<1200x320xf32>
-    %5956 = stablehlo.convert %5955 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %5957 = stablehlo.reshape %5956 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %5958 = stablehlo.add %5957, %5802 : tensor<1x1200x320xbf16>
-    %5959 = stablehlo.convert %5958 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %5960 = stablehlo.convert %5959 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %5961 = stablehlo.reduce(%5960 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5962 = stablehlo.reshape %5961 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5963 = stablehlo.broadcast_in_dim %5962, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5964 = stablehlo.divide %5963, %2987 : tensor<1x1200x1xf64>
-    %5965 = stablehlo.broadcast_in_dim %5960, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %5966 = stablehlo.broadcast_in_dim %5964, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %5967 = stablehlo.subtract %5965, %5966 : tensor<1x1200x320xf64>
-    %5968 = stablehlo.multiply %5967, %5967 : tensor<1x1200x320xf64>
-    %5969 = stablehlo.reduce(%5968 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %5970 = stablehlo.reshape %5969 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %5971 = stablehlo.broadcast_in_dim %5970, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %5972 = stablehlo.divide %5971, %2987 : tensor<1x1200x1xf64>
-    %5973 = stablehlo.convert %5972 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %5974 = stablehlo.reduce(%5959 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %5975 = stablehlo.reshape %5974 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %5976 = stablehlo.broadcast_in_dim %5975, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5977 = stablehlo.divide %5976, %3003 : tensor<1x1200x1xf32>
-    %5978 = stablehlo.broadcast_in_dim %5973, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %5979 = stablehlo.add %5978, %3006 : tensor<1x1200x1xf32>
-    %5980 = stablehlo.rsqrt %5979 : tensor<1x1200x1xf32>
-    %5981 = stablehlo.broadcast_in_dim %5959, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5982 = stablehlo.broadcast_in_dim %5977, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5983 = stablehlo.subtract %5981, %5982 : tensor<1x1200x320xf32>
-    %5984 = stablehlo.broadcast_in_dim %5983, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5985 = stablehlo.broadcast_in_dim %5980, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %5986 = stablehlo.multiply %5984, %5985 : tensor<1x1200x320xf32>
-    %5987 = stablehlo.convert %arg265 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5988 = stablehlo.broadcast_in_dim %5986, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5989 = stablehlo.broadcast_in_dim %5987, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5990 = stablehlo.multiply %5988, %5989 : tensor<1x1200x320xf32>
-    %5991 = stablehlo.convert %arg266 : (tensor<320xbf16>) -> tensor<320xf32>
-    %5992 = stablehlo.broadcast_in_dim %5990, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %5993 = stablehlo.broadcast_in_dim %5991, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %5994 = stablehlo.add %5992, %5993 : tensor<1x1200x320xf32>
-    %5995 = stablehlo.convert %5994 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %5996 = stablehlo.reshape %5995 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %5997 = stablehlo.convert %5996 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %5998 = stablehlo.dot_general %5997, %arg742, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %5999 = stablehlo.broadcast_in_dim %5998, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6000 = stablehlo.multiply %5999, %3226 : tensor<1200x1280xf32>
-    %6001 = stablehlo.broadcast_in_dim %6000, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6002 = stablehlo.broadcast_in_dim %arg743, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %6003 = stablehlo.add %6001, %6002 : tensor<1200x1280xf32>
-    %6004 = stablehlo.convert %6003 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %6005 = stablehlo.reshape %6004 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %6006 = stablehlo.transpose %6005, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %6007 = stablehlo.reshape %6006 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6008 = stablehlo.convolution(%6007, %arg267) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6009 = stablehlo.reshape %arg268 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %6010 = stablehlo.broadcast_in_dim %6008, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6011 = stablehlo.broadcast_in_dim %6009, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6012 = stablehlo.add %6010, %6011 : tensor<1x1280x30x40xbf16>
-    %6013 = stablehlo.reshape %6012 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %6014 = stablehlo.transpose %6013, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %6015 = stablehlo.multiply %6014, %cst_42 : tensor<1x1200x1280xbf16>
-    %6016 = stablehlo.multiply %6014, %3243 : tensor<1x1200x1280xbf16>
-    %6017 = stablehlo.convert %6016 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %6018 = stablehlo.clamp %cst_43, %6017, %cst_44 : tensor<1x1200x1280xf32>
-    %6019 = stablehlo.multiply %6018, %6018 : tensor<1x1200x1280xf32>
-    %6020 = stablehlo.multiply %cst_45, %6019 : tensor<1x1200x1280xf32>
-    %6021 = stablehlo.add %6020, %cst_46 : tensor<1x1200x1280xf32>
-    %6022 = stablehlo.multiply %6021, %6019 : tensor<1x1200x1280xf32>
-    %6023 = stablehlo.add %6022, %cst_47 : tensor<1x1200x1280xf32>
-    %6024 = stablehlo.multiply %6023, %6019 : tensor<1x1200x1280xf32>
-    %6025 = stablehlo.add %6024, %cst_48 : tensor<1x1200x1280xf32>
-    %6026 = stablehlo.multiply %6025, %6019 : tensor<1x1200x1280xf32>
-    %6027 = stablehlo.add %6026, %cst_49 : tensor<1x1200x1280xf32>
-    %6028 = stablehlo.multiply %6027, %6019 : tensor<1x1200x1280xf32>
-    %6029 = stablehlo.add %6028, %cst_50 : tensor<1x1200x1280xf32>
-    %6030 = stablehlo.multiply %6029, %6019 : tensor<1x1200x1280xf32>
-    %6031 = stablehlo.add %6030, %cst_51 : tensor<1x1200x1280xf32>
-    %6032 = stablehlo.multiply %cst_52, %6019 : tensor<1x1200x1280xf32>
-    %6033 = stablehlo.add %6032, %cst_53 : tensor<1x1200x1280xf32>
-    %6034 = stablehlo.multiply %6033, %6019 : tensor<1x1200x1280xf32>
-    %6035 = stablehlo.add %6034, %cst_54 : tensor<1x1200x1280xf32>
-    %6036 = stablehlo.multiply %6035, %6019 : tensor<1x1200x1280xf32>
-    %6037 = stablehlo.add %6036, %cst_55 : tensor<1x1200x1280xf32>
-    %6038 = stablehlo.multiply %6037, %6019 : tensor<1x1200x1280xf32>
-    %6039 = stablehlo.add %6038, %cst_56 : tensor<1x1200x1280xf32>
-    %6040 = stablehlo.multiply %6018, %6031 : tensor<1x1200x1280xf32>
-    %6041 = stablehlo.divide %6040, %6039 : tensor<1x1200x1280xf32>
-    %6042 = stablehlo.clamp %cst_57, %6041, %cst_58 : tensor<1x1200x1280xf32>
-    %6043 = stablehlo.convert %6042 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %6044 = stablehlo.add %6043, %cst_40 : tensor<1x1200x1280xbf16>
-    %6045 = stablehlo.multiply %6044, %6015 : tensor<1x1200x1280xbf16>
-    %6046 = stablehlo.reshape %6045 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %6047 = stablehlo.dot_general %6046, %arg744, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %6048 = stablehlo.reshape %6047 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6049 = stablehlo.broadcast_in_dim %6048, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6050 = stablehlo.broadcast_in_dim %arg269, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %6051 = stablehlo.add %6049, %6050 : tensor<1x1200x320xbf16>
-    %6052 = stablehlo.reshape %6051 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6053 = stablehlo.reshape %6052 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6054 = stablehlo.add %6053, %5958 : tensor<1x1200x320xbf16>
-    %6055 = stablehlo.convert %6054 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6056 = stablehlo.convert %6055 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6057 = stablehlo.reduce(%6056 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6058 = stablehlo.reshape %6057 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6059 = stablehlo.broadcast_in_dim %6058, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6060 = stablehlo.divide %6059, %2987 : tensor<1x1200x1xf64>
-    %6061 = stablehlo.broadcast_in_dim %6056, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6062 = stablehlo.broadcast_in_dim %6060, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6063 = stablehlo.subtract %6061, %6062 : tensor<1x1200x320xf64>
-    %6064 = stablehlo.multiply %6063, %6063 : tensor<1x1200x320xf64>
-    %6065 = stablehlo.reduce(%6064 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6066 = stablehlo.reshape %6065 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6067 = stablehlo.broadcast_in_dim %6066, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6068 = stablehlo.divide %6067, %2987 : tensor<1x1200x1xf64>
-    %6069 = stablehlo.convert %6068 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6070 = stablehlo.reduce(%6055 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6071 = stablehlo.reshape %6070 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6072 = stablehlo.broadcast_in_dim %6071, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6073 = stablehlo.divide %6072, %3003 : tensor<1x1200x1xf32>
-    %6074 = stablehlo.broadcast_in_dim %6069, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6075 = stablehlo.add %6074, %3006 : tensor<1x1200x1xf32>
-    %6076 = stablehlo.rsqrt %6075 : tensor<1x1200x1xf32>
-    %6077 = stablehlo.broadcast_in_dim %6055, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6078 = stablehlo.broadcast_in_dim %6073, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6079 = stablehlo.subtract %6077, %6078 : tensor<1x1200x320xf32>
-    %6080 = stablehlo.broadcast_in_dim %6079, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6081 = stablehlo.broadcast_in_dim %6076, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6082 = stablehlo.multiply %6080, %6081 : tensor<1x1200x320xf32>
-    %6083 = stablehlo.convert %arg270 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6084 = stablehlo.broadcast_in_dim %6082, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6085 = stablehlo.broadcast_in_dim %6083, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6086 = stablehlo.multiply %6084, %6085 : tensor<1x1200x320xf32>
-    %6087 = stablehlo.convert %arg271 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6088 = stablehlo.broadcast_in_dim %6086, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6089 = stablehlo.broadcast_in_dim %6087, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6090 = stablehlo.add %6088, %6089 : tensor<1x1200x320xf32>
-    %6091 = stablehlo.convert %6090 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %6092 = stablehlo.reshape %6091 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6093 = stablehlo.convert %6092 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6094 = stablehlo.dot_general %6093, %arg745, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6095 = stablehlo.broadcast_in_dim %6094, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6096 = stablehlo.multiply %6095, %3065 : tensor<1200x320xf32>
-    %6097 = stablehlo.broadcast_in_dim %6096, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6098 = stablehlo.broadcast_in_dim %arg746, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6099 = stablehlo.add %6097, %6098 : tensor<1200x320xf32>
-    %6100 = stablehlo.convert %6099 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6101 = stablehlo.reshape %6100 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6102 = stablehlo.reshape %6101 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6103 = stablehlo.transpose %6102, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6104 = stablehlo.transpose %6091, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %6105 = stablehlo.reshape %6104 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %6106 = stablehlo.convolution(%6105, %arg272) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %6107 = stablehlo.reshape %arg273 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %6108 = stablehlo.broadcast_in_dim %6106, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %6109 = stablehlo.broadcast_in_dim %6107, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %6110 = stablehlo.add %6108, %6109 : tensor<1x320x15x20xbf16>
-    %6111 = stablehlo.reshape %6110 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %6112 = stablehlo.transpose %6111, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %6113 = stablehlo.convert %6112 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %6114 = stablehlo.convert %6113 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %6115 = stablehlo.reduce(%6114 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6116 = stablehlo.reshape %6115 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6117 = stablehlo.broadcast_in_dim %6116, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6118 = stablehlo.divide %6117, %3088 : tensor<1x300x1xf64>
-    %6119 = stablehlo.broadcast_in_dim %6114, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %6120 = stablehlo.broadcast_in_dim %6118, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %6121 = stablehlo.subtract %6119, %6120 : tensor<1x300x320xf64>
-    %6122 = stablehlo.multiply %6121, %6121 : tensor<1x300x320xf64>
-    %6123 = stablehlo.reduce(%6122 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6124 = stablehlo.reshape %6123 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6125 = stablehlo.broadcast_in_dim %6124, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6126 = stablehlo.divide %6125, %3088 : tensor<1x300x1xf64>
-    %6127 = stablehlo.convert %6126 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %6128 = stablehlo.reduce(%6113 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %6129 = stablehlo.reshape %6128 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %6130 = stablehlo.broadcast_in_dim %6129, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6131 = stablehlo.divide %6130, %3102 : tensor<1x300x1xf32>
-    %6132 = stablehlo.broadcast_in_dim %6127, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6133 = stablehlo.add %6132, %136 : tensor<1x300x1xf32>
-    %6134 = stablehlo.rsqrt %6133 : tensor<1x300x1xf32>
-    %6135 = stablehlo.broadcast_in_dim %6113, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6136 = stablehlo.broadcast_in_dim %6131, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6137 = stablehlo.subtract %6135, %6136 : tensor<1x300x320xf32>
-    %6138 = stablehlo.broadcast_in_dim %6137, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6139 = stablehlo.broadcast_in_dim %6134, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6140 = stablehlo.multiply %6138, %6139 : tensor<1x300x320xf32>
-    %6141 = stablehlo.convert %arg274 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6142 = stablehlo.broadcast_in_dim %6140, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6143 = stablehlo.broadcast_in_dim %6141, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6144 = stablehlo.multiply %6142, %6143 : tensor<1x300x320xf32>
-    %6145 = stablehlo.convert %arg275 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6146 = stablehlo.broadcast_in_dim %6144, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6147 = stablehlo.broadcast_in_dim %6145, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6148 = stablehlo.add %6146, %6147 : tensor<1x300x320xf32>
-    %6149 = stablehlo.convert %6148 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %6150 = stablehlo.reshape %6149 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %6151 = stablehlo.convert %6150 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %6152 = stablehlo.dot_general %6151, %arg747, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6153 = stablehlo.broadcast_in_dim %6152, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6154 = stablehlo.multiply %6153, %3126 : tensor<300x320xf32>
-    %6155 = stablehlo.broadcast_in_dim %6154, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6156 = stablehlo.broadcast_in_dim %arg748, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6157 = stablehlo.add %6155, %6156 : tensor<300x320xf32>
-    %6158 = stablehlo.convert %6157 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6159 = stablehlo.reshape %6158 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6160 = stablehlo.reshape %6159 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6161 = stablehlo.transpose %6160, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6162 = stablehlo.dot_general %6151, %arg749, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6163 = stablehlo.broadcast_in_dim %6162, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6164 = stablehlo.multiply %6163, %3126 : tensor<300x320xf32>
-    %6165 = stablehlo.broadcast_in_dim %6164, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6166 = stablehlo.broadcast_in_dim %arg750, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6167 = stablehlo.add %6165, %6166 : tensor<300x320xf32>
-    %6168 = stablehlo.convert %6167 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6169 = stablehlo.reshape %6168 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6170 = stablehlo.reshape %6169 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6171 = stablehlo.transpose %6170, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6172 = stablehlo.transpose %6161, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %6173 = stablehlo.reshape %6103 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6174 = stablehlo.reshape %6172 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6175 = stablehlo.broadcast_in_dim %6174, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6176 = stablehlo.dot_general %6173, %6175, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6177 = stablehlo.reshape %6176 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6178 = stablehlo.broadcast_in_dim %6177, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6179 = stablehlo.divide %6178, %3152 : tensor<1x5x1200x300xbf16>
-    %6180 = stablehlo.convert %6179 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %6181 = stablehlo.reduce(%6180 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6182 = stablehlo.reshape %6181 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6183 = stablehlo.broadcast_in_dim %6180, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6184 = stablehlo.broadcast_in_dim %6182, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6185 = stablehlo.subtract %6183, %6184 : tensor<1x5x1200x300xf32>
-    %6186 = stablehlo.exponential %6185 : tensor<1x5x1200x300xf32>
-    %6187 = stablehlo.reduce(%6186 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6188 = stablehlo.reshape %6187 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6189 = stablehlo.broadcast_in_dim %6186, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6190 = stablehlo.broadcast_in_dim %6188, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6191 = stablehlo.divide %6189, %6190 : tensor<1x5x1200x300xf32>
-    %6192 = stablehlo.convert %6191 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %6193 = stablehlo.reshape %6192 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6194 = stablehlo.reshape %6171 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6195 = stablehlo.broadcast_in_dim %6194, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6196 = stablehlo.dot_general %6193, %6195, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6197 = stablehlo.reshape %6196 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6198 = stablehlo.transpose %6197, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6199 = stablehlo.reshape %6198 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %6200 = stablehlo.reshape %6199 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6201 = stablehlo.convert %6200 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6202 = stablehlo.dot_general %6201, %arg751, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6203 = stablehlo.broadcast_in_dim %6202, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6204 = stablehlo.multiply %6203, %3065 : tensor<1200x320xf32>
-    %6205 = stablehlo.broadcast_in_dim %6204, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6206 = stablehlo.broadcast_in_dim %arg752, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6207 = stablehlo.add %6205, %6206 : tensor<1200x320xf32>
-    %6208 = stablehlo.convert %6207 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6209 = stablehlo.reshape %6208 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6210 = stablehlo.add %6209, %6054 : tensor<1x1200x320xbf16>
-    %6211 = stablehlo.convert %6210 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6212 = stablehlo.convert %6211 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6213 = stablehlo.reduce(%6212 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6214 = stablehlo.reshape %6213 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6215 = stablehlo.broadcast_in_dim %6214, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6216 = stablehlo.divide %6215, %2987 : tensor<1x1200x1xf64>
-    %6217 = stablehlo.broadcast_in_dim %6212, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6218 = stablehlo.broadcast_in_dim %6216, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6219 = stablehlo.subtract %6217, %6218 : tensor<1x1200x320xf64>
-    %6220 = stablehlo.multiply %6219, %6219 : tensor<1x1200x320xf64>
-    %6221 = stablehlo.reduce(%6220 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6222 = stablehlo.reshape %6221 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6223 = stablehlo.broadcast_in_dim %6222, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6224 = stablehlo.divide %6223, %2987 : tensor<1x1200x1xf64>
-    %6225 = stablehlo.convert %6224 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6226 = stablehlo.reduce(%6211 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6227 = stablehlo.reshape %6226 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6228 = stablehlo.broadcast_in_dim %6227, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6229 = stablehlo.divide %6228, %3003 : tensor<1x1200x1xf32>
-    %6230 = stablehlo.broadcast_in_dim %6225, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6231 = stablehlo.add %6230, %3006 : tensor<1x1200x1xf32>
-    %6232 = stablehlo.rsqrt %6231 : tensor<1x1200x1xf32>
-    %6233 = stablehlo.broadcast_in_dim %6211, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6234 = stablehlo.broadcast_in_dim %6229, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6235 = stablehlo.subtract %6233, %6234 : tensor<1x1200x320xf32>
-    %6236 = stablehlo.broadcast_in_dim %6235, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6237 = stablehlo.broadcast_in_dim %6232, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6238 = stablehlo.multiply %6236, %6237 : tensor<1x1200x320xf32>
-    %6239 = stablehlo.convert %arg276 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6240 = stablehlo.broadcast_in_dim %6238, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6241 = stablehlo.broadcast_in_dim %6239, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6242 = stablehlo.multiply %6240, %6241 : tensor<1x1200x320xf32>
-    %6243 = stablehlo.convert %arg277 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6244 = stablehlo.broadcast_in_dim %6242, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6245 = stablehlo.broadcast_in_dim %6243, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6246 = stablehlo.add %6244, %6245 : tensor<1x1200x320xf32>
-    %6247 = stablehlo.convert %6246 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %6248 = stablehlo.reshape %6247 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6249 = stablehlo.convert %6248 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6250 = stablehlo.dot_general %6249, %arg753, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %6251 = stablehlo.broadcast_in_dim %6250, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6252 = stablehlo.multiply %6251, %3226 : tensor<1200x1280xf32>
-    %6253 = stablehlo.broadcast_in_dim %6252, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6254 = stablehlo.broadcast_in_dim %arg754, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %6255 = stablehlo.add %6253, %6254 : tensor<1200x1280xf32>
-    %6256 = stablehlo.convert %6255 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %6257 = stablehlo.reshape %6256 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %6258 = stablehlo.transpose %6257, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %6259 = stablehlo.reshape %6258 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6260 = stablehlo.convolution(%6259, %arg278) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6261 = stablehlo.reshape %arg279 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %6262 = stablehlo.broadcast_in_dim %6260, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6263 = stablehlo.broadcast_in_dim %6261, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6264 = stablehlo.add %6262, %6263 : tensor<1x1280x30x40xbf16>
-    %6265 = stablehlo.reshape %6264 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %6266 = stablehlo.transpose %6265, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %6267 = stablehlo.multiply %6266, %cst_42 : tensor<1x1200x1280xbf16>
-    %6268 = stablehlo.multiply %6266, %3243 : tensor<1x1200x1280xbf16>
-    %6269 = stablehlo.convert %6268 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %6270 = stablehlo.clamp %cst_43, %6269, %cst_44 : tensor<1x1200x1280xf32>
-    %6271 = stablehlo.multiply %6270, %6270 : tensor<1x1200x1280xf32>
-    %6272 = stablehlo.multiply %cst_45, %6271 : tensor<1x1200x1280xf32>
-    %6273 = stablehlo.add %6272, %cst_46 : tensor<1x1200x1280xf32>
-    %6274 = stablehlo.multiply %6273, %6271 : tensor<1x1200x1280xf32>
-    %6275 = stablehlo.add %6274, %cst_47 : tensor<1x1200x1280xf32>
-    %6276 = stablehlo.multiply %6275, %6271 : tensor<1x1200x1280xf32>
-    %6277 = stablehlo.add %6276, %cst_48 : tensor<1x1200x1280xf32>
-    %6278 = stablehlo.multiply %6277, %6271 : tensor<1x1200x1280xf32>
-    %6279 = stablehlo.add %6278, %cst_49 : tensor<1x1200x1280xf32>
-    %6280 = stablehlo.multiply %6279, %6271 : tensor<1x1200x1280xf32>
-    %6281 = stablehlo.add %6280, %cst_50 : tensor<1x1200x1280xf32>
-    %6282 = stablehlo.multiply %6281, %6271 : tensor<1x1200x1280xf32>
-    %6283 = stablehlo.add %6282, %cst_51 : tensor<1x1200x1280xf32>
-    %6284 = stablehlo.multiply %cst_52, %6271 : tensor<1x1200x1280xf32>
-    %6285 = stablehlo.add %6284, %cst_53 : tensor<1x1200x1280xf32>
-    %6286 = stablehlo.multiply %6285, %6271 : tensor<1x1200x1280xf32>
-    %6287 = stablehlo.add %6286, %cst_54 : tensor<1x1200x1280xf32>
-    %6288 = stablehlo.multiply %6287, %6271 : tensor<1x1200x1280xf32>
-    %6289 = stablehlo.add %6288, %cst_55 : tensor<1x1200x1280xf32>
-    %6290 = stablehlo.multiply %6289, %6271 : tensor<1x1200x1280xf32>
-    %6291 = stablehlo.add %6290, %cst_56 : tensor<1x1200x1280xf32>
-    %6292 = stablehlo.multiply %6270, %6283 : tensor<1x1200x1280xf32>
-    %6293 = stablehlo.divide %6292, %6291 : tensor<1x1200x1280xf32>
-    %6294 = stablehlo.clamp %cst_57, %6293, %cst_58 : tensor<1x1200x1280xf32>
-    %6295 = stablehlo.convert %6294 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %6296 = stablehlo.add %6295, %cst_40 : tensor<1x1200x1280xbf16>
-    %6297 = stablehlo.multiply %6296, %6267 : tensor<1x1200x1280xbf16>
-    %6298 = stablehlo.reshape %6297 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %6299 = stablehlo.dot_general %6298, %arg755, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %6300 = stablehlo.reshape %6299 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6301 = stablehlo.broadcast_in_dim %6300, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6302 = stablehlo.broadcast_in_dim %arg280, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %6303 = stablehlo.add %6301, %6302 : tensor<1x1200x320xbf16>
-    %6304 = stablehlo.reshape %6303 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6305 = stablehlo.reshape %6304 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6306 = stablehlo.add %6305, %6210 : tensor<1x1200x320xbf16>
-    %6307 = stablehlo.convert %6306 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6308 = stablehlo.convert %6307 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6309 = stablehlo.reduce(%6308 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6310 = stablehlo.reshape %6309 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6311 = stablehlo.broadcast_in_dim %6310, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6312 = stablehlo.divide %6311, %2987 : tensor<1x1200x1xf64>
-    %6313 = stablehlo.broadcast_in_dim %6308, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6314 = stablehlo.broadcast_in_dim %6312, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6315 = stablehlo.subtract %6313, %6314 : tensor<1x1200x320xf64>
-    %6316 = stablehlo.multiply %6315, %6315 : tensor<1x1200x320xf64>
-    %6317 = stablehlo.reduce(%6316 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6318 = stablehlo.reshape %6317 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6319 = stablehlo.broadcast_in_dim %6318, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6320 = stablehlo.divide %6319, %2987 : tensor<1x1200x1xf64>
-    %6321 = stablehlo.convert %6320 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6322 = stablehlo.reduce(%6307 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6323 = stablehlo.reshape %6322 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6324 = stablehlo.broadcast_in_dim %6323, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6325 = stablehlo.divide %6324, %3003 : tensor<1x1200x1xf32>
-    %6326 = stablehlo.broadcast_in_dim %6321, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6327 = stablehlo.add %6326, %3006 : tensor<1x1200x1xf32>
-    %6328 = stablehlo.rsqrt %6327 : tensor<1x1200x1xf32>
-    %6329 = stablehlo.broadcast_in_dim %6307, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6330 = stablehlo.broadcast_in_dim %6325, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6331 = stablehlo.subtract %6329, %6330 : tensor<1x1200x320xf32>
-    %6332 = stablehlo.broadcast_in_dim %6331, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6333 = stablehlo.broadcast_in_dim %6328, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6334 = stablehlo.multiply %6332, %6333 : tensor<1x1200x320xf32>
-    %6335 = stablehlo.convert %arg281 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6336 = stablehlo.broadcast_in_dim %6334, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6337 = stablehlo.broadcast_in_dim %6335, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6338 = stablehlo.multiply %6336, %6337 : tensor<1x1200x320xf32>
-    %6339 = stablehlo.convert %arg282 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6340 = stablehlo.broadcast_in_dim %6338, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6341 = stablehlo.broadcast_in_dim %6339, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6342 = stablehlo.add %6340, %6341 : tensor<1x1200x320xf32>
-    %6343 = stablehlo.convert %6342 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %6344 = stablehlo.reshape %6343 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6345 = stablehlo.convert %6344 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6346 = stablehlo.dot_general %6345, %arg756, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6347 = stablehlo.broadcast_in_dim %6346, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6348 = stablehlo.multiply %6347, %3065 : tensor<1200x320xf32>
-    %6349 = stablehlo.broadcast_in_dim %6348, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6350 = stablehlo.broadcast_in_dim %arg757, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6351 = stablehlo.add %6349, %6350 : tensor<1200x320xf32>
-    %6352 = stablehlo.convert %6351 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6353 = stablehlo.reshape %6352 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6354 = stablehlo.reshape %6353 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6355 = stablehlo.transpose %6354, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6356 = stablehlo.transpose %6343, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %6357 = stablehlo.reshape %6356 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %6358 = stablehlo.convolution(%6357, %arg283) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %6359 = stablehlo.reshape %arg284 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %6360 = stablehlo.broadcast_in_dim %6358, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %6361 = stablehlo.broadcast_in_dim %6359, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %6362 = stablehlo.add %6360, %6361 : tensor<1x320x15x20xbf16>
-    %6363 = stablehlo.reshape %6362 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %6364 = stablehlo.transpose %6363, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %6365 = stablehlo.convert %6364 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %6366 = stablehlo.convert %6365 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %6367 = stablehlo.reduce(%6366 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6368 = stablehlo.reshape %6367 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6369 = stablehlo.broadcast_in_dim %6368, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6370 = stablehlo.divide %6369, %3088 : tensor<1x300x1xf64>
-    %6371 = stablehlo.broadcast_in_dim %6366, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %6372 = stablehlo.broadcast_in_dim %6370, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %6373 = stablehlo.subtract %6371, %6372 : tensor<1x300x320xf64>
-    %6374 = stablehlo.multiply %6373, %6373 : tensor<1x300x320xf64>
-    %6375 = stablehlo.reduce(%6374 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6376 = stablehlo.reshape %6375 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6377 = stablehlo.broadcast_in_dim %6376, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6378 = stablehlo.divide %6377, %3088 : tensor<1x300x1xf64>
-    %6379 = stablehlo.convert %6378 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %6380 = stablehlo.reduce(%6365 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %6381 = stablehlo.reshape %6380 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %6382 = stablehlo.broadcast_in_dim %6381, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6383 = stablehlo.divide %6382, %3102 : tensor<1x300x1xf32>
-    %6384 = stablehlo.broadcast_in_dim %6379, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6385 = stablehlo.add %6384, %136 : tensor<1x300x1xf32>
-    %6386 = stablehlo.rsqrt %6385 : tensor<1x300x1xf32>
-    %6387 = stablehlo.broadcast_in_dim %6365, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6388 = stablehlo.broadcast_in_dim %6383, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6389 = stablehlo.subtract %6387, %6388 : tensor<1x300x320xf32>
-    %6390 = stablehlo.broadcast_in_dim %6389, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6391 = stablehlo.broadcast_in_dim %6386, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6392 = stablehlo.multiply %6390, %6391 : tensor<1x300x320xf32>
-    %6393 = stablehlo.convert %arg285 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6394 = stablehlo.broadcast_in_dim %6392, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6395 = stablehlo.broadcast_in_dim %6393, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6396 = stablehlo.multiply %6394, %6395 : tensor<1x300x320xf32>
-    %6397 = stablehlo.convert %arg286 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6398 = stablehlo.broadcast_in_dim %6396, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6399 = stablehlo.broadcast_in_dim %6397, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6400 = stablehlo.add %6398, %6399 : tensor<1x300x320xf32>
-    %6401 = stablehlo.convert %6400 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %6402 = stablehlo.reshape %6401 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %6403 = stablehlo.convert %6402 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %6404 = stablehlo.dot_general %6403, %arg758, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6405 = stablehlo.broadcast_in_dim %6404, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6406 = stablehlo.multiply %6405, %3126 : tensor<300x320xf32>
-    %6407 = stablehlo.broadcast_in_dim %6406, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6408 = stablehlo.broadcast_in_dim %arg759, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6409 = stablehlo.add %6407, %6408 : tensor<300x320xf32>
-    %6410 = stablehlo.convert %6409 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6411 = stablehlo.reshape %6410 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6412 = stablehlo.reshape %6411 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6413 = stablehlo.transpose %6412, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6414 = stablehlo.dot_general %6403, %arg760, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6415 = stablehlo.broadcast_in_dim %6414, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6416 = stablehlo.multiply %6415, %3126 : tensor<300x320xf32>
-    %6417 = stablehlo.broadcast_in_dim %6416, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6418 = stablehlo.broadcast_in_dim %arg761, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6419 = stablehlo.add %6417, %6418 : tensor<300x320xf32>
-    %6420 = stablehlo.convert %6419 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6421 = stablehlo.reshape %6420 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6422 = stablehlo.reshape %6421 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6423 = stablehlo.transpose %6422, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6424 = stablehlo.transpose %6413, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %6425 = stablehlo.reshape %6355 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6426 = stablehlo.reshape %6424 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6427 = stablehlo.broadcast_in_dim %6426, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6428 = stablehlo.dot_general %6425, %6427, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6429 = stablehlo.reshape %6428 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6430 = stablehlo.broadcast_in_dim %6429, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6431 = stablehlo.divide %6430, %3152 : tensor<1x5x1200x300xbf16>
-    %6432 = stablehlo.convert %6431 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %6433 = stablehlo.reduce(%6432 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6434 = stablehlo.reshape %6433 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6435 = stablehlo.broadcast_in_dim %6432, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6436 = stablehlo.broadcast_in_dim %6434, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6437 = stablehlo.subtract %6435, %6436 : tensor<1x5x1200x300xf32>
-    %6438 = stablehlo.exponential %6437 : tensor<1x5x1200x300xf32>
-    %6439 = stablehlo.reduce(%6438 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6440 = stablehlo.reshape %6439 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6441 = stablehlo.broadcast_in_dim %6438, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6442 = stablehlo.broadcast_in_dim %6440, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6443 = stablehlo.divide %6441, %6442 : tensor<1x5x1200x300xf32>
-    %6444 = stablehlo.convert %6443 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %6445 = stablehlo.reshape %6444 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6446 = stablehlo.reshape %6423 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6447 = stablehlo.broadcast_in_dim %6446, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6448 = stablehlo.dot_general %6445, %6447, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6449 = stablehlo.reshape %6448 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6450 = stablehlo.transpose %6449, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6451 = stablehlo.reshape %6450 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %6452 = stablehlo.reshape %6451 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6453 = stablehlo.convert %6452 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6454 = stablehlo.dot_general %6453, %arg762, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6455 = stablehlo.broadcast_in_dim %6454, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6456 = stablehlo.multiply %6455, %3065 : tensor<1200x320xf32>
-    %6457 = stablehlo.broadcast_in_dim %6456, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6458 = stablehlo.broadcast_in_dim %arg763, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6459 = stablehlo.add %6457, %6458 : tensor<1200x320xf32>
-    %6460 = stablehlo.convert %6459 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6461 = stablehlo.reshape %6460 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6462 = stablehlo.add %6461, %6306 : tensor<1x1200x320xbf16>
-    %6463 = stablehlo.convert %6462 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6464 = stablehlo.convert %6463 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6465 = stablehlo.reduce(%6464 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6466 = stablehlo.reshape %6465 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6467 = stablehlo.broadcast_in_dim %6466, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6468 = stablehlo.divide %6467, %2987 : tensor<1x1200x1xf64>
-    %6469 = stablehlo.broadcast_in_dim %6464, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6470 = stablehlo.broadcast_in_dim %6468, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6471 = stablehlo.subtract %6469, %6470 : tensor<1x1200x320xf64>
-    %6472 = stablehlo.multiply %6471, %6471 : tensor<1x1200x320xf64>
-    %6473 = stablehlo.reduce(%6472 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6474 = stablehlo.reshape %6473 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6475 = stablehlo.broadcast_in_dim %6474, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6476 = stablehlo.divide %6475, %2987 : tensor<1x1200x1xf64>
-    %6477 = stablehlo.convert %6476 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6478 = stablehlo.reduce(%6463 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6479 = stablehlo.reshape %6478 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6480 = stablehlo.broadcast_in_dim %6479, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6481 = stablehlo.divide %6480, %3003 : tensor<1x1200x1xf32>
-    %6482 = stablehlo.broadcast_in_dim %6477, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6483 = stablehlo.add %6482, %3006 : tensor<1x1200x1xf32>
-    %6484 = stablehlo.rsqrt %6483 : tensor<1x1200x1xf32>
-    %6485 = stablehlo.broadcast_in_dim %6463, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6486 = stablehlo.broadcast_in_dim %6481, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6487 = stablehlo.subtract %6485, %6486 : tensor<1x1200x320xf32>
-    %6488 = stablehlo.broadcast_in_dim %6487, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6489 = stablehlo.broadcast_in_dim %6484, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6490 = stablehlo.multiply %6488, %6489 : tensor<1x1200x320xf32>
-    %6491 = stablehlo.convert %arg287 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6492 = stablehlo.broadcast_in_dim %6490, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6493 = stablehlo.broadcast_in_dim %6491, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6494 = stablehlo.multiply %6492, %6493 : tensor<1x1200x320xf32>
-    %6495 = stablehlo.convert %arg288 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6496 = stablehlo.broadcast_in_dim %6494, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6497 = stablehlo.broadcast_in_dim %6495, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6498 = stablehlo.add %6496, %6497 : tensor<1x1200x320xf32>
-    %6499 = stablehlo.convert %6498 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %6500 = stablehlo.reshape %6499 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6501 = stablehlo.convert %6500 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6502 = stablehlo.dot_general %6501, %arg764, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %6503 = stablehlo.broadcast_in_dim %6502, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6504 = stablehlo.multiply %6503, %3226 : tensor<1200x1280xf32>
-    %6505 = stablehlo.broadcast_in_dim %6504, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6506 = stablehlo.broadcast_in_dim %arg765, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %6507 = stablehlo.add %6505, %6506 : tensor<1200x1280xf32>
-    %6508 = stablehlo.convert %6507 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %6509 = stablehlo.reshape %6508 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %6510 = stablehlo.transpose %6509, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %6511 = stablehlo.reshape %6510 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6512 = stablehlo.convolution(%6511, %arg289) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6513 = stablehlo.reshape %arg290 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %6514 = stablehlo.broadcast_in_dim %6512, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6515 = stablehlo.broadcast_in_dim %6513, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6516 = stablehlo.add %6514, %6515 : tensor<1x1280x30x40xbf16>
-    %6517 = stablehlo.reshape %6516 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %6518 = stablehlo.transpose %6517, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %6519 = stablehlo.multiply %6518, %cst_42 : tensor<1x1200x1280xbf16>
-    %6520 = stablehlo.multiply %6518, %3243 : tensor<1x1200x1280xbf16>
-    %6521 = stablehlo.convert %6520 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %6522 = stablehlo.clamp %cst_43, %6521, %cst_44 : tensor<1x1200x1280xf32>
-    %6523 = stablehlo.multiply %6522, %6522 : tensor<1x1200x1280xf32>
-    %6524 = stablehlo.multiply %cst_45, %6523 : tensor<1x1200x1280xf32>
-    %6525 = stablehlo.add %6524, %cst_46 : tensor<1x1200x1280xf32>
-    %6526 = stablehlo.multiply %6525, %6523 : tensor<1x1200x1280xf32>
-    %6527 = stablehlo.add %6526, %cst_47 : tensor<1x1200x1280xf32>
-    %6528 = stablehlo.multiply %6527, %6523 : tensor<1x1200x1280xf32>
-    %6529 = stablehlo.add %6528, %cst_48 : tensor<1x1200x1280xf32>
-    %6530 = stablehlo.multiply %6529, %6523 : tensor<1x1200x1280xf32>
-    %6531 = stablehlo.add %6530, %cst_49 : tensor<1x1200x1280xf32>
-    %6532 = stablehlo.multiply %6531, %6523 : tensor<1x1200x1280xf32>
-    %6533 = stablehlo.add %6532, %cst_50 : tensor<1x1200x1280xf32>
-    %6534 = stablehlo.multiply %6533, %6523 : tensor<1x1200x1280xf32>
-    %6535 = stablehlo.add %6534, %cst_51 : tensor<1x1200x1280xf32>
-    %6536 = stablehlo.multiply %cst_52, %6523 : tensor<1x1200x1280xf32>
-    %6537 = stablehlo.add %6536, %cst_53 : tensor<1x1200x1280xf32>
-    %6538 = stablehlo.multiply %6537, %6523 : tensor<1x1200x1280xf32>
-    %6539 = stablehlo.add %6538, %cst_54 : tensor<1x1200x1280xf32>
-    %6540 = stablehlo.multiply %6539, %6523 : tensor<1x1200x1280xf32>
-    %6541 = stablehlo.add %6540, %cst_55 : tensor<1x1200x1280xf32>
-    %6542 = stablehlo.multiply %6541, %6523 : tensor<1x1200x1280xf32>
-    %6543 = stablehlo.add %6542, %cst_56 : tensor<1x1200x1280xf32>
-    %6544 = stablehlo.multiply %6522, %6535 : tensor<1x1200x1280xf32>
-    %6545 = stablehlo.divide %6544, %6543 : tensor<1x1200x1280xf32>
-    %6546 = stablehlo.clamp %cst_57, %6545, %cst_58 : tensor<1x1200x1280xf32>
-    %6547 = stablehlo.convert %6546 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %6548 = stablehlo.add %6547, %cst_40 : tensor<1x1200x1280xbf16>
-    %6549 = stablehlo.multiply %6548, %6519 : tensor<1x1200x1280xbf16>
-    %6550 = stablehlo.reshape %6549 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %6551 = stablehlo.dot_general %6550, %arg766, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %6552 = stablehlo.reshape %6551 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6553 = stablehlo.broadcast_in_dim %6552, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6554 = stablehlo.broadcast_in_dim %arg291, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %6555 = stablehlo.add %6553, %6554 : tensor<1x1200x320xbf16>
-    %6556 = stablehlo.reshape %6555 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6557 = stablehlo.reshape %6556 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6558 = stablehlo.add %6557, %6462 : tensor<1x1200x320xbf16>
-    %6559 = stablehlo.convert %6558 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6560 = stablehlo.convert %6559 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6561 = stablehlo.reduce(%6560 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6562 = stablehlo.reshape %6561 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6563 = stablehlo.broadcast_in_dim %6562, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6564 = stablehlo.divide %6563, %2987 : tensor<1x1200x1xf64>
-    %6565 = stablehlo.broadcast_in_dim %6560, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6566 = stablehlo.broadcast_in_dim %6564, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6567 = stablehlo.subtract %6565, %6566 : tensor<1x1200x320xf64>
-    %6568 = stablehlo.multiply %6567, %6567 : tensor<1x1200x320xf64>
-    %6569 = stablehlo.reduce(%6568 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6570 = stablehlo.reshape %6569 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6571 = stablehlo.broadcast_in_dim %6570, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6572 = stablehlo.divide %6571, %2987 : tensor<1x1200x1xf64>
-    %6573 = stablehlo.convert %6572 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6574 = stablehlo.reduce(%6559 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6575 = stablehlo.reshape %6574 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6576 = stablehlo.broadcast_in_dim %6575, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6577 = stablehlo.divide %6576, %3003 : tensor<1x1200x1xf32>
-    %6578 = stablehlo.broadcast_in_dim %6573, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6579 = stablehlo.add %6578, %3006 : tensor<1x1200x1xf32>
-    %6580 = stablehlo.rsqrt %6579 : tensor<1x1200x1xf32>
-    %6581 = stablehlo.broadcast_in_dim %6559, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6582 = stablehlo.broadcast_in_dim %6577, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6583 = stablehlo.subtract %6581, %6582 : tensor<1x1200x320xf32>
-    %6584 = stablehlo.broadcast_in_dim %6583, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6585 = stablehlo.broadcast_in_dim %6580, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6586 = stablehlo.multiply %6584, %6585 : tensor<1x1200x320xf32>
-    %6587 = stablehlo.convert %arg292 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6588 = stablehlo.broadcast_in_dim %6586, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6589 = stablehlo.broadcast_in_dim %6587, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6590 = stablehlo.multiply %6588, %6589 : tensor<1x1200x320xf32>
-    %6591 = stablehlo.convert %arg293 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6592 = stablehlo.broadcast_in_dim %6590, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6593 = stablehlo.broadcast_in_dim %6591, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6594 = stablehlo.add %6592, %6593 : tensor<1x1200x320xf32>
-    %6595 = stablehlo.convert %6594 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %6596 = stablehlo.reshape %6595 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6597 = stablehlo.convert %6596 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6598 = stablehlo.dot_general %6597, %arg767, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6599 = stablehlo.broadcast_in_dim %6598, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6600 = stablehlo.multiply %6599, %3065 : tensor<1200x320xf32>
-    %6601 = stablehlo.broadcast_in_dim %6600, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6602 = stablehlo.broadcast_in_dim %arg768, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6603 = stablehlo.add %6601, %6602 : tensor<1200x320xf32>
-    %6604 = stablehlo.convert %6603 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6605 = stablehlo.reshape %6604 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6606 = stablehlo.reshape %6605 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6607 = stablehlo.transpose %6606, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6608 = stablehlo.transpose %6595, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %6609 = stablehlo.reshape %6608 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %6610 = stablehlo.convolution(%6609, %arg294) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %6611 = stablehlo.reshape %arg295 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %6612 = stablehlo.broadcast_in_dim %6610, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %6613 = stablehlo.broadcast_in_dim %6611, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %6614 = stablehlo.add %6612, %6613 : tensor<1x320x15x20xbf16>
-    %6615 = stablehlo.reshape %6614 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %6616 = stablehlo.transpose %6615, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %6617 = stablehlo.convert %6616 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %6618 = stablehlo.convert %6617 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %6619 = stablehlo.reduce(%6618 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6620 = stablehlo.reshape %6619 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6621 = stablehlo.broadcast_in_dim %6620, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6622 = stablehlo.divide %6621, %3088 : tensor<1x300x1xf64>
-    %6623 = stablehlo.broadcast_in_dim %6618, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %6624 = stablehlo.broadcast_in_dim %6622, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %6625 = stablehlo.subtract %6623, %6624 : tensor<1x300x320xf64>
-    %6626 = stablehlo.multiply %6625, %6625 : tensor<1x300x320xf64>
-    %6627 = stablehlo.reduce(%6626 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6628 = stablehlo.reshape %6627 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6629 = stablehlo.broadcast_in_dim %6628, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6630 = stablehlo.divide %6629, %3088 : tensor<1x300x1xf64>
-    %6631 = stablehlo.convert %6630 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %6632 = stablehlo.reduce(%6617 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %6633 = stablehlo.reshape %6632 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %6634 = stablehlo.broadcast_in_dim %6633, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6635 = stablehlo.divide %6634, %3102 : tensor<1x300x1xf32>
-    %6636 = stablehlo.broadcast_in_dim %6631, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6637 = stablehlo.add %6636, %136 : tensor<1x300x1xf32>
-    %6638 = stablehlo.rsqrt %6637 : tensor<1x300x1xf32>
-    %6639 = stablehlo.broadcast_in_dim %6617, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6640 = stablehlo.broadcast_in_dim %6635, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6641 = stablehlo.subtract %6639, %6640 : tensor<1x300x320xf32>
-    %6642 = stablehlo.broadcast_in_dim %6641, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6643 = stablehlo.broadcast_in_dim %6638, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6644 = stablehlo.multiply %6642, %6643 : tensor<1x300x320xf32>
-    %6645 = stablehlo.convert %arg296 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6646 = stablehlo.broadcast_in_dim %6644, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6647 = stablehlo.broadcast_in_dim %6645, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6648 = stablehlo.multiply %6646, %6647 : tensor<1x300x320xf32>
-    %6649 = stablehlo.convert %arg297 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6650 = stablehlo.broadcast_in_dim %6648, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6651 = stablehlo.broadcast_in_dim %6649, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6652 = stablehlo.add %6650, %6651 : tensor<1x300x320xf32>
-    %6653 = stablehlo.convert %6652 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %6654 = stablehlo.reshape %6653 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %6655 = stablehlo.convert %6654 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %6656 = stablehlo.dot_general %6655, %arg769, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6657 = stablehlo.broadcast_in_dim %6656, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6658 = stablehlo.multiply %6657, %3126 : tensor<300x320xf32>
-    %6659 = stablehlo.broadcast_in_dim %6658, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6660 = stablehlo.broadcast_in_dim %arg770, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6661 = stablehlo.add %6659, %6660 : tensor<300x320xf32>
-    %6662 = stablehlo.convert %6661 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6663 = stablehlo.reshape %6662 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6664 = stablehlo.reshape %6663 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6665 = stablehlo.transpose %6664, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6666 = stablehlo.dot_general %6655, %arg771, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6667 = stablehlo.broadcast_in_dim %6666, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6668 = stablehlo.multiply %6667, %3126 : tensor<300x320xf32>
-    %6669 = stablehlo.broadcast_in_dim %6668, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6670 = stablehlo.broadcast_in_dim %arg772, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6671 = stablehlo.add %6669, %6670 : tensor<300x320xf32>
-    %6672 = stablehlo.convert %6671 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6673 = stablehlo.reshape %6672 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6674 = stablehlo.reshape %6673 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6675 = stablehlo.transpose %6674, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6676 = stablehlo.transpose %6665, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %6677 = stablehlo.reshape %6607 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6678 = stablehlo.reshape %6676 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6679 = stablehlo.broadcast_in_dim %6678, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6680 = stablehlo.dot_general %6677, %6679, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6681 = stablehlo.reshape %6680 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6682 = stablehlo.broadcast_in_dim %6681, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6683 = stablehlo.divide %6682, %3152 : tensor<1x5x1200x300xbf16>
-    %6684 = stablehlo.convert %6683 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %6685 = stablehlo.reduce(%6684 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6686 = stablehlo.reshape %6685 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6687 = stablehlo.broadcast_in_dim %6684, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6688 = stablehlo.broadcast_in_dim %6686, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6689 = stablehlo.subtract %6687, %6688 : tensor<1x5x1200x300xf32>
-    %6690 = stablehlo.exponential %6689 : tensor<1x5x1200x300xf32>
-    %6691 = stablehlo.reduce(%6690 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6692 = stablehlo.reshape %6691 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6693 = stablehlo.broadcast_in_dim %6690, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6694 = stablehlo.broadcast_in_dim %6692, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6695 = stablehlo.divide %6693, %6694 : tensor<1x5x1200x300xf32>
-    %6696 = stablehlo.convert %6695 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %6697 = stablehlo.reshape %6696 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6698 = stablehlo.reshape %6675 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6699 = stablehlo.broadcast_in_dim %6698, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6700 = stablehlo.dot_general %6697, %6699, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6701 = stablehlo.reshape %6700 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6702 = stablehlo.transpose %6701, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6703 = stablehlo.reshape %6702 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %6704 = stablehlo.reshape %6703 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6705 = stablehlo.convert %6704 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6706 = stablehlo.dot_general %6705, %arg773, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6707 = stablehlo.broadcast_in_dim %6706, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6708 = stablehlo.multiply %6707, %3065 : tensor<1200x320xf32>
-    %6709 = stablehlo.broadcast_in_dim %6708, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6710 = stablehlo.broadcast_in_dim %arg774, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6711 = stablehlo.add %6709, %6710 : tensor<1200x320xf32>
-    %6712 = stablehlo.convert %6711 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6713 = stablehlo.reshape %6712 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6714 = stablehlo.add %6713, %6558 : tensor<1x1200x320xbf16>
-    %6715 = stablehlo.convert %6714 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6716 = stablehlo.convert %6715 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6717 = stablehlo.reduce(%6716 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6718 = stablehlo.reshape %6717 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6719 = stablehlo.broadcast_in_dim %6718, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6720 = stablehlo.divide %6719, %2987 : tensor<1x1200x1xf64>
-    %6721 = stablehlo.broadcast_in_dim %6716, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6722 = stablehlo.broadcast_in_dim %6720, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6723 = stablehlo.subtract %6721, %6722 : tensor<1x1200x320xf64>
-    %6724 = stablehlo.multiply %6723, %6723 : tensor<1x1200x320xf64>
-    %6725 = stablehlo.reduce(%6724 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6726 = stablehlo.reshape %6725 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6727 = stablehlo.broadcast_in_dim %6726, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6728 = stablehlo.divide %6727, %2987 : tensor<1x1200x1xf64>
-    %6729 = stablehlo.convert %6728 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6730 = stablehlo.reduce(%6715 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6731 = stablehlo.reshape %6730 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6732 = stablehlo.broadcast_in_dim %6731, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6733 = stablehlo.divide %6732, %3003 : tensor<1x1200x1xf32>
-    %6734 = stablehlo.broadcast_in_dim %6729, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6735 = stablehlo.add %6734, %3006 : tensor<1x1200x1xf32>
-    %6736 = stablehlo.rsqrt %6735 : tensor<1x1200x1xf32>
-    %6737 = stablehlo.broadcast_in_dim %6715, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6738 = stablehlo.broadcast_in_dim %6733, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6739 = stablehlo.subtract %6737, %6738 : tensor<1x1200x320xf32>
-    %6740 = stablehlo.broadcast_in_dim %6739, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6741 = stablehlo.broadcast_in_dim %6736, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6742 = stablehlo.multiply %6740, %6741 : tensor<1x1200x320xf32>
-    %6743 = stablehlo.convert %arg298 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6744 = stablehlo.broadcast_in_dim %6742, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6745 = stablehlo.broadcast_in_dim %6743, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6746 = stablehlo.multiply %6744, %6745 : tensor<1x1200x320xf32>
-    %6747 = stablehlo.convert %arg299 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6748 = stablehlo.broadcast_in_dim %6746, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6749 = stablehlo.broadcast_in_dim %6747, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6750 = stablehlo.add %6748, %6749 : tensor<1x1200x320xf32>
-    %6751 = stablehlo.convert %6750 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %6752 = stablehlo.reshape %6751 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6753 = stablehlo.convert %6752 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6754 = stablehlo.dot_general %6753, %arg775, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %6755 = stablehlo.broadcast_in_dim %6754, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6756 = stablehlo.multiply %6755, %3226 : tensor<1200x1280xf32>
-    %6757 = stablehlo.broadcast_in_dim %6756, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %6758 = stablehlo.broadcast_in_dim %arg776, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %6759 = stablehlo.add %6757, %6758 : tensor<1200x1280xf32>
-    %6760 = stablehlo.convert %6759 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %6761 = stablehlo.reshape %6760 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %6762 = stablehlo.transpose %6761, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %6763 = stablehlo.reshape %6762 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6764 = stablehlo.convolution(%6763, %arg300) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6765 = stablehlo.reshape %arg301 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %6766 = stablehlo.broadcast_in_dim %6764, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6767 = stablehlo.broadcast_in_dim %6765, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %6768 = stablehlo.add %6766, %6767 : tensor<1x1280x30x40xbf16>
-    %6769 = stablehlo.reshape %6768 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %6770 = stablehlo.transpose %6769, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %6771 = stablehlo.multiply %6770, %cst_42 : tensor<1x1200x1280xbf16>
-    %6772 = stablehlo.multiply %6770, %3243 : tensor<1x1200x1280xbf16>
-    %6773 = stablehlo.convert %6772 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %6774 = stablehlo.clamp %cst_43, %6773, %cst_44 : tensor<1x1200x1280xf32>
-    %6775 = stablehlo.multiply %6774, %6774 : tensor<1x1200x1280xf32>
-    %6776 = stablehlo.multiply %cst_45, %6775 : tensor<1x1200x1280xf32>
-    %6777 = stablehlo.add %6776, %cst_46 : tensor<1x1200x1280xf32>
-    %6778 = stablehlo.multiply %6777, %6775 : tensor<1x1200x1280xf32>
-    %6779 = stablehlo.add %6778, %cst_47 : tensor<1x1200x1280xf32>
-    %6780 = stablehlo.multiply %6779, %6775 : tensor<1x1200x1280xf32>
-    %6781 = stablehlo.add %6780, %cst_48 : tensor<1x1200x1280xf32>
-    %6782 = stablehlo.multiply %6781, %6775 : tensor<1x1200x1280xf32>
-    %6783 = stablehlo.add %6782, %cst_49 : tensor<1x1200x1280xf32>
-    %6784 = stablehlo.multiply %6783, %6775 : tensor<1x1200x1280xf32>
-    %6785 = stablehlo.add %6784, %cst_50 : tensor<1x1200x1280xf32>
-    %6786 = stablehlo.multiply %6785, %6775 : tensor<1x1200x1280xf32>
-    %6787 = stablehlo.add %6786, %cst_51 : tensor<1x1200x1280xf32>
-    %6788 = stablehlo.multiply %cst_52, %6775 : tensor<1x1200x1280xf32>
-    %6789 = stablehlo.add %6788, %cst_53 : tensor<1x1200x1280xf32>
-    %6790 = stablehlo.multiply %6789, %6775 : tensor<1x1200x1280xf32>
-    %6791 = stablehlo.add %6790, %cst_54 : tensor<1x1200x1280xf32>
-    %6792 = stablehlo.multiply %6791, %6775 : tensor<1x1200x1280xf32>
-    %6793 = stablehlo.add %6792, %cst_55 : tensor<1x1200x1280xf32>
-    %6794 = stablehlo.multiply %6793, %6775 : tensor<1x1200x1280xf32>
-    %6795 = stablehlo.add %6794, %cst_56 : tensor<1x1200x1280xf32>
-    %6796 = stablehlo.multiply %6774, %6787 : tensor<1x1200x1280xf32>
-    %6797 = stablehlo.divide %6796, %6795 : tensor<1x1200x1280xf32>
-    %6798 = stablehlo.clamp %cst_57, %6797, %cst_58 : tensor<1x1200x1280xf32>
-    %6799 = stablehlo.convert %6798 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %6800 = stablehlo.add %6799, %cst_40 : tensor<1x1200x1280xbf16>
-    %6801 = stablehlo.multiply %6800, %6771 : tensor<1x1200x1280xbf16>
-    %6802 = stablehlo.reshape %6801 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %6803 = stablehlo.dot_general %6802, %arg777, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %6804 = stablehlo.reshape %6803 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6805 = stablehlo.broadcast_in_dim %6804, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6806 = stablehlo.broadcast_in_dim %arg302, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %6807 = stablehlo.add %6805, %6806 : tensor<1x1200x320xbf16>
-    %6808 = stablehlo.reshape %6807 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6809 = stablehlo.reshape %6808 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6810 = stablehlo.add %6809, %6714 : tensor<1x1200x320xbf16>
-    %6811 = stablehlo.convert %6810 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6812 = stablehlo.convert %6811 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6813 = stablehlo.reduce(%6812 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6814 = stablehlo.reshape %6813 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6815 = stablehlo.broadcast_in_dim %6814, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6816 = stablehlo.divide %6815, %2987 : tensor<1x1200x1xf64>
-    %6817 = stablehlo.broadcast_in_dim %6812, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6818 = stablehlo.broadcast_in_dim %6816, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6819 = stablehlo.subtract %6817, %6818 : tensor<1x1200x320xf64>
-    %6820 = stablehlo.multiply %6819, %6819 : tensor<1x1200x320xf64>
-    %6821 = stablehlo.reduce(%6820 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6822 = stablehlo.reshape %6821 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6823 = stablehlo.broadcast_in_dim %6822, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6824 = stablehlo.divide %6823, %2987 : tensor<1x1200x1xf64>
-    %6825 = stablehlo.convert %6824 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6826 = stablehlo.reduce(%6811 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6827 = stablehlo.reshape %6826 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6828 = stablehlo.broadcast_in_dim %6827, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6829 = stablehlo.divide %6828, %3003 : tensor<1x1200x1xf32>
-    %6830 = stablehlo.broadcast_in_dim %6825, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6831 = stablehlo.add %6830, %3006 : tensor<1x1200x1xf32>
-    %6832 = stablehlo.rsqrt %6831 : tensor<1x1200x1xf32>
-    %6833 = stablehlo.broadcast_in_dim %6811, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6834 = stablehlo.broadcast_in_dim %6829, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6835 = stablehlo.subtract %6833, %6834 : tensor<1x1200x320xf32>
-    %6836 = stablehlo.broadcast_in_dim %6835, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6837 = stablehlo.broadcast_in_dim %6832, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6838 = stablehlo.multiply %6836, %6837 : tensor<1x1200x320xf32>
-    %6839 = stablehlo.convert %arg303 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6840 = stablehlo.broadcast_in_dim %6838, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6841 = stablehlo.broadcast_in_dim %6839, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6842 = stablehlo.multiply %6840, %6841 : tensor<1x1200x320xf32>
-    %6843 = stablehlo.convert %arg304 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6844 = stablehlo.broadcast_in_dim %6842, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6845 = stablehlo.broadcast_in_dim %6843, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6846 = stablehlo.add %6844, %6845 : tensor<1x1200x320xf32>
-    %6847 = stablehlo.convert %6846 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %6848 = stablehlo.reshape %6847 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6849 = stablehlo.convert %6848 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6850 = stablehlo.dot_general %6849, %arg778, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6851 = stablehlo.broadcast_in_dim %6850, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6852 = stablehlo.multiply %6851, %3065 : tensor<1200x320xf32>
-    %6853 = stablehlo.broadcast_in_dim %6852, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6854 = stablehlo.broadcast_in_dim %arg779, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6855 = stablehlo.add %6853, %6854 : tensor<1200x320xf32>
-    %6856 = stablehlo.convert %6855 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6857 = stablehlo.reshape %6856 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6858 = stablehlo.reshape %6857 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6859 = stablehlo.transpose %6858, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6860 = stablehlo.transpose %6847, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %6861 = stablehlo.reshape %6860 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %6862 = stablehlo.convolution(%6861, %arg305) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %6863 = stablehlo.reshape %arg306 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %6864 = stablehlo.broadcast_in_dim %6862, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %6865 = stablehlo.broadcast_in_dim %6863, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %6866 = stablehlo.add %6864, %6865 : tensor<1x320x15x20xbf16>
-    %6867 = stablehlo.reshape %6866 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %6868 = stablehlo.transpose %6867, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %6869 = stablehlo.convert %6868 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %6870 = stablehlo.convert %6869 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %6871 = stablehlo.reduce(%6870 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6872 = stablehlo.reshape %6871 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6873 = stablehlo.broadcast_in_dim %6872, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6874 = stablehlo.divide %6873, %3088 : tensor<1x300x1xf64>
-    %6875 = stablehlo.broadcast_in_dim %6870, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %6876 = stablehlo.broadcast_in_dim %6874, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %6877 = stablehlo.subtract %6875, %6876 : tensor<1x300x320xf64>
-    %6878 = stablehlo.multiply %6877, %6877 : tensor<1x300x320xf64>
-    %6879 = stablehlo.reduce(%6878 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %6880 = stablehlo.reshape %6879 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %6881 = stablehlo.broadcast_in_dim %6880, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %6882 = stablehlo.divide %6881, %3088 : tensor<1x300x1xf64>
-    %6883 = stablehlo.convert %6882 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %6884 = stablehlo.reduce(%6869 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %6885 = stablehlo.reshape %6884 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %6886 = stablehlo.broadcast_in_dim %6885, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6887 = stablehlo.divide %6886, %3102 : tensor<1x300x1xf32>
-    %6888 = stablehlo.broadcast_in_dim %6883, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %6889 = stablehlo.add %6888, %136 : tensor<1x300x1xf32>
-    %6890 = stablehlo.rsqrt %6889 : tensor<1x300x1xf32>
-    %6891 = stablehlo.broadcast_in_dim %6869, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6892 = stablehlo.broadcast_in_dim %6887, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6893 = stablehlo.subtract %6891, %6892 : tensor<1x300x320xf32>
-    %6894 = stablehlo.broadcast_in_dim %6893, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6895 = stablehlo.broadcast_in_dim %6890, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %6896 = stablehlo.multiply %6894, %6895 : tensor<1x300x320xf32>
-    %6897 = stablehlo.convert %arg307 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6898 = stablehlo.broadcast_in_dim %6896, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6899 = stablehlo.broadcast_in_dim %6897, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6900 = stablehlo.multiply %6898, %6899 : tensor<1x300x320xf32>
-    %6901 = stablehlo.convert %arg308 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6902 = stablehlo.broadcast_in_dim %6900, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %6903 = stablehlo.broadcast_in_dim %6901, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %6904 = stablehlo.add %6902, %6903 : tensor<1x300x320xf32>
-    %6905 = stablehlo.convert %6904 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %6906 = stablehlo.reshape %6905 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %6907 = stablehlo.convert %6906 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %6908 = stablehlo.dot_general %6907, %arg780, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6909 = stablehlo.broadcast_in_dim %6908, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6910 = stablehlo.multiply %6909, %3126 : tensor<300x320xf32>
-    %6911 = stablehlo.broadcast_in_dim %6910, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6912 = stablehlo.broadcast_in_dim %arg781, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6913 = stablehlo.add %6911, %6912 : tensor<300x320xf32>
-    %6914 = stablehlo.convert %6913 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6915 = stablehlo.reshape %6914 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6916 = stablehlo.reshape %6915 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6917 = stablehlo.transpose %6916, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6918 = stablehlo.dot_general %6907, %arg782, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %6919 = stablehlo.broadcast_in_dim %6918, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6920 = stablehlo.multiply %6919, %3126 : tensor<300x320xf32>
-    %6921 = stablehlo.broadcast_in_dim %6920, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %6922 = stablehlo.broadcast_in_dim %arg783, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %6923 = stablehlo.add %6921, %6922 : tensor<300x320xf32>
-    %6924 = stablehlo.convert %6923 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %6925 = stablehlo.reshape %6924 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %6926 = stablehlo.reshape %6925 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %6927 = stablehlo.transpose %6926, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %6928 = stablehlo.transpose %6917, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %6929 = stablehlo.reshape %6859 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6930 = stablehlo.reshape %6928 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6931 = stablehlo.broadcast_in_dim %6930, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %6932 = stablehlo.dot_general %6929, %6931, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6933 = stablehlo.reshape %6932 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6934 = stablehlo.broadcast_in_dim %6933, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %6935 = stablehlo.divide %6934, %3152 : tensor<1x5x1200x300xbf16>
-    %6936 = stablehlo.convert %6935 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %6937 = stablehlo.reduce(%6936 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6938 = stablehlo.reshape %6937 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6939 = stablehlo.broadcast_in_dim %6936, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6940 = stablehlo.broadcast_in_dim %6938, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6941 = stablehlo.subtract %6939, %6940 : tensor<1x5x1200x300xf32>
-    %6942 = stablehlo.exponential %6941 : tensor<1x5x1200x300xf32>
-    %6943 = stablehlo.reduce(%6942 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %6944 = stablehlo.reshape %6943 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %6945 = stablehlo.broadcast_in_dim %6942, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %6946 = stablehlo.broadcast_in_dim %6944, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %6947 = stablehlo.divide %6945, %6946 : tensor<1x5x1200x300xf32>
-    %6948 = stablehlo.convert %6947 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %6949 = stablehlo.reshape %6948 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %6950 = stablehlo.reshape %6927 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6951 = stablehlo.broadcast_in_dim %6950, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %6952 = stablehlo.dot_general %6949, %6951, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %6953 = stablehlo.reshape %6952 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %6954 = stablehlo.transpose %6953, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %6955 = stablehlo.reshape %6954 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %6956 = stablehlo.reshape %6955 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %6957 = stablehlo.convert %6956 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %6958 = stablehlo.dot_general %6957, %arg784, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %6959 = stablehlo.broadcast_in_dim %6958, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6960 = stablehlo.multiply %6959, %3065 : tensor<1200x320xf32>
-    %6961 = stablehlo.broadcast_in_dim %6960, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %6962 = stablehlo.broadcast_in_dim %arg785, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %6963 = stablehlo.add %6961, %6962 : tensor<1200x320xf32>
-    %6964 = stablehlo.convert %6963 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %6965 = stablehlo.reshape %6964 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %6966 = stablehlo.add %6965, %6810 : tensor<1x1200x320xbf16>
-    %6967 = stablehlo.convert %6966 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %6968 = stablehlo.convert %6967 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %6969 = stablehlo.reduce(%6968 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6970 = stablehlo.reshape %6969 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6971 = stablehlo.broadcast_in_dim %6970, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6972 = stablehlo.divide %6971, %2987 : tensor<1x1200x1xf64>
-    %6973 = stablehlo.broadcast_in_dim %6968, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %6974 = stablehlo.broadcast_in_dim %6972, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %6975 = stablehlo.subtract %6973, %6974 : tensor<1x1200x320xf64>
-    %6976 = stablehlo.multiply %6975, %6975 : tensor<1x1200x320xf64>
-    %6977 = stablehlo.reduce(%6976 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %6978 = stablehlo.reshape %6977 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %6979 = stablehlo.broadcast_in_dim %6978, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %6980 = stablehlo.divide %6979, %2987 : tensor<1x1200x1xf64>
-    %6981 = stablehlo.convert %6980 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %6982 = stablehlo.reduce(%6967 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %6983 = stablehlo.reshape %6982 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %6984 = stablehlo.broadcast_in_dim %6983, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6985 = stablehlo.divide %6984, %3003 : tensor<1x1200x1xf32>
-    %6986 = stablehlo.broadcast_in_dim %6981, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %6987 = stablehlo.add %6986, %3006 : tensor<1x1200x1xf32>
-    %6988 = stablehlo.rsqrt %6987 : tensor<1x1200x1xf32>
-    %6989 = stablehlo.broadcast_in_dim %6967, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6990 = stablehlo.broadcast_in_dim %6985, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6991 = stablehlo.subtract %6989, %6990 : tensor<1x1200x320xf32>
-    %6992 = stablehlo.broadcast_in_dim %6991, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6993 = stablehlo.broadcast_in_dim %6988, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %6994 = stablehlo.multiply %6992, %6993 : tensor<1x1200x320xf32>
-    %6995 = stablehlo.convert %arg309 : (tensor<320xbf16>) -> tensor<320xf32>
-    %6996 = stablehlo.broadcast_in_dim %6994, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %6997 = stablehlo.broadcast_in_dim %6995, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %6998 = stablehlo.multiply %6996, %6997 : tensor<1x1200x320xf32>
-    %6999 = stablehlo.convert %arg310 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7000 = stablehlo.broadcast_in_dim %6998, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7001 = stablehlo.broadcast_in_dim %6999, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7002 = stablehlo.add %7000, %7001 : tensor<1x1200x320xf32>
-    %7003 = stablehlo.convert %7002 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7004 = stablehlo.reshape %7003 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7005 = stablehlo.convert %7004 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7006 = stablehlo.dot_general %7005, %arg786, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %7007 = stablehlo.broadcast_in_dim %7006, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7008 = stablehlo.multiply %7007, %3226 : tensor<1200x1280xf32>
-    %7009 = stablehlo.broadcast_in_dim %7008, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7010 = stablehlo.broadcast_in_dim %arg787, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %7011 = stablehlo.add %7009, %7010 : tensor<1200x1280xf32>
-    %7012 = stablehlo.convert %7011 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %7013 = stablehlo.reshape %7012 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %7014 = stablehlo.transpose %7013, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %7015 = stablehlo.reshape %7014 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7016 = stablehlo.convolution(%7015, %arg311) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7017 = stablehlo.reshape %arg312 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %7018 = stablehlo.broadcast_in_dim %7016, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7019 = stablehlo.broadcast_in_dim %7017, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7020 = stablehlo.add %7018, %7019 : tensor<1x1280x30x40xbf16>
-    %7021 = stablehlo.reshape %7020 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %7022 = stablehlo.transpose %7021, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %7023 = stablehlo.multiply %7022, %cst_42 : tensor<1x1200x1280xbf16>
-    %7024 = stablehlo.multiply %7022, %3243 : tensor<1x1200x1280xbf16>
-    %7025 = stablehlo.convert %7024 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %7026 = stablehlo.clamp %cst_43, %7025, %cst_44 : tensor<1x1200x1280xf32>
-    %7027 = stablehlo.multiply %7026, %7026 : tensor<1x1200x1280xf32>
-    %7028 = stablehlo.multiply %cst_45, %7027 : tensor<1x1200x1280xf32>
-    %7029 = stablehlo.add %7028, %cst_46 : tensor<1x1200x1280xf32>
-    %7030 = stablehlo.multiply %7029, %7027 : tensor<1x1200x1280xf32>
-    %7031 = stablehlo.add %7030, %cst_47 : tensor<1x1200x1280xf32>
-    %7032 = stablehlo.multiply %7031, %7027 : tensor<1x1200x1280xf32>
-    %7033 = stablehlo.add %7032, %cst_48 : tensor<1x1200x1280xf32>
-    %7034 = stablehlo.multiply %7033, %7027 : tensor<1x1200x1280xf32>
-    %7035 = stablehlo.add %7034, %cst_49 : tensor<1x1200x1280xf32>
-    %7036 = stablehlo.multiply %7035, %7027 : tensor<1x1200x1280xf32>
-    %7037 = stablehlo.add %7036, %cst_50 : tensor<1x1200x1280xf32>
-    %7038 = stablehlo.multiply %7037, %7027 : tensor<1x1200x1280xf32>
-    %7039 = stablehlo.add %7038, %cst_51 : tensor<1x1200x1280xf32>
-    %7040 = stablehlo.multiply %cst_52, %7027 : tensor<1x1200x1280xf32>
-    %7041 = stablehlo.add %7040, %cst_53 : tensor<1x1200x1280xf32>
-    %7042 = stablehlo.multiply %7041, %7027 : tensor<1x1200x1280xf32>
-    %7043 = stablehlo.add %7042, %cst_54 : tensor<1x1200x1280xf32>
-    %7044 = stablehlo.multiply %7043, %7027 : tensor<1x1200x1280xf32>
-    %7045 = stablehlo.add %7044, %cst_55 : tensor<1x1200x1280xf32>
-    %7046 = stablehlo.multiply %7045, %7027 : tensor<1x1200x1280xf32>
-    %7047 = stablehlo.add %7046, %cst_56 : tensor<1x1200x1280xf32>
-    %7048 = stablehlo.multiply %7026, %7039 : tensor<1x1200x1280xf32>
-    %7049 = stablehlo.divide %7048, %7047 : tensor<1x1200x1280xf32>
-    %7050 = stablehlo.clamp %cst_57, %7049, %cst_58 : tensor<1x1200x1280xf32>
-    %7051 = stablehlo.convert %7050 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %7052 = stablehlo.add %7051, %cst_40 : tensor<1x1200x1280xbf16>
-    %7053 = stablehlo.multiply %7052, %7023 : tensor<1x1200x1280xbf16>
-    %7054 = stablehlo.reshape %7053 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %7055 = stablehlo.dot_general %7054, %arg788, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %7056 = stablehlo.reshape %7055 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7057 = stablehlo.broadcast_in_dim %7056, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7058 = stablehlo.broadcast_in_dim %arg313, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %7059 = stablehlo.add %7057, %7058 : tensor<1x1200x320xbf16>
-    %7060 = stablehlo.reshape %7059 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7061 = stablehlo.reshape %7060 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7062 = stablehlo.add %7061, %6966 : tensor<1x1200x320xbf16>
-    %7063 = stablehlo.convert %7062 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7064 = stablehlo.convert %7063 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7065 = stablehlo.reduce(%7064 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7066 = stablehlo.reshape %7065 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7067 = stablehlo.broadcast_in_dim %7066, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7068 = stablehlo.divide %7067, %2987 : tensor<1x1200x1xf64>
-    %7069 = stablehlo.broadcast_in_dim %7064, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7070 = stablehlo.broadcast_in_dim %7068, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7071 = stablehlo.subtract %7069, %7070 : tensor<1x1200x320xf64>
-    %7072 = stablehlo.multiply %7071, %7071 : tensor<1x1200x320xf64>
-    %7073 = stablehlo.reduce(%7072 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7074 = stablehlo.reshape %7073 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7075 = stablehlo.broadcast_in_dim %7074, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7076 = stablehlo.divide %7075, %2987 : tensor<1x1200x1xf64>
-    %7077 = stablehlo.convert %7076 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7078 = stablehlo.reduce(%7063 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7079 = stablehlo.reshape %7078 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7080 = stablehlo.broadcast_in_dim %7079, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7081 = stablehlo.divide %7080, %3003 : tensor<1x1200x1xf32>
-    %7082 = stablehlo.broadcast_in_dim %7077, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7083 = stablehlo.add %7082, %3006 : tensor<1x1200x1xf32>
-    %7084 = stablehlo.rsqrt %7083 : tensor<1x1200x1xf32>
-    %7085 = stablehlo.broadcast_in_dim %7063, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7086 = stablehlo.broadcast_in_dim %7081, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7087 = stablehlo.subtract %7085, %7086 : tensor<1x1200x320xf32>
-    %7088 = stablehlo.broadcast_in_dim %7087, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7089 = stablehlo.broadcast_in_dim %7084, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7090 = stablehlo.multiply %7088, %7089 : tensor<1x1200x320xf32>
-    %7091 = stablehlo.convert %arg314 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7092 = stablehlo.broadcast_in_dim %7090, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7093 = stablehlo.broadcast_in_dim %7091, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7094 = stablehlo.multiply %7092, %7093 : tensor<1x1200x320xf32>
-    %7095 = stablehlo.convert %arg315 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7096 = stablehlo.broadcast_in_dim %7094, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7097 = stablehlo.broadcast_in_dim %7095, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7098 = stablehlo.add %7096, %7097 : tensor<1x1200x320xf32>
-    %7099 = stablehlo.convert %7098 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7100 = stablehlo.reshape %7099 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7101 = stablehlo.convert %7100 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7102 = stablehlo.dot_general %7101, %arg789, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7103 = stablehlo.broadcast_in_dim %7102, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7104 = stablehlo.multiply %7103, %3065 : tensor<1200x320xf32>
-    %7105 = stablehlo.broadcast_in_dim %7104, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7106 = stablehlo.broadcast_in_dim %arg790, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7107 = stablehlo.add %7105, %7106 : tensor<1200x320xf32>
-    %7108 = stablehlo.convert %7107 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7109 = stablehlo.reshape %7108 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7110 = stablehlo.reshape %7109 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7111 = stablehlo.transpose %7110, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7112 = stablehlo.transpose %7099, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %7113 = stablehlo.reshape %7112 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %7114 = stablehlo.convolution(%7113, %arg316) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %7115 = stablehlo.reshape %arg317 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %7116 = stablehlo.broadcast_in_dim %7114, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %7117 = stablehlo.broadcast_in_dim %7115, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %7118 = stablehlo.add %7116, %7117 : tensor<1x320x15x20xbf16>
-    %7119 = stablehlo.reshape %7118 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %7120 = stablehlo.transpose %7119, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %7121 = stablehlo.convert %7120 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %7122 = stablehlo.convert %7121 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %7123 = stablehlo.reduce(%7122 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7124 = stablehlo.reshape %7123 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7125 = stablehlo.broadcast_in_dim %7124, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7126 = stablehlo.divide %7125, %3088 : tensor<1x300x1xf64>
-    %7127 = stablehlo.broadcast_in_dim %7122, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %7128 = stablehlo.broadcast_in_dim %7126, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %7129 = stablehlo.subtract %7127, %7128 : tensor<1x300x320xf64>
-    %7130 = stablehlo.multiply %7129, %7129 : tensor<1x300x320xf64>
-    %7131 = stablehlo.reduce(%7130 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7132 = stablehlo.reshape %7131 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7133 = stablehlo.broadcast_in_dim %7132, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7134 = stablehlo.divide %7133, %3088 : tensor<1x300x1xf64>
-    %7135 = stablehlo.convert %7134 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %7136 = stablehlo.reduce(%7121 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %7137 = stablehlo.reshape %7136 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %7138 = stablehlo.broadcast_in_dim %7137, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7139 = stablehlo.divide %7138, %3102 : tensor<1x300x1xf32>
-    %7140 = stablehlo.broadcast_in_dim %7135, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7141 = stablehlo.add %7140, %136 : tensor<1x300x1xf32>
-    %7142 = stablehlo.rsqrt %7141 : tensor<1x300x1xf32>
-    %7143 = stablehlo.broadcast_in_dim %7121, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7144 = stablehlo.broadcast_in_dim %7139, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7145 = stablehlo.subtract %7143, %7144 : tensor<1x300x320xf32>
-    %7146 = stablehlo.broadcast_in_dim %7145, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7147 = stablehlo.broadcast_in_dim %7142, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7148 = stablehlo.multiply %7146, %7147 : tensor<1x300x320xf32>
-    %7149 = stablehlo.convert %arg318 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7150 = stablehlo.broadcast_in_dim %7148, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7151 = stablehlo.broadcast_in_dim %7149, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7152 = stablehlo.multiply %7150, %7151 : tensor<1x300x320xf32>
-    %7153 = stablehlo.convert %arg319 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7154 = stablehlo.broadcast_in_dim %7152, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7155 = stablehlo.broadcast_in_dim %7153, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7156 = stablehlo.add %7154, %7155 : tensor<1x300x320xf32>
-    %7157 = stablehlo.convert %7156 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %7158 = stablehlo.reshape %7157 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %7159 = stablehlo.convert %7158 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %7160 = stablehlo.dot_general %7159, %arg791, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7161 = stablehlo.broadcast_in_dim %7160, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7162 = stablehlo.multiply %7161, %3126 : tensor<300x320xf32>
-    %7163 = stablehlo.broadcast_in_dim %7162, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7164 = stablehlo.broadcast_in_dim %arg792, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7165 = stablehlo.add %7163, %7164 : tensor<300x320xf32>
-    %7166 = stablehlo.convert %7165 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7167 = stablehlo.reshape %7166 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7168 = stablehlo.reshape %7167 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7169 = stablehlo.transpose %7168, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7170 = stablehlo.dot_general %7159, %arg793, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7171 = stablehlo.broadcast_in_dim %7170, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7172 = stablehlo.multiply %7171, %3126 : tensor<300x320xf32>
-    %7173 = stablehlo.broadcast_in_dim %7172, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7174 = stablehlo.broadcast_in_dim %arg794, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7175 = stablehlo.add %7173, %7174 : tensor<300x320xf32>
-    %7176 = stablehlo.convert %7175 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7177 = stablehlo.reshape %7176 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7178 = stablehlo.reshape %7177 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7179 = stablehlo.transpose %7178, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7180 = stablehlo.transpose %7169, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %7181 = stablehlo.reshape %7111 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7182 = stablehlo.reshape %7180 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7183 = stablehlo.broadcast_in_dim %7182, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7184 = stablehlo.dot_general %7181, %7183, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7185 = stablehlo.reshape %7184 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7186 = stablehlo.broadcast_in_dim %7185, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7187 = stablehlo.divide %7186, %3152 : tensor<1x5x1200x300xbf16>
-    %7188 = stablehlo.convert %7187 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %7189 = stablehlo.reduce(%7188 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7190 = stablehlo.reshape %7189 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7191 = stablehlo.broadcast_in_dim %7188, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7192 = stablehlo.broadcast_in_dim %7190, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7193 = stablehlo.subtract %7191, %7192 : tensor<1x5x1200x300xf32>
-    %7194 = stablehlo.exponential %7193 : tensor<1x5x1200x300xf32>
-    %7195 = stablehlo.reduce(%7194 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7196 = stablehlo.reshape %7195 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7197 = stablehlo.broadcast_in_dim %7194, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7198 = stablehlo.broadcast_in_dim %7196, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7199 = stablehlo.divide %7197, %7198 : tensor<1x5x1200x300xf32>
-    %7200 = stablehlo.convert %7199 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %7201 = stablehlo.reshape %7200 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7202 = stablehlo.reshape %7179 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7203 = stablehlo.broadcast_in_dim %7202, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7204 = stablehlo.dot_general %7201, %7203, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7205 = stablehlo.reshape %7204 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7206 = stablehlo.transpose %7205, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7207 = stablehlo.reshape %7206 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %7208 = stablehlo.reshape %7207 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7209 = stablehlo.convert %7208 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7210 = stablehlo.dot_general %7209, %arg795, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7211 = stablehlo.broadcast_in_dim %7210, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7212 = stablehlo.multiply %7211, %3065 : tensor<1200x320xf32>
-    %7213 = stablehlo.broadcast_in_dim %7212, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7214 = stablehlo.broadcast_in_dim %arg796, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7215 = stablehlo.add %7213, %7214 : tensor<1200x320xf32>
-    %7216 = stablehlo.convert %7215 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7217 = stablehlo.reshape %7216 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7218 = stablehlo.add %7217, %7062 : tensor<1x1200x320xbf16>
-    %7219 = stablehlo.convert %7218 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7220 = stablehlo.convert %7219 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7221 = stablehlo.reduce(%7220 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7222 = stablehlo.reshape %7221 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7223 = stablehlo.broadcast_in_dim %7222, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7224 = stablehlo.divide %7223, %2987 : tensor<1x1200x1xf64>
-    %7225 = stablehlo.broadcast_in_dim %7220, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7226 = stablehlo.broadcast_in_dim %7224, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7227 = stablehlo.subtract %7225, %7226 : tensor<1x1200x320xf64>
-    %7228 = stablehlo.multiply %7227, %7227 : tensor<1x1200x320xf64>
-    %7229 = stablehlo.reduce(%7228 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7230 = stablehlo.reshape %7229 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7231 = stablehlo.broadcast_in_dim %7230, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7232 = stablehlo.divide %7231, %2987 : tensor<1x1200x1xf64>
-    %7233 = stablehlo.convert %7232 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7234 = stablehlo.reduce(%7219 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7235 = stablehlo.reshape %7234 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7236 = stablehlo.broadcast_in_dim %7235, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7237 = stablehlo.divide %7236, %3003 : tensor<1x1200x1xf32>
-    %7238 = stablehlo.broadcast_in_dim %7233, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7239 = stablehlo.add %7238, %3006 : tensor<1x1200x1xf32>
-    %7240 = stablehlo.rsqrt %7239 : tensor<1x1200x1xf32>
-    %7241 = stablehlo.broadcast_in_dim %7219, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7242 = stablehlo.broadcast_in_dim %7237, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7243 = stablehlo.subtract %7241, %7242 : tensor<1x1200x320xf32>
-    %7244 = stablehlo.broadcast_in_dim %7243, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7245 = stablehlo.broadcast_in_dim %7240, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7246 = stablehlo.multiply %7244, %7245 : tensor<1x1200x320xf32>
-    %7247 = stablehlo.convert %arg320 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7248 = stablehlo.broadcast_in_dim %7246, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7249 = stablehlo.broadcast_in_dim %7247, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7250 = stablehlo.multiply %7248, %7249 : tensor<1x1200x320xf32>
-    %7251 = stablehlo.convert %arg321 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7252 = stablehlo.broadcast_in_dim %7250, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7253 = stablehlo.broadcast_in_dim %7251, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7254 = stablehlo.add %7252, %7253 : tensor<1x1200x320xf32>
-    %7255 = stablehlo.convert %7254 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7256 = stablehlo.reshape %7255 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7257 = stablehlo.convert %7256 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7258 = stablehlo.dot_general %7257, %arg797, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %7259 = stablehlo.broadcast_in_dim %7258, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7260 = stablehlo.multiply %7259, %3226 : tensor<1200x1280xf32>
-    %7261 = stablehlo.broadcast_in_dim %7260, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7262 = stablehlo.broadcast_in_dim %arg798, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %7263 = stablehlo.add %7261, %7262 : tensor<1200x1280xf32>
-    %7264 = stablehlo.convert %7263 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %7265 = stablehlo.reshape %7264 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %7266 = stablehlo.transpose %7265, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %7267 = stablehlo.reshape %7266 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7268 = stablehlo.convolution(%7267, %arg322) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7269 = stablehlo.reshape %arg323 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %7270 = stablehlo.broadcast_in_dim %7268, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7271 = stablehlo.broadcast_in_dim %7269, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7272 = stablehlo.add %7270, %7271 : tensor<1x1280x30x40xbf16>
-    %7273 = stablehlo.reshape %7272 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %7274 = stablehlo.transpose %7273, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %7275 = stablehlo.multiply %7274, %cst_42 : tensor<1x1200x1280xbf16>
-    %7276 = stablehlo.multiply %7274, %3243 : tensor<1x1200x1280xbf16>
-    %7277 = stablehlo.convert %7276 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %7278 = stablehlo.clamp %cst_43, %7277, %cst_44 : tensor<1x1200x1280xf32>
-    %7279 = stablehlo.multiply %7278, %7278 : tensor<1x1200x1280xf32>
-    %7280 = stablehlo.multiply %cst_45, %7279 : tensor<1x1200x1280xf32>
-    %7281 = stablehlo.add %7280, %cst_46 : tensor<1x1200x1280xf32>
-    %7282 = stablehlo.multiply %7281, %7279 : tensor<1x1200x1280xf32>
-    %7283 = stablehlo.add %7282, %cst_47 : tensor<1x1200x1280xf32>
-    %7284 = stablehlo.multiply %7283, %7279 : tensor<1x1200x1280xf32>
-    %7285 = stablehlo.add %7284, %cst_48 : tensor<1x1200x1280xf32>
-    %7286 = stablehlo.multiply %7285, %7279 : tensor<1x1200x1280xf32>
-    %7287 = stablehlo.add %7286, %cst_49 : tensor<1x1200x1280xf32>
-    %7288 = stablehlo.multiply %7287, %7279 : tensor<1x1200x1280xf32>
-    %7289 = stablehlo.add %7288, %cst_50 : tensor<1x1200x1280xf32>
-    %7290 = stablehlo.multiply %7289, %7279 : tensor<1x1200x1280xf32>
-    %7291 = stablehlo.add %7290, %cst_51 : tensor<1x1200x1280xf32>
-    %7292 = stablehlo.multiply %cst_52, %7279 : tensor<1x1200x1280xf32>
-    %7293 = stablehlo.add %7292, %cst_53 : tensor<1x1200x1280xf32>
-    %7294 = stablehlo.multiply %7293, %7279 : tensor<1x1200x1280xf32>
-    %7295 = stablehlo.add %7294, %cst_54 : tensor<1x1200x1280xf32>
-    %7296 = stablehlo.multiply %7295, %7279 : tensor<1x1200x1280xf32>
-    %7297 = stablehlo.add %7296, %cst_55 : tensor<1x1200x1280xf32>
-    %7298 = stablehlo.multiply %7297, %7279 : tensor<1x1200x1280xf32>
-    %7299 = stablehlo.add %7298, %cst_56 : tensor<1x1200x1280xf32>
-    %7300 = stablehlo.multiply %7278, %7291 : tensor<1x1200x1280xf32>
-    %7301 = stablehlo.divide %7300, %7299 : tensor<1x1200x1280xf32>
-    %7302 = stablehlo.clamp %cst_57, %7301, %cst_58 : tensor<1x1200x1280xf32>
-    %7303 = stablehlo.convert %7302 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %7304 = stablehlo.add %7303, %cst_40 : tensor<1x1200x1280xbf16>
-    %7305 = stablehlo.multiply %7304, %7275 : tensor<1x1200x1280xbf16>
-    %7306 = stablehlo.reshape %7305 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %7307 = stablehlo.dot_general %7306, %arg799, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %7308 = stablehlo.reshape %7307 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7309 = stablehlo.broadcast_in_dim %7308, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7310 = stablehlo.broadcast_in_dim %arg324, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %7311 = stablehlo.add %7309, %7310 : tensor<1x1200x320xbf16>
-    %7312 = stablehlo.reshape %7311 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7313 = stablehlo.reshape %7312 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7314 = stablehlo.add %7313, %7218 : tensor<1x1200x320xbf16>
-    %7315 = stablehlo.convert %7314 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7316 = stablehlo.convert %7315 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7317 = stablehlo.reduce(%7316 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7318 = stablehlo.reshape %7317 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7319 = stablehlo.broadcast_in_dim %7318, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7320 = stablehlo.divide %7319, %2987 : tensor<1x1200x1xf64>
-    %7321 = stablehlo.broadcast_in_dim %7316, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7322 = stablehlo.broadcast_in_dim %7320, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7323 = stablehlo.subtract %7321, %7322 : tensor<1x1200x320xf64>
-    %7324 = stablehlo.multiply %7323, %7323 : tensor<1x1200x320xf64>
-    %7325 = stablehlo.reduce(%7324 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7326 = stablehlo.reshape %7325 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7327 = stablehlo.broadcast_in_dim %7326, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7328 = stablehlo.divide %7327, %2987 : tensor<1x1200x1xf64>
-    %7329 = stablehlo.convert %7328 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7330 = stablehlo.reduce(%7315 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7331 = stablehlo.reshape %7330 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7332 = stablehlo.broadcast_in_dim %7331, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7333 = stablehlo.divide %7332, %3003 : tensor<1x1200x1xf32>
-    %7334 = stablehlo.broadcast_in_dim %7329, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7335 = stablehlo.add %7334, %3006 : tensor<1x1200x1xf32>
-    %7336 = stablehlo.rsqrt %7335 : tensor<1x1200x1xf32>
-    %7337 = stablehlo.broadcast_in_dim %7315, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7338 = stablehlo.broadcast_in_dim %7333, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7339 = stablehlo.subtract %7337, %7338 : tensor<1x1200x320xf32>
-    %7340 = stablehlo.broadcast_in_dim %7339, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7341 = stablehlo.broadcast_in_dim %7336, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7342 = stablehlo.multiply %7340, %7341 : tensor<1x1200x320xf32>
-    %7343 = stablehlo.convert %arg325 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7344 = stablehlo.broadcast_in_dim %7342, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7345 = stablehlo.broadcast_in_dim %7343, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7346 = stablehlo.multiply %7344, %7345 : tensor<1x1200x320xf32>
-    %7347 = stablehlo.convert %arg326 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7348 = stablehlo.broadcast_in_dim %7346, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7349 = stablehlo.broadcast_in_dim %7347, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7350 = stablehlo.add %7348, %7349 : tensor<1x1200x320xf32>
-    %7351 = stablehlo.convert %7350 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7352 = stablehlo.reshape %7351 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7353 = stablehlo.convert %7352 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7354 = stablehlo.dot_general %7353, %arg800, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7355 = stablehlo.broadcast_in_dim %7354, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7356 = stablehlo.multiply %7355, %3065 : tensor<1200x320xf32>
-    %7357 = stablehlo.broadcast_in_dim %7356, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7358 = stablehlo.broadcast_in_dim %arg801, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7359 = stablehlo.add %7357, %7358 : tensor<1200x320xf32>
-    %7360 = stablehlo.convert %7359 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7361 = stablehlo.reshape %7360 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7362 = stablehlo.reshape %7361 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7363 = stablehlo.transpose %7362, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7364 = stablehlo.transpose %7351, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %7365 = stablehlo.reshape %7364 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %7366 = stablehlo.convolution(%7365, %arg327) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %7367 = stablehlo.reshape %arg328 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %7368 = stablehlo.broadcast_in_dim %7366, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %7369 = stablehlo.broadcast_in_dim %7367, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %7370 = stablehlo.add %7368, %7369 : tensor<1x320x15x20xbf16>
-    %7371 = stablehlo.reshape %7370 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %7372 = stablehlo.transpose %7371, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %7373 = stablehlo.convert %7372 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %7374 = stablehlo.convert %7373 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %7375 = stablehlo.reduce(%7374 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7376 = stablehlo.reshape %7375 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7377 = stablehlo.broadcast_in_dim %7376, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7378 = stablehlo.divide %7377, %3088 : tensor<1x300x1xf64>
-    %7379 = stablehlo.broadcast_in_dim %7374, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %7380 = stablehlo.broadcast_in_dim %7378, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %7381 = stablehlo.subtract %7379, %7380 : tensor<1x300x320xf64>
-    %7382 = stablehlo.multiply %7381, %7381 : tensor<1x300x320xf64>
-    %7383 = stablehlo.reduce(%7382 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7384 = stablehlo.reshape %7383 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7385 = stablehlo.broadcast_in_dim %7384, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7386 = stablehlo.divide %7385, %3088 : tensor<1x300x1xf64>
-    %7387 = stablehlo.convert %7386 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %7388 = stablehlo.reduce(%7373 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %7389 = stablehlo.reshape %7388 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %7390 = stablehlo.broadcast_in_dim %7389, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7391 = stablehlo.divide %7390, %3102 : tensor<1x300x1xf32>
-    %7392 = stablehlo.broadcast_in_dim %7387, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7393 = stablehlo.add %7392, %136 : tensor<1x300x1xf32>
-    %7394 = stablehlo.rsqrt %7393 : tensor<1x300x1xf32>
-    %7395 = stablehlo.broadcast_in_dim %7373, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7396 = stablehlo.broadcast_in_dim %7391, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7397 = stablehlo.subtract %7395, %7396 : tensor<1x300x320xf32>
-    %7398 = stablehlo.broadcast_in_dim %7397, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7399 = stablehlo.broadcast_in_dim %7394, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7400 = stablehlo.multiply %7398, %7399 : tensor<1x300x320xf32>
-    %7401 = stablehlo.convert %arg329 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7402 = stablehlo.broadcast_in_dim %7400, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7403 = stablehlo.broadcast_in_dim %7401, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7404 = stablehlo.multiply %7402, %7403 : tensor<1x300x320xf32>
-    %7405 = stablehlo.convert %arg330 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7406 = stablehlo.broadcast_in_dim %7404, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7407 = stablehlo.broadcast_in_dim %7405, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7408 = stablehlo.add %7406, %7407 : tensor<1x300x320xf32>
-    %7409 = stablehlo.convert %7408 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %7410 = stablehlo.reshape %7409 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %7411 = stablehlo.convert %7410 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %7412 = stablehlo.dot_general %7411, %arg802, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7413 = stablehlo.broadcast_in_dim %7412, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7414 = stablehlo.multiply %7413, %3126 : tensor<300x320xf32>
-    %7415 = stablehlo.broadcast_in_dim %7414, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7416 = stablehlo.broadcast_in_dim %arg803, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7417 = stablehlo.add %7415, %7416 : tensor<300x320xf32>
-    %7418 = stablehlo.convert %7417 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7419 = stablehlo.reshape %7418 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7420 = stablehlo.reshape %7419 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7421 = stablehlo.transpose %7420, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7422 = stablehlo.dot_general %7411, %arg804, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7423 = stablehlo.broadcast_in_dim %7422, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7424 = stablehlo.multiply %7423, %3126 : tensor<300x320xf32>
-    %7425 = stablehlo.broadcast_in_dim %7424, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7426 = stablehlo.broadcast_in_dim %arg805, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7427 = stablehlo.add %7425, %7426 : tensor<300x320xf32>
-    %7428 = stablehlo.convert %7427 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7429 = stablehlo.reshape %7428 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7430 = stablehlo.reshape %7429 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7431 = stablehlo.transpose %7430, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7432 = stablehlo.transpose %7421, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %7433 = stablehlo.reshape %7363 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7434 = stablehlo.reshape %7432 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7435 = stablehlo.broadcast_in_dim %7434, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7436 = stablehlo.dot_general %7433, %7435, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7437 = stablehlo.reshape %7436 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7438 = stablehlo.broadcast_in_dim %7437, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7439 = stablehlo.divide %7438, %3152 : tensor<1x5x1200x300xbf16>
-    %7440 = stablehlo.convert %7439 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %7441 = stablehlo.reduce(%7440 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7442 = stablehlo.reshape %7441 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7443 = stablehlo.broadcast_in_dim %7440, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7444 = stablehlo.broadcast_in_dim %7442, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7445 = stablehlo.subtract %7443, %7444 : tensor<1x5x1200x300xf32>
-    %7446 = stablehlo.exponential %7445 : tensor<1x5x1200x300xf32>
-    %7447 = stablehlo.reduce(%7446 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7448 = stablehlo.reshape %7447 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7449 = stablehlo.broadcast_in_dim %7446, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7450 = stablehlo.broadcast_in_dim %7448, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7451 = stablehlo.divide %7449, %7450 : tensor<1x5x1200x300xf32>
-    %7452 = stablehlo.convert %7451 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %7453 = stablehlo.reshape %7452 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7454 = stablehlo.reshape %7431 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7455 = stablehlo.broadcast_in_dim %7454, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7456 = stablehlo.dot_general %7453, %7455, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7457 = stablehlo.reshape %7456 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7458 = stablehlo.transpose %7457, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7459 = stablehlo.reshape %7458 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %7460 = stablehlo.reshape %7459 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7461 = stablehlo.convert %7460 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7462 = stablehlo.dot_general %7461, %arg806, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7463 = stablehlo.broadcast_in_dim %7462, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7464 = stablehlo.multiply %7463, %3065 : tensor<1200x320xf32>
-    %7465 = stablehlo.broadcast_in_dim %7464, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7466 = stablehlo.broadcast_in_dim %arg807, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7467 = stablehlo.add %7465, %7466 : tensor<1200x320xf32>
-    %7468 = stablehlo.convert %7467 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7469 = stablehlo.reshape %7468 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7470 = stablehlo.add %7469, %7314 : tensor<1x1200x320xbf16>
-    %7471 = stablehlo.convert %7470 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7472 = stablehlo.convert %7471 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7473 = stablehlo.reduce(%7472 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7474 = stablehlo.reshape %7473 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7475 = stablehlo.broadcast_in_dim %7474, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7476 = stablehlo.divide %7475, %2987 : tensor<1x1200x1xf64>
-    %7477 = stablehlo.broadcast_in_dim %7472, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7478 = stablehlo.broadcast_in_dim %7476, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7479 = stablehlo.subtract %7477, %7478 : tensor<1x1200x320xf64>
-    %7480 = stablehlo.multiply %7479, %7479 : tensor<1x1200x320xf64>
-    %7481 = stablehlo.reduce(%7480 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7482 = stablehlo.reshape %7481 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7483 = stablehlo.broadcast_in_dim %7482, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7484 = stablehlo.divide %7483, %2987 : tensor<1x1200x1xf64>
-    %7485 = stablehlo.convert %7484 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7486 = stablehlo.reduce(%7471 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7487 = stablehlo.reshape %7486 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7488 = stablehlo.broadcast_in_dim %7487, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7489 = stablehlo.divide %7488, %3003 : tensor<1x1200x1xf32>
-    %7490 = stablehlo.broadcast_in_dim %7485, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7491 = stablehlo.add %7490, %3006 : tensor<1x1200x1xf32>
-    %7492 = stablehlo.rsqrt %7491 : tensor<1x1200x1xf32>
-    %7493 = stablehlo.broadcast_in_dim %7471, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7494 = stablehlo.broadcast_in_dim %7489, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7495 = stablehlo.subtract %7493, %7494 : tensor<1x1200x320xf32>
-    %7496 = stablehlo.broadcast_in_dim %7495, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7497 = stablehlo.broadcast_in_dim %7492, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7498 = stablehlo.multiply %7496, %7497 : tensor<1x1200x320xf32>
-    %7499 = stablehlo.convert %arg331 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7500 = stablehlo.broadcast_in_dim %7498, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7501 = stablehlo.broadcast_in_dim %7499, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7502 = stablehlo.multiply %7500, %7501 : tensor<1x1200x320xf32>
-    %7503 = stablehlo.convert %arg332 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7504 = stablehlo.broadcast_in_dim %7502, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7505 = stablehlo.broadcast_in_dim %7503, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7506 = stablehlo.add %7504, %7505 : tensor<1x1200x320xf32>
-    %7507 = stablehlo.convert %7506 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7508 = stablehlo.reshape %7507 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7509 = stablehlo.convert %7508 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7510 = stablehlo.dot_general %7509, %arg808, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %7511 = stablehlo.broadcast_in_dim %7510, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7512 = stablehlo.multiply %7511, %3226 : tensor<1200x1280xf32>
-    %7513 = stablehlo.broadcast_in_dim %7512, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7514 = stablehlo.broadcast_in_dim %arg809, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %7515 = stablehlo.add %7513, %7514 : tensor<1200x1280xf32>
-    %7516 = stablehlo.convert %7515 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %7517 = stablehlo.reshape %7516 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %7518 = stablehlo.transpose %7517, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %7519 = stablehlo.reshape %7518 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7520 = stablehlo.convolution(%7519, %arg333) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7521 = stablehlo.reshape %arg334 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %7522 = stablehlo.broadcast_in_dim %7520, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7523 = stablehlo.broadcast_in_dim %7521, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7524 = stablehlo.add %7522, %7523 : tensor<1x1280x30x40xbf16>
-    %7525 = stablehlo.reshape %7524 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %7526 = stablehlo.transpose %7525, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %7527 = stablehlo.multiply %7526, %cst_42 : tensor<1x1200x1280xbf16>
-    %7528 = stablehlo.multiply %7526, %3243 : tensor<1x1200x1280xbf16>
-    %7529 = stablehlo.convert %7528 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %7530 = stablehlo.clamp %cst_43, %7529, %cst_44 : tensor<1x1200x1280xf32>
-    %7531 = stablehlo.multiply %7530, %7530 : tensor<1x1200x1280xf32>
-    %7532 = stablehlo.multiply %cst_45, %7531 : tensor<1x1200x1280xf32>
-    %7533 = stablehlo.add %7532, %cst_46 : tensor<1x1200x1280xf32>
-    %7534 = stablehlo.multiply %7533, %7531 : tensor<1x1200x1280xf32>
-    %7535 = stablehlo.add %7534, %cst_47 : tensor<1x1200x1280xf32>
-    %7536 = stablehlo.multiply %7535, %7531 : tensor<1x1200x1280xf32>
-    %7537 = stablehlo.add %7536, %cst_48 : tensor<1x1200x1280xf32>
-    %7538 = stablehlo.multiply %7537, %7531 : tensor<1x1200x1280xf32>
-    %7539 = stablehlo.add %7538, %cst_49 : tensor<1x1200x1280xf32>
-    %7540 = stablehlo.multiply %7539, %7531 : tensor<1x1200x1280xf32>
-    %7541 = stablehlo.add %7540, %cst_50 : tensor<1x1200x1280xf32>
-    %7542 = stablehlo.multiply %7541, %7531 : tensor<1x1200x1280xf32>
-    %7543 = stablehlo.add %7542, %cst_51 : tensor<1x1200x1280xf32>
-    %7544 = stablehlo.multiply %cst_52, %7531 : tensor<1x1200x1280xf32>
-    %7545 = stablehlo.add %7544, %cst_53 : tensor<1x1200x1280xf32>
-    %7546 = stablehlo.multiply %7545, %7531 : tensor<1x1200x1280xf32>
-    %7547 = stablehlo.add %7546, %cst_54 : tensor<1x1200x1280xf32>
-    %7548 = stablehlo.multiply %7547, %7531 : tensor<1x1200x1280xf32>
-    %7549 = stablehlo.add %7548, %cst_55 : tensor<1x1200x1280xf32>
-    %7550 = stablehlo.multiply %7549, %7531 : tensor<1x1200x1280xf32>
-    %7551 = stablehlo.add %7550, %cst_56 : tensor<1x1200x1280xf32>
-    %7552 = stablehlo.multiply %7530, %7543 : tensor<1x1200x1280xf32>
-    %7553 = stablehlo.divide %7552, %7551 : tensor<1x1200x1280xf32>
-    %7554 = stablehlo.clamp %cst_57, %7553, %cst_58 : tensor<1x1200x1280xf32>
-    %7555 = stablehlo.convert %7554 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %7556 = stablehlo.add %7555, %cst_40 : tensor<1x1200x1280xbf16>
-    %7557 = stablehlo.multiply %7556, %7527 : tensor<1x1200x1280xbf16>
-    %7558 = stablehlo.reshape %7557 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %7559 = stablehlo.dot_general %7558, %arg810, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %7560 = stablehlo.reshape %7559 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7561 = stablehlo.broadcast_in_dim %7560, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7562 = stablehlo.broadcast_in_dim %arg335, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %7563 = stablehlo.add %7561, %7562 : tensor<1x1200x320xbf16>
-    %7564 = stablehlo.reshape %7563 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7565 = stablehlo.reshape %7564 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7566 = stablehlo.add %7565, %7470 : tensor<1x1200x320xbf16>
-    %7567 = stablehlo.convert %7566 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7568 = stablehlo.convert %7567 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7569 = stablehlo.reduce(%7568 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7570 = stablehlo.reshape %7569 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7571 = stablehlo.broadcast_in_dim %7570, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7572 = stablehlo.divide %7571, %2987 : tensor<1x1200x1xf64>
-    %7573 = stablehlo.broadcast_in_dim %7568, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7574 = stablehlo.broadcast_in_dim %7572, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7575 = stablehlo.subtract %7573, %7574 : tensor<1x1200x320xf64>
-    %7576 = stablehlo.multiply %7575, %7575 : tensor<1x1200x320xf64>
-    %7577 = stablehlo.reduce(%7576 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7578 = stablehlo.reshape %7577 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7579 = stablehlo.broadcast_in_dim %7578, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7580 = stablehlo.divide %7579, %2987 : tensor<1x1200x1xf64>
-    %7581 = stablehlo.convert %7580 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7582 = stablehlo.reduce(%7567 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7583 = stablehlo.reshape %7582 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7584 = stablehlo.broadcast_in_dim %7583, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7585 = stablehlo.divide %7584, %3003 : tensor<1x1200x1xf32>
-    %7586 = stablehlo.broadcast_in_dim %7581, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7587 = stablehlo.add %7586, %3006 : tensor<1x1200x1xf32>
-    %7588 = stablehlo.rsqrt %7587 : tensor<1x1200x1xf32>
-    %7589 = stablehlo.broadcast_in_dim %7567, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7590 = stablehlo.broadcast_in_dim %7585, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7591 = stablehlo.subtract %7589, %7590 : tensor<1x1200x320xf32>
-    %7592 = stablehlo.broadcast_in_dim %7591, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7593 = stablehlo.broadcast_in_dim %7588, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7594 = stablehlo.multiply %7592, %7593 : tensor<1x1200x320xf32>
-    %7595 = stablehlo.convert %arg336 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7596 = stablehlo.broadcast_in_dim %7594, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7597 = stablehlo.broadcast_in_dim %7595, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7598 = stablehlo.multiply %7596, %7597 : tensor<1x1200x320xf32>
-    %7599 = stablehlo.convert %arg337 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7600 = stablehlo.broadcast_in_dim %7598, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7601 = stablehlo.broadcast_in_dim %7599, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7602 = stablehlo.add %7600, %7601 : tensor<1x1200x320xf32>
-    %7603 = stablehlo.convert %7602 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7604 = stablehlo.reshape %7603 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7605 = stablehlo.convert %7604 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7606 = stablehlo.dot_general %7605, %arg811, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7607 = stablehlo.broadcast_in_dim %7606, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7608 = stablehlo.multiply %7607, %3065 : tensor<1200x320xf32>
-    %7609 = stablehlo.broadcast_in_dim %7608, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7610 = stablehlo.broadcast_in_dim %arg812, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7611 = stablehlo.add %7609, %7610 : tensor<1200x320xf32>
-    %7612 = stablehlo.convert %7611 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7613 = stablehlo.reshape %7612 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7614 = stablehlo.reshape %7613 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7615 = stablehlo.transpose %7614, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7616 = stablehlo.transpose %7603, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %7617 = stablehlo.reshape %7616 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %7618 = stablehlo.convolution(%7617, %arg338) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %7619 = stablehlo.reshape %arg339 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %7620 = stablehlo.broadcast_in_dim %7618, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %7621 = stablehlo.broadcast_in_dim %7619, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %7622 = stablehlo.add %7620, %7621 : tensor<1x320x15x20xbf16>
-    %7623 = stablehlo.reshape %7622 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %7624 = stablehlo.transpose %7623, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %7625 = stablehlo.convert %7624 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %7626 = stablehlo.convert %7625 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %7627 = stablehlo.reduce(%7626 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7628 = stablehlo.reshape %7627 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7629 = stablehlo.broadcast_in_dim %7628, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7630 = stablehlo.divide %7629, %3088 : tensor<1x300x1xf64>
-    %7631 = stablehlo.broadcast_in_dim %7626, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %7632 = stablehlo.broadcast_in_dim %7630, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %7633 = stablehlo.subtract %7631, %7632 : tensor<1x300x320xf64>
-    %7634 = stablehlo.multiply %7633, %7633 : tensor<1x300x320xf64>
-    %7635 = stablehlo.reduce(%7634 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7636 = stablehlo.reshape %7635 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7637 = stablehlo.broadcast_in_dim %7636, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7638 = stablehlo.divide %7637, %3088 : tensor<1x300x1xf64>
-    %7639 = stablehlo.convert %7638 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %7640 = stablehlo.reduce(%7625 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %7641 = stablehlo.reshape %7640 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %7642 = stablehlo.broadcast_in_dim %7641, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7643 = stablehlo.divide %7642, %3102 : tensor<1x300x1xf32>
-    %7644 = stablehlo.broadcast_in_dim %7639, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7645 = stablehlo.add %7644, %136 : tensor<1x300x1xf32>
-    %7646 = stablehlo.rsqrt %7645 : tensor<1x300x1xf32>
-    %7647 = stablehlo.broadcast_in_dim %7625, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7648 = stablehlo.broadcast_in_dim %7643, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7649 = stablehlo.subtract %7647, %7648 : tensor<1x300x320xf32>
-    %7650 = stablehlo.broadcast_in_dim %7649, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7651 = stablehlo.broadcast_in_dim %7646, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7652 = stablehlo.multiply %7650, %7651 : tensor<1x300x320xf32>
-    %7653 = stablehlo.convert %arg340 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7654 = stablehlo.broadcast_in_dim %7652, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7655 = stablehlo.broadcast_in_dim %7653, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7656 = stablehlo.multiply %7654, %7655 : tensor<1x300x320xf32>
-    %7657 = stablehlo.convert %arg341 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7658 = stablehlo.broadcast_in_dim %7656, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7659 = stablehlo.broadcast_in_dim %7657, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7660 = stablehlo.add %7658, %7659 : tensor<1x300x320xf32>
-    %7661 = stablehlo.convert %7660 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %7662 = stablehlo.reshape %7661 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %7663 = stablehlo.convert %7662 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %7664 = stablehlo.dot_general %7663, %arg813, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7665 = stablehlo.broadcast_in_dim %7664, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7666 = stablehlo.multiply %7665, %3126 : tensor<300x320xf32>
-    %7667 = stablehlo.broadcast_in_dim %7666, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7668 = stablehlo.broadcast_in_dim %arg814, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7669 = stablehlo.add %7667, %7668 : tensor<300x320xf32>
-    %7670 = stablehlo.convert %7669 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7671 = stablehlo.reshape %7670 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7672 = stablehlo.reshape %7671 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7673 = stablehlo.transpose %7672, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7674 = stablehlo.dot_general %7663, %arg815, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7675 = stablehlo.broadcast_in_dim %7674, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7676 = stablehlo.multiply %7675, %3126 : tensor<300x320xf32>
-    %7677 = stablehlo.broadcast_in_dim %7676, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7678 = stablehlo.broadcast_in_dim %arg816, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7679 = stablehlo.add %7677, %7678 : tensor<300x320xf32>
-    %7680 = stablehlo.convert %7679 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7681 = stablehlo.reshape %7680 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7682 = stablehlo.reshape %7681 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7683 = stablehlo.transpose %7682, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7684 = stablehlo.transpose %7673, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %7685 = stablehlo.reshape %7615 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7686 = stablehlo.reshape %7684 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7687 = stablehlo.broadcast_in_dim %7686, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7688 = stablehlo.dot_general %7685, %7687, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7689 = stablehlo.reshape %7688 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7690 = stablehlo.broadcast_in_dim %7689, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7691 = stablehlo.divide %7690, %3152 : tensor<1x5x1200x300xbf16>
-    %7692 = stablehlo.convert %7691 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %7693 = stablehlo.reduce(%7692 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7694 = stablehlo.reshape %7693 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7695 = stablehlo.broadcast_in_dim %7692, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7696 = stablehlo.broadcast_in_dim %7694, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7697 = stablehlo.subtract %7695, %7696 : tensor<1x5x1200x300xf32>
-    %7698 = stablehlo.exponential %7697 : tensor<1x5x1200x300xf32>
-    %7699 = stablehlo.reduce(%7698 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7700 = stablehlo.reshape %7699 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7701 = stablehlo.broadcast_in_dim %7698, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7702 = stablehlo.broadcast_in_dim %7700, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7703 = stablehlo.divide %7701, %7702 : tensor<1x5x1200x300xf32>
-    %7704 = stablehlo.convert %7703 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %7705 = stablehlo.reshape %7704 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7706 = stablehlo.reshape %7683 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7707 = stablehlo.broadcast_in_dim %7706, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7708 = stablehlo.dot_general %7705, %7707, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7709 = stablehlo.reshape %7708 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7710 = stablehlo.transpose %7709, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7711 = stablehlo.reshape %7710 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %7712 = stablehlo.reshape %7711 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7713 = stablehlo.convert %7712 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7714 = stablehlo.dot_general %7713, %arg817, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7715 = stablehlo.broadcast_in_dim %7714, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7716 = stablehlo.multiply %7715, %3065 : tensor<1200x320xf32>
-    %7717 = stablehlo.broadcast_in_dim %7716, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7718 = stablehlo.broadcast_in_dim %arg818, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7719 = stablehlo.add %7717, %7718 : tensor<1200x320xf32>
-    %7720 = stablehlo.convert %7719 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7721 = stablehlo.reshape %7720 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7722 = stablehlo.add %7721, %7566 : tensor<1x1200x320xbf16>
-    %7723 = stablehlo.convert %7722 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7724 = stablehlo.convert %7723 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7725 = stablehlo.reduce(%7724 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7726 = stablehlo.reshape %7725 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7727 = stablehlo.broadcast_in_dim %7726, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7728 = stablehlo.divide %7727, %2987 : tensor<1x1200x1xf64>
-    %7729 = stablehlo.broadcast_in_dim %7724, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7730 = stablehlo.broadcast_in_dim %7728, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7731 = stablehlo.subtract %7729, %7730 : tensor<1x1200x320xf64>
-    %7732 = stablehlo.multiply %7731, %7731 : tensor<1x1200x320xf64>
-    %7733 = stablehlo.reduce(%7732 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7734 = stablehlo.reshape %7733 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7735 = stablehlo.broadcast_in_dim %7734, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7736 = stablehlo.divide %7735, %2987 : tensor<1x1200x1xf64>
-    %7737 = stablehlo.convert %7736 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7738 = stablehlo.reduce(%7723 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7739 = stablehlo.reshape %7738 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7740 = stablehlo.broadcast_in_dim %7739, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7741 = stablehlo.divide %7740, %3003 : tensor<1x1200x1xf32>
-    %7742 = stablehlo.broadcast_in_dim %7737, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7743 = stablehlo.add %7742, %3006 : tensor<1x1200x1xf32>
-    %7744 = stablehlo.rsqrt %7743 : tensor<1x1200x1xf32>
-    %7745 = stablehlo.broadcast_in_dim %7723, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7746 = stablehlo.broadcast_in_dim %7741, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7747 = stablehlo.subtract %7745, %7746 : tensor<1x1200x320xf32>
-    %7748 = stablehlo.broadcast_in_dim %7747, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7749 = stablehlo.broadcast_in_dim %7744, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7750 = stablehlo.multiply %7748, %7749 : tensor<1x1200x320xf32>
-    %7751 = stablehlo.convert %arg342 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7752 = stablehlo.broadcast_in_dim %7750, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7753 = stablehlo.broadcast_in_dim %7751, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7754 = stablehlo.multiply %7752, %7753 : tensor<1x1200x320xf32>
-    %7755 = stablehlo.convert %arg343 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7756 = stablehlo.broadcast_in_dim %7754, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7757 = stablehlo.broadcast_in_dim %7755, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7758 = stablehlo.add %7756, %7757 : tensor<1x1200x320xf32>
-    %7759 = stablehlo.convert %7758 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7760 = stablehlo.reshape %7759 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7761 = stablehlo.convert %7760 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7762 = stablehlo.dot_general %7761, %arg819, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %7763 = stablehlo.broadcast_in_dim %7762, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7764 = stablehlo.multiply %7763, %3226 : tensor<1200x1280xf32>
-    %7765 = stablehlo.broadcast_in_dim %7764, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %7766 = stablehlo.broadcast_in_dim %arg820, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %7767 = stablehlo.add %7765, %7766 : tensor<1200x1280xf32>
-    %7768 = stablehlo.convert %7767 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %7769 = stablehlo.reshape %7768 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %7770 = stablehlo.transpose %7769, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %7771 = stablehlo.reshape %7770 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7772 = stablehlo.convolution(%7771, %arg344) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7773 = stablehlo.reshape %arg345 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %7774 = stablehlo.broadcast_in_dim %7772, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7775 = stablehlo.broadcast_in_dim %7773, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %7776 = stablehlo.add %7774, %7775 : tensor<1x1280x30x40xbf16>
-    %7777 = stablehlo.reshape %7776 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %7778 = stablehlo.transpose %7777, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %7779 = stablehlo.multiply %7778, %cst_42 : tensor<1x1200x1280xbf16>
-    %7780 = stablehlo.multiply %7778, %3243 : tensor<1x1200x1280xbf16>
-    %7781 = stablehlo.convert %7780 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %7782 = stablehlo.clamp %cst_43, %7781, %cst_44 : tensor<1x1200x1280xf32>
-    %7783 = stablehlo.multiply %7782, %7782 : tensor<1x1200x1280xf32>
-    %7784 = stablehlo.multiply %cst_45, %7783 : tensor<1x1200x1280xf32>
-    %7785 = stablehlo.add %7784, %cst_46 : tensor<1x1200x1280xf32>
-    %7786 = stablehlo.multiply %7785, %7783 : tensor<1x1200x1280xf32>
-    %7787 = stablehlo.add %7786, %cst_47 : tensor<1x1200x1280xf32>
-    %7788 = stablehlo.multiply %7787, %7783 : tensor<1x1200x1280xf32>
-    %7789 = stablehlo.add %7788, %cst_48 : tensor<1x1200x1280xf32>
-    %7790 = stablehlo.multiply %7789, %7783 : tensor<1x1200x1280xf32>
-    %7791 = stablehlo.add %7790, %cst_49 : tensor<1x1200x1280xf32>
-    %7792 = stablehlo.multiply %7791, %7783 : tensor<1x1200x1280xf32>
-    %7793 = stablehlo.add %7792, %cst_50 : tensor<1x1200x1280xf32>
-    %7794 = stablehlo.multiply %7793, %7783 : tensor<1x1200x1280xf32>
-    %7795 = stablehlo.add %7794, %cst_51 : tensor<1x1200x1280xf32>
-    %7796 = stablehlo.multiply %cst_52, %7783 : tensor<1x1200x1280xf32>
-    %7797 = stablehlo.add %7796, %cst_53 : tensor<1x1200x1280xf32>
-    %7798 = stablehlo.multiply %7797, %7783 : tensor<1x1200x1280xf32>
-    %7799 = stablehlo.add %7798, %cst_54 : tensor<1x1200x1280xf32>
-    %7800 = stablehlo.multiply %7799, %7783 : tensor<1x1200x1280xf32>
-    %7801 = stablehlo.add %7800, %cst_55 : tensor<1x1200x1280xf32>
-    %7802 = stablehlo.multiply %7801, %7783 : tensor<1x1200x1280xf32>
-    %7803 = stablehlo.add %7802, %cst_56 : tensor<1x1200x1280xf32>
-    %7804 = stablehlo.multiply %7782, %7795 : tensor<1x1200x1280xf32>
-    %7805 = stablehlo.divide %7804, %7803 : tensor<1x1200x1280xf32>
-    %7806 = stablehlo.clamp %cst_57, %7805, %cst_58 : tensor<1x1200x1280xf32>
-    %7807 = stablehlo.convert %7806 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %7808 = stablehlo.add %7807, %cst_40 : tensor<1x1200x1280xbf16>
-    %7809 = stablehlo.multiply %7808, %7779 : tensor<1x1200x1280xbf16>
-    %7810 = stablehlo.reshape %7809 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %7811 = stablehlo.dot_general %7810, %arg821, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %7812 = stablehlo.reshape %7811 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7813 = stablehlo.broadcast_in_dim %7812, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7814 = stablehlo.broadcast_in_dim %arg346, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %7815 = stablehlo.add %7813, %7814 : tensor<1x1200x320xbf16>
-    %7816 = stablehlo.reshape %7815 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7817 = stablehlo.reshape %7816 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7818 = stablehlo.add %7817, %7722 : tensor<1x1200x320xbf16>
-    %7819 = stablehlo.convert %7818 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7820 = stablehlo.convert %7819 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7821 = stablehlo.reduce(%7820 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7822 = stablehlo.reshape %7821 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7823 = stablehlo.broadcast_in_dim %7822, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7824 = stablehlo.divide %7823, %2987 : tensor<1x1200x1xf64>
-    %7825 = stablehlo.broadcast_in_dim %7820, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7826 = stablehlo.broadcast_in_dim %7824, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7827 = stablehlo.subtract %7825, %7826 : tensor<1x1200x320xf64>
-    %7828 = stablehlo.multiply %7827, %7827 : tensor<1x1200x320xf64>
-    %7829 = stablehlo.reduce(%7828 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7830 = stablehlo.reshape %7829 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7831 = stablehlo.broadcast_in_dim %7830, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7832 = stablehlo.divide %7831, %2987 : tensor<1x1200x1xf64>
-    %7833 = stablehlo.convert %7832 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7834 = stablehlo.reduce(%7819 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7835 = stablehlo.reshape %7834 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7836 = stablehlo.broadcast_in_dim %7835, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7837 = stablehlo.divide %7836, %3003 : tensor<1x1200x1xf32>
-    %7838 = stablehlo.broadcast_in_dim %7833, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7839 = stablehlo.add %7838, %3006 : tensor<1x1200x1xf32>
-    %7840 = stablehlo.rsqrt %7839 : tensor<1x1200x1xf32>
-    %7841 = stablehlo.broadcast_in_dim %7819, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7842 = stablehlo.broadcast_in_dim %7837, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7843 = stablehlo.subtract %7841, %7842 : tensor<1x1200x320xf32>
-    %7844 = stablehlo.broadcast_in_dim %7843, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7845 = stablehlo.broadcast_in_dim %7840, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7846 = stablehlo.multiply %7844, %7845 : tensor<1x1200x320xf32>
-    %7847 = stablehlo.convert %arg347 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7848 = stablehlo.broadcast_in_dim %7846, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7849 = stablehlo.broadcast_in_dim %7847, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7850 = stablehlo.multiply %7848, %7849 : tensor<1x1200x320xf32>
-    %7851 = stablehlo.convert %arg348 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7852 = stablehlo.broadcast_in_dim %7850, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7853 = stablehlo.broadcast_in_dim %7851, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %7854 = stablehlo.add %7852, %7853 : tensor<1x1200x320xf32>
-    %7855 = stablehlo.convert %7854 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %7856 = stablehlo.reshape %7855 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7857 = stablehlo.convert %7856 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7858 = stablehlo.dot_general %7857, %arg822, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7859 = stablehlo.broadcast_in_dim %7858, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7860 = stablehlo.multiply %7859, %3065 : tensor<1200x320xf32>
-    %7861 = stablehlo.broadcast_in_dim %7860, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7862 = stablehlo.broadcast_in_dim %arg823, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7863 = stablehlo.add %7861, %7862 : tensor<1200x320xf32>
-    %7864 = stablehlo.convert %7863 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7865 = stablehlo.reshape %7864 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7866 = stablehlo.reshape %7865 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7867 = stablehlo.transpose %7866, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7868 = stablehlo.transpose %7855, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %7869 = stablehlo.reshape %7868 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %7870 = stablehlo.convolution(%7869, %arg349) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %7871 = stablehlo.reshape %arg350 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %7872 = stablehlo.broadcast_in_dim %7870, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %7873 = stablehlo.broadcast_in_dim %7871, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %7874 = stablehlo.add %7872, %7873 : tensor<1x320x15x20xbf16>
-    %7875 = stablehlo.reshape %7874 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %7876 = stablehlo.transpose %7875, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %7877 = stablehlo.convert %7876 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %7878 = stablehlo.convert %7877 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %7879 = stablehlo.reduce(%7878 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7880 = stablehlo.reshape %7879 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7881 = stablehlo.broadcast_in_dim %7880, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7882 = stablehlo.divide %7881, %3088 : tensor<1x300x1xf64>
-    %7883 = stablehlo.broadcast_in_dim %7878, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %7884 = stablehlo.broadcast_in_dim %7882, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %7885 = stablehlo.subtract %7883, %7884 : tensor<1x300x320xf64>
-    %7886 = stablehlo.multiply %7885, %7885 : tensor<1x300x320xf64>
-    %7887 = stablehlo.reduce(%7886 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %7888 = stablehlo.reshape %7887 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %7889 = stablehlo.broadcast_in_dim %7888, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %7890 = stablehlo.divide %7889, %3088 : tensor<1x300x1xf64>
-    %7891 = stablehlo.convert %7890 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %7892 = stablehlo.reduce(%7877 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %7893 = stablehlo.reshape %7892 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %7894 = stablehlo.broadcast_in_dim %7893, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7895 = stablehlo.divide %7894, %3102 : tensor<1x300x1xf32>
-    %7896 = stablehlo.broadcast_in_dim %7891, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %7897 = stablehlo.add %7896, %136 : tensor<1x300x1xf32>
-    %7898 = stablehlo.rsqrt %7897 : tensor<1x300x1xf32>
-    %7899 = stablehlo.broadcast_in_dim %7877, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7900 = stablehlo.broadcast_in_dim %7895, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7901 = stablehlo.subtract %7899, %7900 : tensor<1x300x320xf32>
-    %7902 = stablehlo.broadcast_in_dim %7901, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7903 = stablehlo.broadcast_in_dim %7898, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %7904 = stablehlo.multiply %7902, %7903 : tensor<1x300x320xf32>
-    %7905 = stablehlo.convert %arg351 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7906 = stablehlo.broadcast_in_dim %7904, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7907 = stablehlo.broadcast_in_dim %7905, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7908 = stablehlo.multiply %7906, %7907 : tensor<1x300x320xf32>
-    %7909 = stablehlo.convert %arg352 : (tensor<320xbf16>) -> tensor<320xf32>
-    %7910 = stablehlo.broadcast_in_dim %7908, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %7911 = stablehlo.broadcast_in_dim %7909, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %7912 = stablehlo.add %7910, %7911 : tensor<1x300x320xf32>
-    %7913 = stablehlo.convert %7912 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %7914 = stablehlo.reshape %7913 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %7915 = stablehlo.convert %7914 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %7916 = stablehlo.dot_general %7915, %arg824, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7917 = stablehlo.broadcast_in_dim %7916, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7918 = stablehlo.multiply %7917, %3126 : tensor<300x320xf32>
-    %7919 = stablehlo.broadcast_in_dim %7918, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7920 = stablehlo.broadcast_in_dim %arg825, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7921 = stablehlo.add %7919, %7920 : tensor<300x320xf32>
-    %7922 = stablehlo.convert %7921 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7923 = stablehlo.reshape %7922 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7924 = stablehlo.reshape %7923 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7925 = stablehlo.transpose %7924, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7926 = stablehlo.dot_general %7915, %arg826, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %7927 = stablehlo.broadcast_in_dim %7926, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7928 = stablehlo.multiply %7927, %3126 : tensor<300x320xf32>
-    %7929 = stablehlo.broadcast_in_dim %7928, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %7930 = stablehlo.broadcast_in_dim %arg827, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %7931 = stablehlo.add %7929, %7930 : tensor<300x320xf32>
-    %7932 = stablehlo.convert %7931 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %7933 = stablehlo.reshape %7932 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %7934 = stablehlo.reshape %7933 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %7935 = stablehlo.transpose %7934, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %7936 = stablehlo.transpose %7925, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %7937 = stablehlo.reshape %7867 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7938 = stablehlo.reshape %7936 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7939 = stablehlo.broadcast_in_dim %7938, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %7940 = stablehlo.dot_general %7937, %7939, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7941 = stablehlo.reshape %7940 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7942 = stablehlo.broadcast_in_dim %7941, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %7943 = stablehlo.divide %7942, %3152 : tensor<1x5x1200x300xbf16>
-    %7944 = stablehlo.convert %7943 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %7945 = stablehlo.reduce(%7944 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7946 = stablehlo.reshape %7945 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7947 = stablehlo.broadcast_in_dim %7944, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7948 = stablehlo.broadcast_in_dim %7946, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7949 = stablehlo.subtract %7947, %7948 : tensor<1x5x1200x300xf32>
-    %7950 = stablehlo.exponential %7949 : tensor<1x5x1200x300xf32>
-    %7951 = stablehlo.reduce(%7950 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %7952 = stablehlo.reshape %7951 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %7953 = stablehlo.broadcast_in_dim %7950, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %7954 = stablehlo.broadcast_in_dim %7952, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %7955 = stablehlo.divide %7953, %7954 : tensor<1x5x1200x300xf32>
-    %7956 = stablehlo.convert %7955 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %7957 = stablehlo.reshape %7956 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %7958 = stablehlo.reshape %7935 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7959 = stablehlo.broadcast_in_dim %7958, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %7960 = stablehlo.dot_general %7957, %7959, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %7961 = stablehlo.reshape %7960 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %7962 = stablehlo.transpose %7961, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %7963 = stablehlo.reshape %7962 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %7964 = stablehlo.reshape %7963 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %7965 = stablehlo.convert %7964 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %7966 = stablehlo.dot_general %7965, %arg828, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %7967 = stablehlo.broadcast_in_dim %7966, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7968 = stablehlo.multiply %7967, %3065 : tensor<1200x320xf32>
-    %7969 = stablehlo.broadcast_in_dim %7968, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %7970 = stablehlo.broadcast_in_dim %arg829, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %7971 = stablehlo.add %7969, %7970 : tensor<1200x320xf32>
-    %7972 = stablehlo.convert %7971 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %7973 = stablehlo.reshape %7972 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %7974 = stablehlo.add %7973, %7818 : tensor<1x1200x320xbf16>
-    %7975 = stablehlo.convert %7974 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %7976 = stablehlo.convert %7975 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %7977 = stablehlo.reduce(%7976 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7978 = stablehlo.reshape %7977 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7979 = stablehlo.broadcast_in_dim %7978, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7980 = stablehlo.divide %7979, %2987 : tensor<1x1200x1xf64>
-    %7981 = stablehlo.broadcast_in_dim %7976, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %7982 = stablehlo.broadcast_in_dim %7980, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %7983 = stablehlo.subtract %7981, %7982 : tensor<1x1200x320xf64>
-    %7984 = stablehlo.multiply %7983, %7983 : tensor<1x1200x320xf64>
-    %7985 = stablehlo.reduce(%7984 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %7986 = stablehlo.reshape %7985 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %7987 = stablehlo.broadcast_in_dim %7986, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %7988 = stablehlo.divide %7987, %2987 : tensor<1x1200x1xf64>
-    %7989 = stablehlo.convert %7988 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %7990 = stablehlo.reduce(%7975 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %7991 = stablehlo.reshape %7990 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %7992 = stablehlo.broadcast_in_dim %7991, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7993 = stablehlo.divide %7992, %3003 : tensor<1x1200x1xf32>
-    %7994 = stablehlo.broadcast_in_dim %7989, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %7995 = stablehlo.add %7994, %3006 : tensor<1x1200x1xf32>
-    %7996 = stablehlo.rsqrt %7995 : tensor<1x1200x1xf32>
-    %7997 = stablehlo.broadcast_in_dim %7975, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %7998 = stablehlo.broadcast_in_dim %7993, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %7999 = stablehlo.subtract %7997, %7998 : tensor<1x1200x320xf32>
-    %8000 = stablehlo.broadcast_in_dim %7999, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8001 = stablehlo.broadcast_in_dim %7996, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8002 = stablehlo.multiply %8000, %8001 : tensor<1x1200x320xf32>
-    %8003 = stablehlo.convert %arg353 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8004 = stablehlo.broadcast_in_dim %8002, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8005 = stablehlo.broadcast_in_dim %8003, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8006 = stablehlo.multiply %8004, %8005 : tensor<1x1200x320xf32>
-    %8007 = stablehlo.convert %arg354 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8008 = stablehlo.broadcast_in_dim %8006, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8009 = stablehlo.broadcast_in_dim %8007, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8010 = stablehlo.add %8008, %8009 : tensor<1x1200x320xf32>
-    %8011 = stablehlo.convert %8010 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8012 = stablehlo.reshape %8011 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8013 = stablehlo.convert %8012 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8014 = stablehlo.dot_general %8013, %arg830, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %8015 = stablehlo.broadcast_in_dim %8014, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8016 = stablehlo.multiply %8015, %3226 : tensor<1200x1280xf32>
-    %8017 = stablehlo.broadcast_in_dim %8016, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8018 = stablehlo.broadcast_in_dim %arg831, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %8019 = stablehlo.add %8017, %8018 : tensor<1200x1280xf32>
-    %8020 = stablehlo.convert %8019 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %8021 = stablehlo.reshape %8020 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %8022 = stablehlo.transpose %8021, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %8023 = stablehlo.reshape %8022 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8024 = stablehlo.convolution(%8023, %arg355) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8025 = stablehlo.reshape %arg356 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %8026 = stablehlo.broadcast_in_dim %8024, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8027 = stablehlo.broadcast_in_dim %8025, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8028 = stablehlo.add %8026, %8027 : tensor<1x1280x30x40xbf16>
-    %8029 = stablehlo.reshape %8028 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %8030 = stablehlo.transpose %8029, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %8031 = stablehlo.multiply %8030, %cst_42 : tensor<1x1200x1280xbf16>
-    %8032 = stablehlo.multiply %8030, %3243 : tensor<1x1200x1280xbf16>
-    %8033 = stablehlo.convert %8032 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %8034 = stablehlo.clamp %cst_43, %8033, %cst_44 : tensor<1x1200x1280xf32>
-    %8035 = stablehlo.multiply %8034, %8034 : tensor<1x1200x1280xf32>
-    %8036 = stablehlo.multiply %cst_45, %8035 : tensor<1x1200x1280xf32>
-    %8037 = stablehlo.add %8036, %cst_46 : tensor<1x1200x1280xf32>
-    %8038 = stablehlo.multiply %8037, %8035 : tensor<1x1200x1280xf32>
-    %8039 = stablehlo.add %8038, %cst_47 : tensor<1x1200x1280xf32>
-    %8040 = stablehlo.multiply %8039, %8035 : tensor<1x1200x1280xf32>
-    %8041 = stablehlo.add %8040, %cst_48 : tensor<1x1200x1280xf32>
-    %8042 = stablehlo.multiply %8041, %8035 : tensor<1x1200x1280xf32>
-    %8043 = stablehlo.add %8042, %cst_49 : tensor<1x1200x1280xf32>
-    %8044 = stablehlo.multiply %8043, %8035 : tensor<1x1200x1280xf32>
-    %8045 = stablehlo.add %8044, %cst_50 : tensor<1x1200x1280xf32>
-    %8046 = stablehlo.multiply %8045, %8035 : tensor<1x1200x1280xf32>
-    %8047 = stablehlo.add %8046, %cst_51 : tensor<1x1200x1280xf32>
-    %8048 = stablehlo.multiply %cst_52, %8035 : tensor<1x1200x1280xf32>
-    %8049 = stablehlo.add %8048, %cst_53 : tensor<1x1200x1280xf32>
-    %8050 = stablehlo.multiply %8049, %8035 : tensor<1x1200x1280xf32>
-    %8051 = stablehlo.add %8050, %cst_54 : tensor<1x1200x1280xf32>
-    %8052 = stablehlo.multiply %8051, %8035 : tensor<1x1200x1280xf32>
-    %8053 = stablehlo.add %8052, %cst_55 : tensor<1x1200x1280xf32>
-    %8054 = stablehlo.multiply %8053, %8035 : tensor<1x1200x1280xf32>
-    %8055 = stablehlo.add %8054, %cst_56 : tensor<1x1200x1280xf32>
-    %8056 = stablehlo.multiply %8034, %8047 : tensor<1x1200x1280xf32>
-    %8057 = stablehlo.divide %8056, %8055 : tensor<1x1200x1280xf32>
-    %8058 = stablehlo.clamp %cst_57, %8057, %cst_58 : tensor<1x1200x1280xf32>
-    %8059 = stablehlo.convert %8058 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %8060 = stablehlo.add %8059, %cst_40 : tensor<1x1200x1280xbf16>
-    %8061 = stablehlo.multiply %8060, %8031 : tensor<1x1200x1280xbf16>
-    %8062 = stablehlo.reshape %8061 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %8063 = stablehlo.dot_general %8062, %arg832, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %8064 = stablehlo.reshape %8063 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8065 = stablehlo.broadcast_in_dim %8064, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8066 = stablehlo.broadcast_in_dim %arg357, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %8067 = stablehlo.add %8065, %8066 : tensor<1x1200x320xbf16>
-    %8068 = stablehlo.reshape %8067 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8069 = stablehlo.reshape %8068 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8070 = stablehlo.add %8069, %7974 : tensor<1x1200x320xbf16>
-    %8071 = stablehlo.convert %8070 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8072 = stablehlo.convert %8071 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8073 = stablehlo.reduce(%8072 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8074 = stablehlo.reshape %8073 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8075 = stablehlo.broadcast_in_dim %8074, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8076 = stablehlo.divide %8075, %2987 : tensor<1x1200x1xf64>
-    %8077 = stablehlo.broadcast_in_dim %8072, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8078 = stablehlo.broadcast_in_dim %8076, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8079 = stablehlo.subtract %8077, %8078 : tensor<1x1200x320xf64>
-    %8080 = stablehlo.multiply %8079, %8079 : tensor<1x1200x320xf64>
-    %8081 = stablehlo.reduce(%8080 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8082 = stablehlo.reshape %8081 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8083 = stablehlo.broadcast_in_dim %8082, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8084 = stablehlo.divide %8083, %2987 : tensor<1x1200x1xf64>
-    %8085 = stablehlo.convert %8084 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8086 = stablehlo.reduce(%8071 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8087 = stablehlo.reshape %8086 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %8088 = stablehlo.broadcast_in_dim %8087, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8089 = stablehlo.divide %8088, %3003 : tensor<1x1200x1xf32>
-    %8090 = stablehlo.broadcast_in_dim %8085, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8091 = stablehlo.add %8090, %3006 : tensor<1x1200x1xf32>
-    %8092 = stablehlo.rsqrt %8091 : tensor<1x1200x1xf32>
-    %8093 = stablehlo.broadcast_in_dim %8071, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8094 = stablehlo.broadcast_in_dim %8089, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8095 = stablehlo.subtract %8093, %8094 : tensor<1x1200x320xf32>
-    %8096 = stablehlo.broadcast_in_dim %8095, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8097 = stablehlo.broadcast_in_dim %8092, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8098 = stablehlo.multiply %8096, %8097 : tensor<1x1200x320xf32>
-    %8099 = stablehlo.convert %arg358 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8100 = stablehlo.broadcast_in_dim %8098, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8101 = stablehlo.broadcast_in_dim %8099, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8102 = stablehlo.multiply %8100, %8101 : tensor<1x1200x320xf32>
-    %8103 = stablehlo.convert %arg359 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8104 = stablehlo.broadcast_in_dim %8102, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8105 = stablehlo.broadcast_in_dim %8103, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8106 = stablehlo.add %8104, %8105 : tensor<1x1200x320xf32>
-    %8107 = stablehlo.convert %8106 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8108 = stablehlo.reshape %8107 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8109 = stablehlo.convert %8108 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8110 = stablehlo.dot_general %8109, %arg833, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8111 = stablehlo.broadcast_in_dim %8110, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8112 = stablehlo.multiply %8111, %3065 : tensor<1200x320xf32>
-    %8113 = stablehlo.broadcast_in_dim %8112, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8114 = stablehlo.broadcast_in_dim %arg834, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8115 = stablehlo.add %8113, %8114 : tensor<1200x320xf32>
-    %8116 = stablehlo.convert %8115 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8117 = stablehlo.reshape %8116 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8118 = stablehlo.reshape %8117 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8119 = stablehlo.transpose %8118, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8120 = stablehlo.transpose %8107, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %8121 = stablehlo.reshape %8120 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %8122 = stablehlo.convolution(%8121, %arg360) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %8123 = stablehlo.reshape %arg361 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %8124 = stablehlo.broadcast_in_dim %8122, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %8125 = stablehlo.broadcast_in_dim %8123, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %8126 = stablehlo.add %8124, %8125 : tensor<1x320x15x20xbf16>
-    %8127 = stablehlo.reshape %8126 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %8128 = stablehlo.transpose %8127, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %8129 = stablehlo.convert %8128 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %8130 = stablehlo.convert %8129 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %8131 = stablehlo.reduce(%8130 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8132 = stablehlo.reshape %8131 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8133 = stablehlo.broadcast_in_dim %8132, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8134 = stablehlo.divide %8133, %3088 : tensor<1x300x1xf64>
-    %8135 = stablehlo.broadcast_in_dim %8130, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %8136 = stablehlo.broadcast_in_dim %8134, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %8137 = stablehlo.subtract %8135, %8136 : tensor<1x300x320xf64>
-    %8138 = stablehlo.multiply %8137, %8137 : tensor<1x300x320xf64>
-    %8139 = stablehlo.reduce(%8138 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8140 = stablehlo.reshape %8139 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8141 = stablehlo.broadcast_in_dim %8140, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8142 = stablehlo.divide %8141, %3088 : tensor<1x300x1xf64>
-    %8143 = stablehlo.convert %8142 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %8144 = stablehlo.reduce(%8129 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %8145 = stablehlo.reshape %8144 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %8146 = stablehlo.broadcast_in_dim %8145, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8147 = stablehlo.divide %8146, %3102 : tensor<1x300x1xf32>
-    %8148 = stablehlo.broadcast_in_dim %8143, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8149 = stablehlo.add %8148, %136 : tensor<1x300x1xf32>
-    %8150 = stablehlo.rsqrt %8149 : tensor<1x300x1xf32>
-    %8151 = stablehlo.broadcast_in_dim %8129, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8152 = stablehlo.broadcast_in_dim %8147, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8153 = stablehlo.subtract %8151, %8152 : tensor<1x300x320xf32>
-    %8154 = stablehlo.broadcast_in_dim %8153, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8155 = stablehlo.broadcast_in_dim %8150, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8156 = stablehlo.multiply %8154, %8155 : tensor<1x300x320xf32>
-    %8157 = stablehlo.convert %arg362 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8158 = stablehlo.broadcast_in_dim %8156, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8159 = stablehlo.broadcast_in_dim %8157, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8160 = stablehlo.multiply %8158, %8159 : tensor<1x300x320xf32>
-    %8161 = stablehlo.convert %arg363 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8162 = stablehlo.broadcast_in_dim %8160, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8163 = stablehlo.broadcast_in_dim %8161, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8164 = stablehlo.add %8162, %8163 : tensor<1x300x320xf32>
-    %8165 = stablehlo.convert %8164 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %8166 = stablehlo.reshape %8165 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %8167 = stablehlo.convert %8166 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %8168 = stablehlo.dot_general %8167, %arg835, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8169 = stablehlo.broadcast_in_dim %8168, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8170 = stablehlo.multiply %8169, %3126 : tensor<300x320xf32>
-    %8171 = stablehlo.broadcast_in_dim %8170, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8172 = stablehlo.broadcast_in_dim %arg836, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8173 = stablehlo.add %8171, %8172 : tensor<300x320xf32>
-    %8174 = stablehlo.convert %8173 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8175 = stablehlo.reshape %8174 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8176 = stablehlo.reshape %8175 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8177 = stablehlo.transpose %8176, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8178 = stablehlo.dot_general %8167, %arg837, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8179 = stablehlo.broadcast_in_dim %8178, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8180 = stablehlo.multiply %8179, %3126 : tensor<300x320xf32>
-    %8181 = stablehlo.broadcast_in_dim %8180, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8182 = stablehlo.broadcast_in_dim %arg838, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8183 = stablehlo.add %8181, %8182 : tensor<300x320xf32>
-    %8184 = stablehlo.convert %8183 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8185 = stablehlo.reshape %8184 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8186 = stablehlo.reshape %8185 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8187 = stablehlo.transpose %8186, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8188 = stablehlo.transpose %8177, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %8189 = stablehlo.reshape %8119 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8190 = stablehlo.reshape %8188 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8191 = stablehlo.broadcast_in_dim %8190, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8192 = stablehlo.dot_general %8189, %8191, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8193 = stablehlo.reshape %8192 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8194 = stablehlo.broadcast_in_dim %8193, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8195 = stablehlo.divide %8194, %3152 : tensor<1x5x1200x300xbf16>
-    %8196 = stablehlo.convert %8195 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %8197 = stablehlo.reduce(%8196 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8198 = stablehlo.reshape %8197 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8199 = stablehlo.broadcast_in_dim %8196, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8200 = stablehlo.broadcast_in_dim %8198, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8201 = stablehlo.subtract %8199, %8200 : tensor<1x5x1200x300xf32>
-    %8202 = stablehlo.exponential %8201 : tensor<1x5x1200x300xf32>
-    %8203 = stablehlo.reduce(%8202 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8204 = stablehlo.reshape %8203 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8205 = stablehlo.broadcast_in_dim %8202, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8206 = stablehlo.broadcast_in_dim %8204, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8207 = stablehlo.divide %8205, %8206 : tensor<1x5x1200x300xf32>
-    %8208 = stablehlo.convert %8207 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %8209 = stablehlo.reshape %8208 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8210 = stablehlo.reshape %8187 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8211 = stablehlo.broadcast_in_dim %8210, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8212 = stablehlo.dot_general %8209, %8211, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8213 = stablehlo.reshape %8212 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8214 = stablehlo.transpose %8213, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8215 = stablehlo.reshape %8214 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %8216 = stablehlo.reshape %8215 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8217 = stablehlo.convert %8216 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8218 = stablehlo.dot_general %8217, %arg839, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8219 = stablehlo.broadcast_in_dim %8218, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8220 = stablehlo.multiply %8219, %3065 : tensor<1200x320xf32>
-    %8221 = stablehlo.broadcast_in_dim %8220, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8222 = stablehlo.broadcast_in_dim %arg840, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8223 = stablehlo.add %8221, %8222 : tensor<1200x320xf32>
-    %8224 = stablehlo.convert %8223 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8225 = stablehlo.reshape %8224 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8226 = stablehlo.add %8225, %8070 : tensor<1x1200x320xbf16>
-    %8227 = stablehlo.convert %8226 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8228 = stablehlo.convert %8227 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8229 = stablehlo.reduce(%8228 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8230 = stablehlo.reshape %8229 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8231 = stablehlo.broadcast_in_dim %8230, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8232 = stablehlo.divide %8231, %2987 : tensor<1x1200x1xf64>
-    %8233 = stablehlo.broadcast_in_dim %8228, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8234 = stablehlo.broadcast_in_dim %8232, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8235 = stablehlo.subtract %8233, %8234 : tensor<1x1200x320xf64>
-    %8236 = stablehlo.multiply %8235, %8235 : tensor<1x1200x320xf64>
-    %8237 = stablehlo.reduce(%8236 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8238 = stablehlo.reshape %8237 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8239 = stablehlo.broadcast_in_dim %8238, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8240 = stablehlo.divide %8239, %2987 : tensor<1x1200x1xf64>
-    %8241 = stablehlo.convert %8240 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8242 = stablehlo.reduce(%8227 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8243 = stablehlo.reshape %8242 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %8244 = stablehlo.broadcast_in_dim %8243, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8245 = stablehlo.divide %8244, %3003 : tensor<1x1200x1xf32>
-    %8246 = stablehlo.broadcast_in_dim %8241, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8247 = stablehlo.add %8246, %3006 : tensor<1x1200x1xf32>
-    %8248 = stablehlo.rsqrt %8247 : tensor<1x1200x1xf32>
-    %8249 = stablehlo.broadcast_in_dim %8227, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8250 = stablehlo.broadcast_in_dim %8245, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8251 = stablehlo.subtract %8249, %8250 : tensor<1x1200x320xf32>
-    %8252 = stablehlo.broadcast_in_dim %8251, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8253 = stablehlo.broadcast_in_dim %8248, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8254 = stablehlo.multiply %8252, %8253 : tensor<1x1200x320xf32>
-    %8255 = stablehlo.convert %arg364 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8256 = stablehlo.broadcast_in_dim %8254, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8257 = stablehlo.broadcast_in_dim %8255, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8258 = stablehlo.multiply %8256, %8257 : tensor<1x1200x320xf32>
-    %8259 = stablehlo.convert %arg365 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8260 = stablehlo.broadcast_in_dim %8258, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8261 = stablehlo.broadcast_in_dim %8259, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8262 = stablehlo.add %8260, %8261 : tensor<1x1200x320xf32>
-    %8263 = stablehlo.convert %8262 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8264 = stablehlo.reshape %8263 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8265 = stablehlo.convert %8264 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8266 = stablehlo.dot_general %8265, %arg841, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %8267 = stablehlo.broadcast_in_dim %8266, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8268 = stablehlo.multiply %8267, %3226 : tensor<1200x1280xf32>
-    %8269 = stablehlo.broadcast_in_dim %8268, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8270 = stablehlo.broadcast_in_dim %arg842, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %8271 = stablehlo.add %8269, %8270 : tensor<1200x1280xf32>
-    %8272 = stablehlo.convert %8271 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %8273 = stablehlo.reshape %8272 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %8274 = stablehlo.transpose %8273, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %8275 = stablehlo.reshape %8274 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8276 = stablehlo.convolution(%8275, %arg366) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8277 = stablehlo.reshape %arg367 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %8278 = stablehlo.broadcast_in_dim %8276, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8279 = stablehlo.broadcast_in_dim %8277, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8280 = stablehlo.add %8278, %8279 : tensor<1x1280x30x40xbf16>
-    %8281 = stablehlo.reshape %8280 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %8282 = stablehlo.transpose %8281, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %8283 = stablehlo.multiply %8282, %cst_42 : tensor<1x1200x1280xbf16>
-    %8284 = stablehlo.multiply %8282, %3243 : tensor<1x1200x1280xbf16>
-    %8285 = stablehlo.convert %8284 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %8286 = stablehlo.clamp %cst_43, %8285, %cst_44 : tensor<1x1200x1280xf32>
-    %8287 = stablehlo.multiply %8286, %8286 : tensor<1x1200x1280xf32>
-    %8288 = stablehlo.multiply %cst_45, %8287 : tensor<1x1200x1280xf32>
-    %8289 = stablehlo.add %8288, %cst_46 : tensor<1x1200x1280xf32>
-    %8290 = stablehlo.multiply %8289, %8287 : tensor<1x1200x1280xf32>
-    %8291 = stablehlo.add %8290, %cst_47 : tensor<1x1200x1280xf32>
-    %8292 = stablehlo.multiply %8291, %8287 : tensor<1x1200x1280xf32>
-    %8293 = stablehlo.add %8292, %cst_48 : tensor<1x1200x1280xf32>
-    %8294 = stablehlo.multiply %8293, %8287 : tensor<1x1200x1280xf32>
-    %8295 = stablehlo.add %8294, %cst_49 : tensor<1x1200x1280xf32>
-    %8296 = stablehlo.multiply %8295, %8287 : tensor<1x1200x1280xf32>
-    %8297 = stablehlo.add %8296, %cst_50 : tensor<1x1200x1280xf32>
-    %8298 = stablehlo.multiply %8297, %8287 : tensor<1x1200x1280xf32>
-    %8299 = stablehlo.add %8298, %cst_51 : tensor<1x1200x1280xf32>
-    %8300 = stablehlo.multiply %cst_52, %8287 : tensor<1x1200x1280xf32>
-    %8301 = stablehlo.add %8300, %cst_53 : tensor<1x1200x1280xf32>
-    %8302 = stablehlo.multiply %8301, %8287 : tensor<1x1200x1280xf32>
-    %8303 = stablehlo.add %8302, %cst_54 : tensor<1x1200x1280xf32>
-    %8304 = stablehlo.multiply %8303, %8287 : tensor<1x1200x1280xf32>
-    %8305 = stablehlo.add %8304, %cst_55 : tensor<1x1200x1280xf32>
-    %8306 = stablehlo.multiply %8305, %8287 : tensor<1x1200x1280xf32>
-    %8307 = stablehlo.add %8306, %cst_56 : tensor<1x1200x1280xf32>
-    %8308 = stablehlo.multiply %8286, %8299 : tensor<1x1200x1280xf32>
-    %8309 = stablehlo.divide %8308, %8307 : tensor<1x1200x1280xf32>
-    %8310 = stablehlo.clamp %cst_57, %8309, %cst_58 : tensor<1x1200x1280xf32>
-    %8311 = stablehlo.convert %8310 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %8312 = stablehlo.add %8311, %cst_40 : tensor<1x1200x1280xbf16>
-    %8313 = stablehlo.multiply %8312, %8283 : tensor<1x1200x1280xbf16>
-    %8314 = stablehlo.reshape %8313 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %8315 = stablehlo.dot_general %8314, %arg843, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %8316 = stablehlo.reshape %8315 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8317 = stablehlo.broadcast_in_dim %8316, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8318 = stablehlo.broadcast_in_dim %arg368, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %8319 = stablehlo.add %8317, %8318 : tensor<1x1200x320xbf16>
-    %8320 = stablehlo.reshape %8319 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8321 = stablehlo.reshape %8320 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8322 = stablehlo.add %8321, %8226 : tensor<1x1200x320xbf16>
-    %8323 = stablehlo.convert %8322 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8324 = stablehlo.convert %8323 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8325 = stablehlo.reduce(%8324 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8326 = stablehlo.reshape %8325 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8327 = stablehlo.broadcast_in_dim %8326, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8328 = stablehlo.divide %8327, %2987 : tensor<1x1200x1xf64>
-    %8329 = stablehlo.broadcast_in_dim %8324, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8330 = stablehlo.broadcast_in_dim %8328, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8331 = stablehlo.subtract %8329, %8330 : tensor<1x1200x320xf64>
-    %8332 = stablehlo.multiply %8331, %8331 : tensor<1x1200x320xf64>
-    %8333 = stablehlo.reduce(%8332 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8334 = stablehlo.reshape %8333 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8335 = stablehlo.broadcast_in_dim %8334, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8336 = stablehlo.divide %8335, %2987 : tensor<1x1200x1xf64>
-    %8337 = stablehlo.convert %8336 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8338 = stablehlo.reduce(%8323 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8339 = stablehlo.reshape %8338 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %8340 = stablehlo.broadcast_in_dim %8339, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8341 = stablehlo.divide %8340, %3003 : tensor<1x1200x1xf32>
-    %8342 = stablehlo.broadcast_in_dim %8337, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8343 = stablehlo.add %8342, %3006 : tensor<1x1200x1xf32>
-    %8344 = stablehlo.rsqrt %8343 : tensor<1x1200x1xf32>
-    %8345 = stablehlo.broadcast_in_dim %8323, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8346 = stablehlo.broadcast_in_dim %8341, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8347 = stablehlo.subtract %8345, %8346 : tensor<1x1200x320xf32>
-    %8348 = stablehlo.broadcast_in_dim %8347, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8349 = stablehlo.broadcast_in_dim %8344, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8350 = stablehlo.multiply %8348, %8349 : tensor<1x1200x320xf32>
-    %8351 = stablehlo.convert %arg369 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8352 = stablehlo.broadcast_in_dim %8350, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8353 = stablehlo.broadcast_in_dim %8351, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8354 = stablehlo.multiply %8352, %8353 : tensor<1x1200x320xf32>
-    %8355 = stablehlo.convert %arg370 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8356 = stablehlo.broadcast_in_dim %8354, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8357 = stablehlo.broadcast_in_dim %8355, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8358 = stablehlo.add %8356, %8357 : tensor<1x1200x320xf32>
-    %8359 = stablehlo.convert %8358 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8360 = stablehlo.reshape %8359 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8361 = stablehlo.convert %8360 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8362 = stablehlo.dot_general %8361, %arg844, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8363 = stablehlo.broadcast_in_dim %8362, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8364 = stablehlo.multiply %8363, %3065 : tensor<1200x320xf32>
-    %8365 = stablehlo.broadcast_in_dim %8364, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8366 = stablehlo.broadcast_in_dim %arg845, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8367 = stablehlo.add %8365, %8366 : tensor<1200x320xf32>
-    %8368 = stablehlo.convert %8367 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8369 = stablehlo.reshape %8368 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8370 = stablehlo.reshape %8369 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8371 = stablehlo.transpose %8370, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8372 = stablehlo.transpose %8359, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %8373 = stablehlo.reshape %8372 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %8374 = stablehlo.convolution(%8373, %arg371) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %8375 = stablehlo.reshape %arg372 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %8376 = stablehlo.broadcast_in_dim %8374, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %8377 = stablehlo.broadcast_in_dim %8375, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %8378 = stablehlo.add %8376, %8377 : tensor<1x320x15x20xbf16>
-    %8379 = stablehlo.reshape %8378 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %8380 = stablehlo.transpose %8379, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %8381 = stablehlo.convert %8380 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %8382 = stablehlo.convert %8381 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %8383 = stablehlo.reduce(%8382 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8384 = stablehlo.reshape %8383 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8385 = stablehlo.broadcast_in_dim %8384, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8386 = stablehlo.divide %8385, %3088 : tensor<1x300x1xf64>
-    %8387 = stablehlo.broadcast_in_dim %8382, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %8388 = stablehlo.broadcast_in_dim %8386, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %8389 = stablehlo.subtract %8387, %8388 : tensor<1x300x320xf64>
-    %8390 = stablehlo.multiply %8389, %8389 : tensor<1x300x320xf64>
-    %8391 = stablehlo.reduce(%8390 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8392 = stablehlo.reshape %8391 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8393 = stablehlo.broadcast_in_dim %8392, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8394 = stablehlo.divide %8393, %3088 : tensor<1x300x1xf64>
-    %8395 = stablehlo.convert %8394 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %8396 = stablehlo.reduce(%8381 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %8397 = stablehlo.reshape %8396 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %8398 = stablehlo.broadcast_in_dim %8397, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8399 = stablehlo.divide %8398, %3102 : tensor<1x300x1xf32>
-    %8400 = stablehlo.broadcast_in_dim %8395, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8401 = stablehlo.add %8400, %136 : tensor<1x300x1xf32>
-    %8402 = stablehlo.rsqrt %8401 : tensor<1x300x1xf32>
-    %8403 = stablehlo.broadcast_in_dim %8381, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8404 = stablehlo.broadcast_in_dim %8399, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8405 = stablehlo.subtract %8403, %8404 : tensor<1x300x320xf32>
-    %8406 = stablehlo.broadcast_in_dim %8405, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8407 = stablehlo.broadcast_in_dim %8402, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8408 = stablehlo.multiply %8406, %8407 : tensor<1x300x320xf32>
-    %8409 = stablehlo.convert %arg373 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8410 = stablehlo.broadcast_in_dim %8408, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8411 = stablehlo.broadcast_in_dim %8409, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8412 = stablehlo.multiply %8410, %8411 : tensor<1x300x320xf32>
-    %8413 = stablehlo.convert %arg374 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8414 = stablehlo.broadcast_in_dim %8412, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8415 = stablehlo.broadcast_in_dim %8413, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8416 = stablehlo.add %8414, %8415 : tensor<1x300x320xf32>
-    %8417 = stablehlo.convert %8416 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %8418 = stablehlo.reshape %8417 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %8419 = stablehlo.convert %8418 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %8420 = stablehlo.dot_general %8419, %arg846, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8421 = stablehlo.broadcast_in_dim %8420, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8422 = stablehlo.multiply %8421, %3126 : tensor<300x320xf32>
-    %8423 = stablehlo.broadcast_in_dim %8422, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8424 = stablehlo.broadcast_in_dim %arg847, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8425 = stablehlo.add %8423, %8424 : tensor<300x320xf32>
-    %8426 = stablehlo.convert %8425 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8427 = stablehlo.reshape %8426 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8428 = stablehlo.reshape %8427 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8429 = stablehlo.transpose %8428, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8430 = stablehlo.dot_general %8419, %arg848, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8431 = stablehlo.broadcast_in_dim %8430, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8432 = stablehlo.multiply %8431, %3126 : tensor<300x320xf32>
-    %8433 = stablehlo.broadcast_in_dim %8432, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8434 = stablehlo.broadcast_in_dim %arg849, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8435 = stablehlo.add %8433, %8434 : tensor<300x320xf32>
-    %8436 = stablehlo.convert %8435 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8437 = stablehlo.reshape %8436 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8438 = stablehlo.reshape %8437 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8439 = stablehlo.transpose %8438, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8440 = stablehlo.transpose %8429, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %8441 = stablehlo.reshape %8371 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8442 = stablehlo.reshape %8440 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8443 = stablehlo.broadcast_in_dim %8442, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8444 = stablehlo.dot_general %8441, %8443, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8445 = stablehlo.reshape %8444 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8446 = stablehlo.broadcast_in_dim %8445, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8447 = stablehlo.divide %8446, %3152 : tensor<1x5x1200x300xbf16>
-    %8448 = stablehlo.convert %8447 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %8449 = stablehlo.reduce(%8448 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8450 = stablehlo.reshape %8449 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8451 = stablehlo.broadcast_in_dim %8448, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8452 = stablehlo.broadcast_in_dim %8450, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8453 = stablehlo.subtract %8451, %8452 : tensor<1x5x1200x300xf32>
-    %8454 = stablehlo.exponential %8453 : tensor<1x5x1200x300xf32>
-    %8455 = stablehlo.reduce(%8454 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8456 = stablehlo.reshape %8455 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8457 = stablehlo.broadcast_in_dim %8454, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8458 = stablehlo.broadcast_in_dim %8456, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8459 = stablehlo.divide %8457, %8458 : tensor<1x5x1200x300xf32>
-    %8460 = stablehlo.convert %8459 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %8461 = stablehlo.reshape %8460 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8462 = stablehlo.reshape %8439 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8463 = stablehlo.broadcast_in_dim %8462, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8464 = stablehlo.dot_general %8461, %8463, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8465 = stablehlo.reshape %8464 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8466 = stablehlo.transpose %8465, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8467 = stablehlo.reshape %8466 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %8468 = stablehlo.reshape %8467 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8469 = stablehlo.convert %8468 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8470 = stablehlo.dot_general %8469, %arg850, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8471 = stablehlo.broadcast_in_dim %8470, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8472 = stablehlo.multiply %8471, %3065 : tensor<1200x320xf32>
-    %8473 = stablehlo.broadcast_in_dim %8472, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8474 = stablehlo.broadcast_in_dim %arg851, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8475 = stablehlo.add %8473, %8474 : tensor<1200x320xf32>
-    %8476 = stablehlo.convert %8475 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8477 = stablehlo.reshape %8476 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8478 = stablehlo.add %8477, %8322 : tensor<1x1200x320xbf16>
-    %8479 = stablehlo.convert %8478 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8480 = stablehlo.convert %8479 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8481 = stablehlo.reduce(%8480 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8482 = stablehlo.reshape %8481 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8483 = stablehlo.broadcast_in_dim %8482, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8484 = stablehlo.divide %8483, %2987 : tensor<1x1200x1xf64>
-    %8485 = stablehlo.broadcast_in_dim %8480, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8486 = stablehlo.broadcast_in_dim %8484, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8487 = stablehlo.subtract %8485, %8486 : tensor<1x1200x320xf64>
-    %8488 = stablehlo.multiply %8487, %8487 : tensor<1x1200x320xf64>
-    %8489 = stablehlo.reduce(%8488 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8490 = stablehlo.reshape %8489 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8491 = stablehlo.broadcast_in_dim %8490, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8492 = stablehlo.divide %8491, %2987 : tensor<1x1200x1xf64>
-    %8493 = stablehlo.convert %8492 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8494 = stablehlo.reduce(%8479 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8495 = stablehlo.reshape %8494 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %8496 = stablehlo.broadcast_in_dim %8495, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8497 = stablehlo.divide %8496, %3003 : tensor<1x1200x1xf32>
-    %8498 = stablehlo.broadcast_in_dim %8493, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8499 = stablehlo.add %8498, %3006 : tensor<1x1200x1xf32>
-    %8500 = stablehlo.rsqrt %8499 : tensor<1x1200x1xf32>
-    %8501 = stablehlo.broadcast_in_dim %8479, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8502 = stablehlo.broadcast_in_dim %8497, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8503 = stablehlo.subtract %8501, %8502 : tensor<1x1200x320xf32>
-    %8504 = stablehlo.broadcast_in_dim %8503, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8505 = stablehlo.broadcast_in_dim %8500, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8506 = stablehlo.multiply %8504, %8505 : tensor<1x1200x320xf32>
-    %8507 = stablehlo.convert %arg375 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8508 = stablehlo.broadcast_in_dim %8506, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8509 = stablehlo.broadcast_in_dim %8507, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8510 = stablehlo.multiply %8508, %8509 : tensor<1x1200x320xf32>
-    %8511 = stablehlo.convert %arg376 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8512 = stablehlo.broadcast_in_dim %8510, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8513 = stablehlo.broadcast_in_dim %8511, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8514 = stablehlo.add %8512, %8513 : tensor<1x1200x320xf32>
-    %8515 = stablehlo.convert %8514 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8516 = stablehlo.reshape %8515 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8517 = stablehlo.convert %8516 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8518 = stablehlo.dot_general %8517, %arg852, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %8519 = stablehlo.broadcast_in_dim %8518, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8520 = stablehlo.multiply %8519, %3226 : tensor<1200x1280xf32>
-    %8521 = stablehlo.broadcast_in_dim %8520, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8522 = stablehlo.broadcast_in_dim %arg853, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %8523 = stablehlo.add %8521, %8522 : tensor<1200x1280xf32>
-    %8524 = stablehlo.convert %8523 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %8525 = stablehlo.reshape %8524 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %8526 = stablehlo.transpose %8525, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %8527 = stablehlo.reshape %8526 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8528 = stablehlo.convolution(%8527, %arg377) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8529 = stablehlo.reshape %arg378 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %8530 = stablehlo.broadcast_in_dim %8528, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8531 = stablehlo.broadcast_in_dim %8529, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8532 = stablehlo.add %8530, %8531 : tensor<1x1280x30x40xbf16>
-    %8533 = stablehlo.reshape %8532 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %8534 = stablehlo.transpose %8533, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %8535 = stablehlo.multiply %8534, %cst_42 : tensor<1x1200x1280xbf16>
-    %8536 = stablehlo.multiply %8534, %3243 : tensor<1x1200x1280xbf16>
-    %8537 = stablehlo.convert %8536 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %8538 = stablehlo.clamp %cst_43, %8537, %cst_44 : tensor<1x1200x1280xf32>
-    %8539 = stablehlo.multiply %8538, %8538 : tensor<1x1200x1280xf32>
-    %8540 = stablehlo.multiply %cst_45, %8539 : tensor<1x1200x1280xf32>
-    %8541 = stablehlo.add %8540, %cst_46 : tensor<1x1200x1280xf32>
-    %8542 = stablehlo.multiply %8541, %8539 : tensor<1x1200x1280xf32>
-    %8543 = stablehlo.add %8542, %cst_47 : tensor<1x1200x1280xf32>
-    %8544 = stablehlo.multiply %8543, %8539 : tensor<1x1200x1280xf32>
-    %8545 = stablehlo.add %8544, %cst_48 : tensor<1x1200x1280xf32>
-    %8546 = stablehlo.multiply %8545, %8539 : tensor<1x1200x1280xf32>
-    %8547 = stablehlo.add %8546, %cst_49 : tensor<1x1200x1280xf32>
-    %8548 = stablehlo.multiply %8547, %8539 : tensor<1x1200x1280xf32>
-    %8549 = stablehlo.add %8548, %cst_50 : tensor<1x1200x1280xf32>
-    %8550 = stablehlo.multiply %8549, %8539 : tensor<1x1200x1280xf32>
-    %8551 = stablehlo.add %8550, %cst_51 : tensor<1x1200x1280xf32>
-    %8552 = stablehlo.multiply %cst_52, %8539 : tensor<1x1200x1280xf32>
-    %8553 = stablehlo.add %8552, %cst_53 : tensor<1x1200x1280xf32>
-    %8554 = stablehlo.multiply %8553, %8539 : tensor<1x1200x1280xf32>
-    %8555 = stablehlo.add %8554, %cst_54 : tensor<1x1200x1280xf32>
-    %8556 = stablehlo.multiply %8555, %8539 : tensor<1x1200x1280xf32>
-    %8557 = stablehlo.add %8556, %cst_55 : tensor<1x1200x1280xf32>
-    %8558 = stablehlo.multiply %8557, %8539 : tensor<1x1200x1280xf32>
-    %8559 = stablehlo.add %8558, %cst_56 : tensor<1x1200x1280xf32>
-    %8560 = stablehlo.multiply %8538, %8551 : tensor<1x1200x1280xf32>
-    %8561 = stablehlo.divide %8560, %8559 : tensor<1x1200x1280xf32>
-    %8562 = stablehlo.clamp %cst_57, %8561, %cst_58 : tensor<1x1200x1280xf32>
-    %8563 = stablehlo.convert %8562 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %8564 = stablehlo.add %8563, %cst_40 : tensor<1x1200x1280xbf16>
-    %8565 = stablehlo.multiply %8564, %8535 : tensor<1x1200x1280xbf16>
-    %8566 = stablehlo.reshape %8565 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %8567 = stablehlo.dot_general %8566, %arg854, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %8568 = stablehlo.reshape %8567 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8569 = stablehlo.broadcast_in_dim %8568, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8570 = stablehlo.broadcast_in_dim %arg379, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %8571 = stablehlo.add %8569, %8570 : tensor<1x1200x320xbf16>
-    %8572 = stablehlo.reshape %8571 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8573 = stablehlo.reshape %8572 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8574 = stablehlo.add %8573, %8478 : tensor<1x1200x320xbf16>
-    %8575 = stablehlo.convert %8574 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8576 = stablehlo.convert %8575 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8577 = stablehlo.reduce(%8576 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8578 = stablehlo.reshape %8577 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8579 = stablehlo.broadcast_in_dim %8578, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8580 = stablehlo.divide %8579, %2987 : tensor<1x1200x1xf64>
-    %8581 = stablehlo.broadcast_in_dim %8576, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8582 = stablehlo.broadcast_in_dim %8580, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8583 = stablehlo.subtract %8581, %8582 : tensor<1x1200x320xf64>
-    %8584 = stablehlo.multiply %8583, %8583 : tensor<1x1200x320xf64>
-    %8585 = stablehlo.reduce(%8584 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8586 = stablehlo.reshape %8585 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8587 = stablehlo.broadcast_in_dim %8586, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8588 = stablehlo.divide %8587, %2987 : tensor<1x1200x1xf64>
-    %8589 = stablehlo.convert %8588 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8590 = stablehlo.reduce(%8575 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8591 = stablehlo.reshape %8590 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %8592 = stablehlo.broadcast_in_dim %8591, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8593 = stablehlo.divide %8592, %3003 : tensor<1x1200x1xf32>
-    %8594 = stablehlo.broadcast_in_dim %8589, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8595 = stablehlo.add %8594, %3006 : tensor<1x1200x1xf32>
-    %8596 = stablehlo.rsqrt %8595 : tensor<1x1200x1xf32>
-    %8597 = stablehlo.broadcast_in_dim %8575, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8598 = stablehlo.broadcast_in_dim %8593, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8599 = stablehlo.subtract %8597, %8598 : tensor<1x1200x320xf32>
-    %8600 = stablehlo.broadcast_in_dim %8599, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8601 = stablehlo.broadcast_in_dim %8596, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8602 = stablehlo.multiply %8600, %8601 : tensor<1x1200x320xf32>
-    %8603 = stablehlo.convert %arg380 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8604 = stablehlo.broadcast_in_dim %8602, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8605 = stablehlo.broadcast_in_dim %8603, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8606 = stablehlo.multiply %8604, %8605 : tensor<1x1200x320xf32>
-    %8607 = stablehlo.convert %arg381 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8608 = stablehlo.broadcast_in_dim %8606, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8609 = stablehlo.broadcast_in_dim %8607, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8610 = stablehlo.add %8608, %8609 : tensor<1x1200x320xf32>
-    %8611 = stablehlo.convert %8610 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8612 = stablehlo.reshape %8611 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8613 = stablehlo.convert %8612 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8614 = stablehlo.dot_general %8613, %arg855, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8615 = stablehlo.broadcast_in_dim %8614, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8616 = stablehlo.multiply %8615, %3065 : tensor<1200x320xf32>
-    %8617 = stablehlo.broadcast_in_dim %8616, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8618 = stablehlo.broadcast_in_dim %arg856, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8619 = stablehlo.add %8617, %8618 : tensor<1200x320xf32>
-    %8620 = stablehlo.convert %8619 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8621 = stablehlo.reshape %8620 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8622 = stablehlo.reshape %8621 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8623 = stablehlo.transpose %8622, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8624 = stablehlo.transpose %8611, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %8625 = stablehlo.reshape %8624 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %8626 = stablehlo.convolution(%8625, %arg382) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %8627 = stablehlo.reshape %arg383 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %8628 = stablehlo.broadcast_in_dim %8626, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %8629 = stablehlo.broadcast_in_dim %8627, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %8630 = stablehlo.add %8628, %8629 : tensor<1x320x15x20xbf16>
-    %8631 = stablehlo.reshape %8630 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %8632 = stablehlo.transpose %8631, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %8633 = stablehlo.convert %8632 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %8634 = stablehlo.convert %8633 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %8635 = stablehlo.reduce(%8634 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8636 = stablehlo.reshape %8635 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8637 = stablehlo.broadcast_in_dim %8636, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8638 = stablehlo.divide %8637, %3088 : tensor<1x300x1xf64>
-    %8639 = stablehlo.broadcast_in_dim %8634, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %8640 = stablehlo.broadcast_in_dim %8638, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %8641 = stablehlo.subtract %8639, %8640 : tensor<1x300x320xf64>
-    %8642 = stablehlo.multiply %8641, %8641 : tensor<1x300x320xf64>
-    %8643 = stablehlo.reduce(%8642 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8644 = stablehlo.reshape %8643 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8645 = stablehlo.broadcast_in_dim %8644, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8646 = stablehlo.divide %8645, %3088 : tensor<1x300x1xf64>
-    %8647 = stablehlo.convert %8646 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %8648 = stablehlo.reduce(%8633 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %8649 = stablehlo.reshape %8648 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %8650 = stablehlo.broadcast_in_dim %8649, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8651 = stablehlo.divide %8650, %3102 : tensor<1x300x1xf32>
-    %8652 = stablehlo.broadcast_in_dim %8647, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8653 = stablehlo.add %8652, %136 : tensor<1x300x1xf32>
-    %8654 = stablehlo.rsqrt %8653 : tensor<1x300x1xf32>
-    %8655 = stablehlo.broadcast_in_dim %8633, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8656 = stablehlo.broadcast_in_dim %8651, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8657 = stablehlo.subtract %8655, %8656 : tensor<1x300x320xf32>
-    %8658 = stablehlo.broadcast_in_dim %8657, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8659 = stablehlo.broadcast_in_dim %8654, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8660 = stablehlo.multiply %8658, %8659 : tensor<1x300x320xf32>
-    %8661 = stablehlo.convert %arg384 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8662 = stablehlo.broadcast_in_dim %8660, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8663 = stablehlo.broadcast_in_dim %8661, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8664 = stablehlo.multiply %8662, %8663 : tensor<1x300x320xf32>
-    %8665 = stablehlo.convert %arg385 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8666 = stablehlo.broadcast_in_dim %8664, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8667 = stablehlo.broadcast_in_dim %8665, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8668 = stablehlo.add %8666, %8667 : tensor<1x300x320xf32>
-    %8669 = stablehlo.convert %8668 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %8670 = stablehlo.reshape %8669 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %8671 = stablehlo.convert %8670 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %8672 = stablehlo.dot_general %8671, %arg857, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8673 = stablehlo.broadcast_in_dim %8672, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8674 = stablehlo.multiply %8673, %3126 : tensor<300x320xf32>
-    %8675 = stablehlo.broadcast_in_dim %8674, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8676 = stablehlo.broadcast_in_dim %arg858, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8677 = stablehlo.add %8675, %8676 : tensor<300x320xf32>
-    %8678 = stablehlo.convert %8677 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8679 = stablehlo.reshape %8678 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8680 = stablehlo.reshape %8679 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8681 = stablehlo.transpose %8680, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8682 = stablehlo.dot_general %8671, %arg859, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8683 = stablehlo.broadcast_in_dim %8682, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8684 = stablehlo.multiply %8683, %3126 : tensor<300x320xf32>
-    %8685 = stablehlo.broadcast_in_dim %8684, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8686 = stablehlo.broadcast_in_dim %arg860, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8687 = stablehlo.add %8685, %8686 : tensor<300x320xf32>
-    %8688 = stablehlo.convert %8687 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8689 = stablehlo.reshape %8688 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8690 = stablehlo.reshape %8689 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8691 = stablehlo.transpose %8690, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8692 = stablehlo.transpose %8681, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %8693 = stablehlo.reshape %8623 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8694 = stablehlo.reshape %8692 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8695 = stablehlo.broadcast_in_dim %8694, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8696 = stablehlo.dot_general %8693, %8695, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8697 = stablehlo.reshape %8696 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8698 = stablehlo.broadcast_in_dim %8697, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8699 = stablehlo.divide %8698, %3152 : tensor<1x5x1200x300xbf16>
-    %8700 = stablehlo.convert %8699 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %8701 = stablehlo.reduce(%8700 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8702 = stablehlo.reshape %8701 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8703 = stablehlo.broadcast_in_dim %8700, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8704 = stablehlo.broadcast_in_dim %8702, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8705 = stablehlo.subtract %8703, %8704 : tensor<1x5x1200x300xf32>
-    %8706 = stablehlo.exponential %8705 : tensor<1x5x1200x300xf32>
-    %8707 = stablehlo.reduce(%8706 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8708 = stablehlo.reshape %8707 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8709 = stablehlo.broadcast_in_dim %8706, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8710 = stablehlo.broadcast_in_dim %8708, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8711 = stablehlo.divide %8709, %8710 : tensor<1x5x1200x300xf32>
-    %8712 = stablehlo.convert %8711 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %8713 = stablehlo.reshape %8712 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8714 = stablehlo.reshape %8691 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8715 = stablehlo.broadcast_in_dim %8714, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8716 = stablehlo.dot_general %8713, %8715, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8717 = stablehlo.reshape %8716 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8718 = stablehlo.transpose %8717, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8719 = stablehlo.reshape %8718 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %8720 = stablehlo.reshape %8719 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8721 = stablehlo.convert %8720 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8722 = stablehlo.dot_general %8721, %arg861, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8723 = stablehlo.broadcast_in_dim %8722, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8724 = stablehlo.multiply %8723, %3065 : tensor<1200x320xf32>
-    %8725 = stablehlo.broadcast_in_dim %8724, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8726 = stablehlo.broadcast_in_dim %arg862, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8727 = stablehlo.add %8725, %8726 : tensor<1200x320xf32>
-    %8728 = stablehlo.convert %8727 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8729 = stablehlo.reshape %8728 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8730 = stablehlo.add %8729, %8574 : tensor<1x1200x320xbf16>
-    %8731 = stablehlo.convert %8730 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8732 = stablehlo.convert %8731 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8733 = stablehlo.reduce(%8732 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8734 = stablehlo.reshape %8733 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8735 = stablehlo.broadcast_in_dim %8734, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8736 = stablehlo.divide %8735, %2987 : tensor<1x1200x1xf64>
-    %8737 = stablehlo.broadcast_in_dim %8732, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8738 = stablehlo.broadcast_in_dim %8736, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8739 = stablehlo.subtract %8737, %8738 : tensor<1x1200x320xf64>
-    %8740 = stablehlo.multiply %8739, %8739 : tensor<1x1200x320xf64>
-    %8741 = stablehlo.reduce(%8740 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8742 = stablehlo.reshape %8741 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8743 = stablehlo.broadcast_in_dim %8742, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8744 = stablehlo.divide %8743, %2987 : tensor<1x1200x1xf64>
-    %8745 = stablehlo.convert %8744 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8746 = stablehlo.reduce(%8731 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8747 = stablehlo.reshape %8746 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %8748 = stablehlo.broadcast_in_dim %8747, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8749 = stablehlo.divide %8748, %3003 : tensor<1x1200x1xf32>
-    %8750 = stablehlo.broadcast_in_dim %8745, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8751 = stablehlo.add %8750, %3006 : tensor<1x1200x1xf32>
-    %8752 = stablehlo.rsqrt %8751 : tensor<1x1200x1xf32>
-    %8753 = stablehlo.broadcast_in_dim %8731, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8754 = stablehlo.broadcast_in_dim %8749, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8755 = stablehlo.subtract %8753, %8754 : tensor<1x1200x320xf32>
-    %8756 = stablehlo.broadcast_in_dim %8755, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8757 = stablehlo.broadcast_in_dim %8752, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8758 = stablehlo.multiply %8756, %8757 : tensor<1x1200x320xf32>
-    %8759 = stablehlo.convert %arg386 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8760 = stablehlo.broadcast_in_dim %8758, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8761 = stablehlo.broadcast_in_dim %8759, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8762 = stablehlo.multiply %8760, %8761 : tensor<1x1200x320xf32>
-    %8763 = stablehlo.convert %arg387 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8764 = stablehlo.broadcast_in_dim %8762, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8765 = stablehlo.broadcast_in_dim %8763, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8766 = stablehlo.add %8764, %8765 : tensor<1x1200x320xf32>
-    %8767 = stablehlo.convert %8766 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8768 = stablehlo.reshape %8767 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8769 = stablehlo.convert %8768 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8770 = stablehlo.dot_general %8769, %arg863, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %8771 = stablehlo.broadcast_in_dim %8770, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8772 = stablehlo.multiply %8771, %3226 : tensor<1200x1280xf32>
-    %8773 = stablehlo.broadcast_in_dim %8772, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %8774 = stablehlo.broadcast_in_dim %arg864, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %8775 = stablehlo.add %8773, %8774 : tensor<1200x1280xf32>
-    %8776 = stablehlo.convert %8775 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %8777 = stablehlo.reshape %8776 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %8778 = stablehlo.transpose %8777, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %8779 = stablehlo.reshape %8778 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8780 = stablehlo.convolution(%8779, %arg388) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8781 = stablehlo.reshape %arg389 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %8782 = stablehlo.broadcast_in_dim %8780, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8783 = stablehlo.broadcast_in_dim %8781, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %8784 = stablehlo.add %8782, %8783 : tensor<1x1280x30x40xbf16>
-    %8785 = stablehlo.reshape %8784 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %8786 = stablehlo.transpose %8785, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %8787 = stablehlo.multiply %8786, %cst_42 : tensor<1x1200x1280xbf16>
-    %8788 = stablehlo.multiply %8786, %3243 : tensor<1x1200x1280xbf16>
-    %8789 = stablehlo.convert %8788 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %8790 = stablehlo.clamp %cst_43, %8789, %cst_44 : tensor<1x1200x1280xf32>
-    %8791 = stablehlo.multiply %8790, %8790 : tensor<1x1200x1280xf32>
-    %8792 = stablehlo.multiply %cst_45, %8791 : tensor<1x1200x1280xf32>
-    %8793 = stablehlo.add %8792, %cst_46 : tensor<1x1200x1280xf32>
-    %8794 = stablehlo.multiply %8793, %8791 : tensor<1x1200x1280xf32>
-    %8795 = stablehlo.add %8794, %cst_47 : tensor<1x1200x1280xf32>
-    %8796 = stablehlo.multiply %8795, %8791 : tensor<1x1200x1280xf32>
-    %8797 = stablehlo.add %8796, %cst_48 : tensor<1x1200x1280xf32>
-    %8798 = stablehlo.multiply %8797, %8791 : tensor<1x1200x1280xf32>
-    %8799 = stablehlo.add %8798, %cst_49 : tensor<1x1200x1280xf32>
-    %8800 = stablehlo.multiply %8799, %8791 : tensor<1x1200x1280xf32>
-    %8801 = stablehlo.add %8800, %cst_50 : tensor<1x1200x1280xf32>
-    %8802 = stablehlo.multiply %8801, %8791 : tensor<1x1200x1280xf32>
-    %8803 = stablehlo.add %8802, %cst_51 : tensor<1x1200x1280xf32>
-    %8804 = stablehlo.multiply %cst_52, %8791 : tensor<1x1200x1280xf32>
-    %8805 = stablehlo.add %8804, %cst_53 : tensor<1x1200x1280xf32>
-    %8806 = stablehlo.multiply %8805, %8791 : tensor<1x1200x1280xf32>
-    %8807 = stablehlo.add %8806, %cst_54 : tensor<1x1200x1280xf32>
-    %8808 = stablehlo.multiply %8807, %8791 : tensor<1x1200x1280xf32>
-    %8809 = stablehlo.add %8808, %cst_55 : tensor<1x1200x1280xf32>
-    %8810 = stablehlo.multiply %8809, %8791 : tensor<1x1200x1280xf32>
-    %8811 = stablehlo.add %8810, %cst_56 : tensor<1x1200x1280xf32>
-    %8812 = stablehlo.multiply %8790, %8803 : tensor<1x1200x1280xf32>
-    %8813 = stablehlo.divide %8812, %8811 : tensor<1x1200x1280xf32>
-    %8814 = stablehlo.clamp %cst_57, %8813, %cst_58 : tensor<1x1200x1280xf32>
-    %8815 = stablehlo.convert %8814 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %8816 = stablehlo.add %8815, %cst_40 : tensor<1x1200x1280xbf16>
-    %8817 = stablehlo.multiply %8816, %8787 : tensor<1x1200x1280xbf16>
-    %8818 = stablehlo.reshape %8817 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %8819 = stablehlo.dot_general %8818, %arg865, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %8820 = stablehlo.reshape %8819 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8821 = stablehlo.broadcast_in_dim %8820, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8822 = stablehlo.broadcast_in_dim %arg390, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %8823 = stablehlo.add %8821, %8822 : tensor<1x1200x320xbf16>
-    %8824 = stablehlo.reshape %8823 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8825 = stablehlo.reshape %8824 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8826 = stablehlo.add %8825, %8730 : tensor<1x1200x320xbf16>
-    %8827 = stablehlo.convert %8826 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8828 = stablehlo.convert %8827 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8829 = stablehlo.reduce(%8828 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8830 = stablehlo.reshape %8829 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8831 = stablehlo.broadcast_in_dim %8830, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8832 = stablehlo.divide %8831, %2987 : tensor<1x1200x1xf64>
-    %8833 = stablehlo.broadcast_in_dim %8828, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8834 = stablehlo.broadcast_in_dim %8832, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8835 = stablehlo.subtract %8833, %8834 : tensor<1x1200x320xf64>
-    %8836 = stablehlo.multiply %8835, %8835 : tensor<1x1200x320xf64>
-    %8837 = stablehlo.reduce(%8836 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8838 = stablehlo.reshape %8837 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8839 = stablehlo.broadcast_in_dim %8838, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8840 = stablehlo.divide %8839, %2987 : tensor<1x1200x1xf64>
-    %8841 = stablehlo.convert %8840 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8842 = stablehlo.reduce(%8827 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8843 = stablehlo.reshape %8842 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %8844 = stablehlo.broadcast_in_dim %8843, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8845 = stablehlo.divide %8844, %3003 : tensor<1x1200x1xf32>
-    %8846 = stablehlo.broadcast_in_dim %8841, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %8847 = stablehlo.add %8846, %3006 : tensor<1x1200x1xf32>
-    %8848 = stablehlo.rsqrt %8847 : tensor<1x1200x1xf32>
-    %8849 = stablehlo.broadcast_in_dim %8827, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8850 = stablehlo.broadcast_in_dim %8845, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8851 = stablehlo.subtract %8849, %8850 : tensor<1x1200x320xf32>
-    %8852 = stablehlo.broadcast_in_dim %8851, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8853 = stablehlo.broadcast_in_dim %8848, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %8854 = stablehlo.multiply %8852, %8853 : tensor<1x1200x320xf32>
-    %8855 = stablehlo.convert %arg391 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8856 = stablehlo.broadcast_in_dim %8854, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8857 = stablehlo.broadcast_in_dim %8855, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8858 = stablehlo.multiply %8856, %8857 : tensor<1x1200x320xf32>
-    %8859 = stablehlo.convert %arg392 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8860 = stablehlo.broadcast_in_dim %8858, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %8861 = stablehlo.broadcast_in_dim %8859, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %8862 = stablehlo.add %8860, %8861 : tensor<1x1200x320xf32>
-    %8863 = stablehlo.convert %8862 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %8864 = stablehlo.reshape %8863 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8865 = stablehlo.convert %8864 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8866 = stablehlo.dot_general %8865, %arg866, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8867 = stablehlo.broadcast_in_dim %8866, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8868 = stablehlo.multiply %8867, %3065 : tensor<1200x320xf32>
-    %8869 = stablehlo.broadcast_in_dim %8868, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8870 = stablehlo.broadcast_in_dim %arg867, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8871 = stablehlo.add %8869, %8870 : tensor<1200x320xf32>
-    %8872 = stablehlo.convert %8871 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8873 = stablehlo.reshape %8872 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8874 = stablehlo.reshape %8873 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8875 = stablehlo.transpose %8874, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8876 = stablehlo.transpose %8863, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %8877 = stablehlo.reshape %8876 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %8878 = stablehlo.convolution(%8877, %arg393) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %8879 = stablehlo.reshape %arg394 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %8880 = stablehlo.broadcast_in_dim %8878, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %8881 = stablehlo.broadcast_in_dim %8879, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %8882 = stablehlo.add %8880, %8881 : tensor<1x320x15x20xbf16>
-    %8883 = stablehlo.reshape %8882 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %8884 = stablehlo.transpose %8883, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %8885 = stablehlo.convert %8884 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %8886 = stablehlo.convert %8885 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %8887 = stablehlo.reduce(%8886 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8888 = stablehlo.reshape %8887 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8889 = stablehlo.broadcast_in_dim %8888, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8890 = stablehlo.divide %8889, %3088 : tensor<1x300x1xf64>
-    %8891 = stablehlo.broadcast_in_dim %8886, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %8892 = stablehlo.broadcast_in_dim %8890, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %8893 = stablehlo.subtract %8891, %8892 : tensor<1x300x320xf64>
-    %8894 = stablehlo.multiply %8893, %8893 : tensor<1x300x320xf64>
-    %8895 = stablehlo.reduce(%8894 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %8896 = stablehlo.reshape %8895 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %8897 = stablehlo.broadcast_in_dim %8896, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %8898 = stablehlo.divide %8897, %3088 : tensor<1x300x1xf64>
-    %8899 = stablehlo.convert %8898 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %8900 = stablehlo.reduce(%8885 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %8901 = stablehlo.reshape %8900 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %8902 = stablehlo.broadcast_in_dim %8901, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8903 = stablehlo.divide %8902, %3102 : tensor<1x300x1xf32>
-    %8904 = stablehlo.broadcast_in_dim %8899, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %8905 = stablehlo.add %8904, %136 : tensor<1x300x1xf32>
-    %8906 = stablehlo.rsqrt %8905 : tensor<1x300x1xf32>
-    %8907 = stablehlo.broadcast_in_dim %8885, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8908 = stablehlo.broadcast_in_dim %8903, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8909 = stablehlo.subtract %8907, %8908 : tensor<1x300x320xf32>
-    %8910 = stablehlo.broadcast_in_dim %8909, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8911 = stablehlo.broadcast_in_dim %8906, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %8912 = stablehlo.multiply %8910, %8911 : tensor<1x300x320xf32>
-    %8913 = stablehlo.convert %arg395 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8914 = stablehlo.broadcast_in_dim %8912, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8915 = stablehlo.broadcast_in_dim %8913, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8916 = stablehlo.multiply %8914, %8915 : tensor<1x300x320xf32>
-    %8917 = stablehlo.convert %arg396 : (tensor<320xbf16>) -> tensor<320xf32>
-    %8918 = stablehlo.broadcast_in_dim %8916, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %8919 = stablehlo.broadcast_in_dim %8917, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %8920 = stablehlo.add %8918, %8919 : tensor<1x300x320xf32>
-    %8921 = stablehlo.convert %8920 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %8922 = stablehlo.reshape %8921 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %8923 = stablehlo.convert %8922 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %8924 = stablehlo.dot_general %8923, %arg868, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8925 = stablehlo.broadcast_in_dim %8924, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8926 = stablehlo.multiply %8925, %3126 : tensor<300x320xf32>
-    %8927 = stablehlo.broadcast_in_dim %8926, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8928 = stablehlo.broadcast_in_dim %arg869, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8929 = stablehlo.add %8927, %8928 : tensor<300x320xf32>
-    %8930 = stablehlo.convert %8929 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8931 = stablehlo.reshape %8930 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8932 = stablehlo.reshape %8931 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8933 = stablehlo.transpose %8932, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8934 = stablehlo.dot_general %8923, %arg870, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %8935 = stablehlo.broadcast_in_dim %8934, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8936 = stablehlo.multiply %8935, %3126 : tensor<300x320xf32>
-    %8937 = stablehlo.broadcast_in_dim %8936, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %8938 = stablehlo.broadcast_in_dim %arg871, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %8939 = stablehlo.add %8937, %8938 : tensor<300x320xf32>
-    %8940 = stablehlo.convert %8939 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %8941 = stablehlo.reshape %8940 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %8942 = stablehlo.reshape %8941 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %8943 = stablehlo.transpose %8942, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %8944 = stablehlo.transpose %8933, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %8945 = stablehlo.reshape %8875 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8946 = stablehlo.reshape %8944 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8947 = stablehlo.broadcast_in_dim %8946, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %8948 = stablehlo.dot_general %8945, %8947, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8949 = stablehlo.reshape %8948 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8950 = stablehlo.broadcast_in_dim %8949, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %8951 = stablehlo.divide %8950, %3152 : tensor<1x5x1200x300xbf16>
-    %8952 = stablehlo.convert %8951 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %8953 = stablehlo.reduce(%8952 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8954 = stablehlo.reshape %8953 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8955 = stablehlo.broadcast_in_dim %8952, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8956 = stablehlo.broadcast_in_dim %8954, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8957 = stablehlo.subtract %8955, %8956 : tensor<1x5x1200x300xf32>
-    %8958 = stablehlo.exponential %8957 : tensor<1x5x1200x300xf32>
-    %8959 = stablehlo.reduce(%8958 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %8960 = stablehlo.reshape %8959 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %8961 = stablehlo.broadcast_in_dim %8958, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %8962 = stablehlo.broadcast_in_dim %8960, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %8963 = stablehlo.divide %8961, %8962 : tensor<1x5x1200x300xf32>
-    %8964 = stablehlo.convert %8963 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %8965 = stablehlo.reshape %8964 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %8966 = stablehlo.reshape %8943 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8967 = stablehlo.broadcast_in_dim %8966, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %8968 = stablehlo.dot_general %8965, %8967, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %8969 = stablehlo.reshape %8968 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %8970 = stablehlo.transpose %8969, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %8971 = stablehlo.reshape %8970 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %8972 = stablehlo.reshape %8971 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %8973 = stablehlo.convert %8972 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %8974 = stablehlo.dot_general %8973, %arg872, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %8975 = stablehlo.broadcast_in_dim %8974, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8976 = stablehlo.multiply %8975, %3065 : tensor<1200x320xf32>
-    %8977 = stablehlo.broadcast_in_dim %8976, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %8978 = stablehlo.broadcast_in_dim %arg873, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %8979 = stablehlo.add %8977, %8978 : tensor<1200x320xf32>
-    %8980 = stablehlo.convert %8979 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %8981 = stablehlo.reshape %8980 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %8982 = stablehlo.add %8981, %8826 : tensor<1x1200x320xbf16>
-    %8983 = stablehlo.convert %8982 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %8984 = stablehlo.convert %8983 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %8985 = stablehlo.reduce(%8984 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8986 = stablehlo.reshape %8985 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8987 = stablehlo.broadcast_in_dim %8986, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8988 = stablehlo.divide %8987, %2987 : tensor<1x1200x1xf64>
-    %8989 = stablehlo.broadcast_in_dim %8984, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %8990 = stablehlo.broadcast_in_dim %8988, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %8991 = stablehlo.subtract %8989, %8990 : tensor<1x1200x320xf64>
-    %8992 = stablehlo.multiply %8991, %8991 : tensor<1x1200x320xf64>
-    %8993 = stablehlo.reduce(%8992 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %8994 = stablehlo.reshape %8993 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %8995 = stablehlo.broadcast_in_dim %8994, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %8996 = stablehlo.divide %8995, %2987 : tensor<1x1200x1xf64>
-    %8997 = stablehlo.convert %8996 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %8998 = stablehlo.reduce(%8983 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %8999 = stablehlo.reshape %8998 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9000 = stablehlo.broadcast_in_dim %8999, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9001 = stablehlo.divide %9000, %3003 : tensor<1x1200x1xf32>
-    %9002 = stablehlo.broadcast_in_dim %8997, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9003 = stablehlo.add %9002, %3006 : tensor<1x1200x1xf32>
-    %9004 = stablehlo.rsqrt %9003 : tensor<1x1200x1xf32>
-    %9005 = stablehlo.broadcast_in_dim %8983, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9006 = stablehlo.broadcast_in_dim %9001, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9007 = stablehlo.subtract %9005, %9006 : tensor<1x1200x320xf32>
-    %9008 = stablehlo.broadcast_in_dim %9007, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9009 = stablehlo.broadcast_in_dim %9004, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9010 = stablehlo.multiply %9008, %9009 : tensor<1x1200x320xf32>
-    %9011 = stablehlo.convert %arg397 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9012 = stablehlo.broadcast_in_dim %9010, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9013 = stablehlo.broadcast_in_dim %9011, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9014 = stablehlo.multiply %9012, %9013 : tensor<1x1200x320xf32>
-    %9015 = stablehlo.convert %arg398 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9016 = stablehlo.broadcast_in_dim %9014, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9017 = stablehlo.broadcast_in_dim %9015, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9018 = stablehlo.add %9016, %9017 : tensor<1x1200x320xf32>
-    %9019 = stablehlo.convert %9018 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9020 = stablehlo.reshape %9019 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9021 = stablehlo.convert %9020 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9022 = stablehlo.dot_general %9021, %arg874, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %9023 = stablehlo.broadcast_in_dim %9022, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9024 = stablehlo.multiply %9023, %3226 : tensor<1200x1280xf32>
-    %9025 = stablehlo.broadcast_in_dim %9024, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9026 = stablehlo.broadcast_in_dim %arg875, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %9027 = stablehlo.add %9025, %9026 : tensor<1200x1280xf32>
-    %9028 = stablehlo.convert %9027 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %9029 = stablehlo.reshape %9028 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %9030 = stablehlo.transpose %9029, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %9031 = stablehlo.reshape %9030 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9032 = stablehlo.convolution(%9031, %arg399) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9033 = stablehlo.reshape %arg400 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %9034 = stablehlo.broadcast_in_dim %9032, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9035 = stablehlo.broadcast_in_dim %9033, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9036 = stablehlo.add %9034, %9035 : tensor<1x1280x30x40xbf16>
-    %9037 = stablehlo.reshape %9036 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %9038 = stablehlo.transpose %9037, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %9039 = stablehlo.multiply %9038, %cst_42 : tensor<1x1200x1280xbf16>
-    %9040 = stablehlo.multiply %9038, %3243 : tensor<1x1200x1280xbf16>
-    %9041 = stablehlo.convert %9040 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %9042 = stablehlo.clamp %cst_43, %9041, %cst_44 : tensor<1x1200x1280xf32>
-    %9043 = stablehlo.multiply %9042, %9042 : tensor<1x1200x1280xf32>
-    %9044 = stablehlo.multiply %cst_45, %9043 : tensor<1x1200x1280xf32>
-    %9045 = stablehlo.add %9044, %cst_46 : tensor<1x1200x1280xf32>
-    %9046 = stablehlo.multiply %9045, %9043 : tensor<1x1200x1280xf32>
-    %9047 = stablehlo.add %9046, %cst_47 : tensor<1x1200x1280xf32>
-    %9048 = stablehlo.multiply %9047, %9043 : tensor<1x1200x1280xf32>
-    %9049 = stablehlo.add %9048, %cst_48 : tensor<1x1200x1280xf32>
-    %9050 = stablehlo.multiply %9049, %9043 : tensor<1x1200x1280xf32>
-    %9051 = stablehlo.add %9050, %cst_49 : tensor<1x1200x1280xf32>
-    %9052 = stablehlo.multiply %9051, %9043 : tensor<1x1200x1280xf32>
-    %9053 = stablehlo.add %9052, %cst_50 : tensor<1x1200x1280xf32>
-    %9054 = stablehlo.multiply %9053, %9043 : tensor<1x1200x1280xf32>
-    %9055 = stablehlo.add %9054, %cst_51 : tensor<1x1200x1280xf32>
-    %9056 = stablehlo.multiply %cst_52, %9043 : tensor<1x1200x1280xf32>
-    %9057 = stablehlo.add %9056, %cst_53 : tensor<1x1200x1280xf32>
-    %9058 = stablehlo.multiply %9057, %9043 : tensor<1x1200x1280xf32>
-    %9059 = stablehlo.add %9058, %cst_54 : tensor<1x1200x1280xf32>
-    %9060 = stablehlo.multiply %9059, %9043 : tensor<1x1200x1280xf32>
-    %9061 = stablehlo.add %9060, %cst_55 : tensor<1x1200x1280xf32>
-    %9062 = stablehlo.multiply %9061, %9043 : tensor<1x1200x1280xf32>
-    %9063 = stablehlo.add %9062, %cst_56 : tensor<1x1200x1280xf32>
-    %9064 = stablehlo.multiply %9042, %9055 : tensor<1x1200x1280xf32>
-    %9065 = stablehlo.divide %9064, %9063 : tensor<1x1200x1280xf32>
-    %9066 = stablehlo.clamp %cst_57, %9065, %cst_58 : tensor<1x1200x1280xf32>
-    %9067 = stablehlo.convert %9066 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %9068 = stablehlo.add %9067, %cst_40 : tensor<1x1200x1280xbf16>
-    %9069 = stablehlo.multiply %9068, %9039 : tensor<1x1200x1280xbf16>
-    %9070 = stablehlo.reshape %9069 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %9071 = stablehlo.dot_general %9070, %arg876, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %9072 = stablehlo.reshape %9071 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9073 = stablehlo.broadcast_in_dim %9072, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9074 = stablehlo.broadcast_in_dim %arg401, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %9075 = stablehlo.add %9073, %9074 : tensor<1x1200x320xbf16>
-    %9076 = stablehlo.reshape %9075 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9077 = stablehlo.reshape %9076 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9078 = stablehlo.add %9077, %8982 : tensor<1x1200x320xbf16>
-    %9079 = stablehlo.convert %9078 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %9080 = stablehlo.convert %9079 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %9081 = stablehlo.reduce(%9080 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9082 = stablehlo.reshape %9081 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9083 = stablehlo.broadcast_in_dim %9082, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9084 = stablehlo.divide %9083, %2987 : tensor<1x1200x1xf64>
-    %9085 = stablehlo.broadcast_in_dim %9080, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %9086 = stablehlo.broadcast_in_dim %9084, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %9087 = stablehlo.subtract %9085, %9086 : tensor<1x1200x320xf64>
-    %9088 = stablehlo.multiply %9087, %9087 : tensor<1x1200x320xf64>
-    %9089 = stablehlo.reduce(%9088 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9090 = stablehlo.reshape %9089 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9091 = stablehlo.broadcast_in_dim %9090, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9092 = stablehlo.divide %9091, %2987 : tensor<1x1200x1xf64>
-    %9093 = stablehlo.convert %9092 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %9094 = stablehlo.reduce(%9079 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %9095 = stablehlo.reshape %9094 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9096 = stablehlo.broadcast_in_dim %9095, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9097 = stablehlo.divide %9096, %3003 : tensor<1x1200x1xf32>
-    %9098 = stablehlo.broadcast_in_dim %9093, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9099 = stablehlo.add %9098, %3006 : tensor<1x1200x1xf32>
-    %9100 = stablehlo.rsqrt %9099 : tensor<1x1200x1xf32>
-    %9101 = stablehlo.broadcast_in_dim %9079, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9102 = stablehlo.broadcast_in_dim %9097, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9103 = stablehlo.subtract %9101, %9102 : tensor<1x1200x320xf32>
-    %9104 = stablehlo.broadcast_in_dim %9103, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9105 = stablehlo.broadcast_in_dim %9100, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9106 = stablehlo.multiply %9104, %9105 : tensor<1x1200x320xf32>
-    %9107 = stablehlo.convert %arg402 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9108 = stablehlo.broadcast_in_dim %9106, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9109 = stablehlo.broadcast_in_dim %9107, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9110 = stablehlo.multiply %9108, %9109 : tensor<1x1200x320xf32>
-    %9111 = stablehlo.convert %arg403 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9112 = stablehlo.broadcast_in_dim %9110, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9113 = stablehlo.broadcast_in_dim %9111, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9114 = stablehlo.add %9112, %9113 : tensor<1x1200x320xf32>
-    %9115 = stablehlo.convert %9114 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9116 = stablehlo.reshape %9115 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9117 = stablehlo.convert %9116 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9118 = stablehlo.dot_general %9117, %arg877, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %9119 = stablehlo.broadcast_in_dim %9118, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9120 = stablehlo.multiply %9119, %3065 : tensor<1200x320xf32>
-    %9121 = stablehlo.broadcast_in_dim %9120, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9122 = stablehlo.broadcast_in_dim %arg878, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %9123 = stablehlo.add %9121, %9122 : tensor<1200x320xf32>
-    %9124 = stablehlo.convert %9123 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %9125 = stablehlo.reshape %9124 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9126 = stablehlo.reshape %9125 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %9127 = stablehlo.transpose %9126, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %9128 = stablehlo.transpose %9115, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %9129 = stablehlo.reshape %9128 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %9130 = stablehlo.convolution(%9129, %arg404) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %9131 = stablehlo.reshape %arg405 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %9132 = stablehlo.broadcast_in_dim %9130, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %9133 = stablehlo.broadcast_in_dim %9131, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %9134 = stablehlo.add %9132, %9133 : tensor<1x320x15x20xbf16>
-    %9135 = stablehlo.reshape %9134 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %9136 = stablehlo.transpose %9135, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %9137 = stablehlo.convert %9136 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %9138 = stablehlo.convert %9137 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %9139 = stablehlo.reduce(%9138 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9140 = stablehlo.reshape %9139 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9141 = stablehlo.broadcast_in_dim %9140, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9142 = stablehlo.divide %9141, %3088 : tensor<1x300x1xf64>
-    %9143 = stablehlo.broadcast_in_dim %9138, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %9144 = stablehlo.broadcast_in_dim %9142, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %9145 = stablehlo.subtract %9143, %9144 : tensor<1x300x320xf64>
-    %9146 = stablehlo.multiply %9145, %9145 : tensor<1x300x320xf64>
-    %9147 = stablehlo.reduce(%9146 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9148 = stablehlo.reshape %9147 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9149 = stablehlo.broadcast_in_dim %9148, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9150 = stablehlo.divide %9149, %3088 : tensor<1x300x1xf64>
-    %9151 = stablehlo.convert %9150 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %9152 = stablehlo.reduce(%9137 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %9153 = stablehlo.reshape %9152 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %9154 = stablehlo.broadcast_in_dim %9153, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9155 = stablehlo.divide %9154, %3102 : tensor<1x300x1xf32>
-    %9156 = stablehlo.broadcast_in_dim %9151, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9157 = stablehlo.add %9156, %136 : tensor<1x300x1xf32>
-    %9158 = stablehlo.rsqrt %9157 : tensor<1x300x1xf32>
-    %9159 = stablehlo.broadcast_in_dim %9137, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9160 = stablehlo.broadcast_in_dim %9155, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %9161 = stablehlo.subtract %9159, %9160 : tensor<1x300x320xf32>
-    %9162 = stablehlo.broadcast_in_dim %9161, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9163 = stablehlo.broadcast_in_dim %9158, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %9164 = stablehlo.multiply %9162, %9163 : tensor<1x300x320xf32>
-    %9165 = stablehlo.convert %arg406 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9166 = stablehlo.broadcast_in_dim %9164, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9167 = stablehlo.broadcast_in_dim %9165, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %9168 = stablehlo.multiply %9166, %9167 : tensor<1x300x320xf32>
-    %9169 = stablehlo.convert %arg407 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9170 = stablehlo.broadcast_in_dim %9168, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9171 = stablehlo.broadcast_in_dim %9169, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %9172 = stablehlo.add %9170, %9171 : tensor<1x300x320xf32>
-    %9173 = stablehlo.convert %9172 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %9174 = stablehlo.reshape %9173 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %9175 = stablehlo.convert %9174 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %9176 = stablehlo.dot_general %9175, %arg879, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %9177 = stablehlo.broadcast_in_dim %9176, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9178 = stablehlo.multiply %9177, %3126 : tensor<300x320xf32>
-    %9179 = stablehlo.broadcast_in_dim %9178, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9180 = stablehlo.broadcast_in_dim %arg880, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %9181 = stablehlo.add %9179, %9180 : tensor<300x320xf32>
-    %9182 = stablehlo.convert %9181 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %9183 = stablehlo.reshape %9182 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %9184 = stablehlo.reshape %9183 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %9185 = stablehlo.transpose %9184, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %9186 = stablehlo.dot_general %9175, %arg881, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %9187 = stablehlo.broadcast_in_dim %9186, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9188 = stablehlo.multiply %9187, %3126 : tensor<300x320xf32>
-    %9189 = stablehlo.broadcast_in_dim %9188, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9190 = stablehlo.broadcast_in_dim %arg882, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %9191 = stablehlo.add %9189, %9190 : tensor<300x320xf32>
-    %9192 = stablehlo.convert %9191 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %9193 = stablehlo.reshape %9192 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %9194 = stablehlo.reshape %9193 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %9195 = stablehlo.transpose %9194, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %9196 = stablehlo.transpose %9185, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %9197 = stablehlo.reshape %9127 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %9198 = stablehlo.reshape %9196 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %9199 = stablehlo.broadcast_in_dim %9198, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %9200 = stablehlo.dot_general %9197, %9199, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %9201 = stablehlo.reshape %9200 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %9202 = stablehlo.broadcast_in_dim %9201, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %9203 = stablehlo.divide %9202, %3152 : tensor<1x5x1200x300xbf16>
-    %9204 = stablehlo.convert %9203 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %9205 = stablehlo.reduce(%9204 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %9206 = stablehlo.reshape %9205 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %9207 = stablehlo.broadcast_in_dim %9204, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %9208 = stablehlo.broadcast_in_dim %9206, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %9209 = stablehlo.subtract %9207, %9208 : tensor<1x5x1200x300xf32>
-    %9210 = stablehlo.exponential %9209 : tensor<1x5x1200x300xf32>
-    %9211 = stablehlo.reduce(%9210 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %9212 = stablehlo.reshape %9211 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %9213 = stablehlo.broadcast_in_dim %9210, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %9214 = stablehlo.broadcast_in_dim %9212, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %9215 = stablehlo.divide %9213, %9214 : tensor<1x5x1200x300xf32>
-    %9216 = stablehlo.convert %9215 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %9217 = stablehlo.reshape %9216 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %9218 = stablehlo.reshape %9195 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %9219 = stablehlo.broadcast_in_dim %9218, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %9220 = stablehlo.dot_general %9217, %9219, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %9221 = stablehlo.reshape %9220 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %9222 = stablehlo.transpose %9221, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %9223 = stablehlo.reshape %9222 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %9224 = stablehlo.reshape %9223 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9225 = stablehlo.convert %9224 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9226 = stablehlo.dot_general %9225, %arg883, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %9227 = stablehlo.broadcast_in_dim %9226, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9228 = stablehlo.multiply %9227, %3065 : tensor<1200x320xf32>
-    %9229 = stablehlo.broadcast_in_dim %9228, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9230 = stablehlo.broadcast_in_dim %arg884, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %9231 = stablehlo.add %9229, %9230 : tensor<1200x320xf32>
-    %9232 = stablehlo.convert %9231 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %9233 = stablehlo.reshape %9232 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9234 = stablehlo.add %9233, %9078 : tensor<1x1200x320xbf16>
-    %9235 = stablehlo.convert %9234 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %9236 = stablehlo.convert %9235 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %9237 = stablehlo.reduce(%9236 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9238 = stablehlo.reshape %9237 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9239 = stablehlo.broadcast_in_dim %9238, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9240 = stablehlo.divide %9239, %2987 : tensor<1x1200x1xf64>
-    %9241 = stablehlo.broadcast_in_dim %9236, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %9242 = stablehlo.broadcast_in_dim %9240, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %9243 = stablehlo.subtract %9241, %9242 : tensor<1x1200x320xf64>
-    %9244 = stablehlo.multiply %9243, %9243 : tensor<1x1200x320xf64>
-    %9245 = stablehlo.reduce(%9244 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9246 = stablehlo.reshape %9245 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9247 = stablehlo.broadcast_in_dim %9246, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9248 = stablehlo.divide %9247, %2987 : tensor<1x1200x1xf64>
-    %9249 = stablehlo.convert %9248 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %9250 = stablehlo.reduce(%9235 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %9251 = stablehlo.reshape %9250 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9252 = stablehlo.broadcast_in_dim %9251, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9253 = stablehlo.divide %9252, %3003 : tensor<1x1200x1xf32>
-    %9254 = stablehlo.broadcast_in_dim %9249, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9255 = stablehlo.add %9254, %3006 : tensor<1x1200x1xf32>
-    %9256 = stablehlo.rsqrt %9255 : tensor<1x1200x1xf32>
-    %9257 = stablehlo.broadcast_in_dim %9235, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9258 = stablehlo.broadcast_in_dim %9253, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9259 = stablehlo.subtract %9257, %9258 : tensor<1x1200x320xf32>
-    %9260 = stablehlo.broadcast_in_dim %9259, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9261 = stablehlo.broadcast_in_dim %9256, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9262 = stablehlo.multiply %9260, %9261 : tensor<1x1200x320xf32>
-    %9263 = stablehlo.convert %arg408 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9264 = stablehlo.broadcast_in_dim %9262, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9265 = stablehlo.broadcast_in_dim %9263, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9266 = stablehlo.multiply %9264, %9265 : tensor<1x1200x320xf32>
-    %9267 = stablehlo.convert %arg409 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9268 = stablehlo.broadcast_in_dim %9266, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9269 = stablehlo.broadcast_in_dim %9267, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9270 = stablehlo.add %9268, %9269 : tensor<1x1200x320xf32>
-    %9271 = stablehlo.convert %9270 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9272 = stablehlo.reshape %9271 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9273 = stablehlo.convert %9272 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9274 = stablehlo.dot_general %9273, %arg885, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %9275 = stablehlo.broadcast_in_dim %9274, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9276 = stablehlo.multiply %9275, %3226 : tensor<1200x1280xf32>
-    %9277 = stablehlo.broadcast_in_dim %9276, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9278 = stablehlo.broadcast_in_dim %arg886, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %9279 = stablehlo.add %9277, %9278 : tensor<1200x1280xf32>
-    %9280 = stablehlo.convert %9279 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %9281 = stablehlo.reshape %9280 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %9282 = stablehlo.transpose %9281, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %9283 = stablehlo.reshape %9282 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9284 = stablehlo.convolution(%9283, %arg410) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9285 = stablehlo.reshape %arg411 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %9286 = stablehlo.broadcast_in_dim %9284, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9287 = stablehlo.broadcast_in_dim %9285, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9288 = stablehlo.add %9286, %9287 : tensor<1x1280x30x40xbf16>
-    %9289 = stablehlo.reshape %9288 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %9290 = stablehlo.transpose %9289, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %9291 = stablehlo.multiply %9290, %cst_42 : tensor<1x1200x1280xbf16>
-    %9292 = stablehlo.multiply %9290, %3243 : tensor<1x1200x1280xbf16>
-    %9293 = stablehlo.convert %9292 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %9294 = stablehlo.clamp %cst_43, %9293, %cst_44 : tensor<1x1200x1280xf32>
-    %9295 = stablehlo.multiply %9294, %9294 : tensor<1x1200x1280xf32>
-    %9296 = stablehlo.multiply %cst_45, %9295 : tensor<1x1200x1280xf32>
-    %9297 = stablehlo.add %9296, %cst_46 : tensor<1x1200x1280xf32>
-    %9298 = stablehlo.multiply %9297, %9295 : tensor<1x1200x1280xf32>
-    %9299 = stablehlo.add %9298, %cst_47 : tensor<1x1200x1280xf32>
-    %9300 = stablehlo.multiply %9299, %9295 : tensor<1x1200x1280xf32>
-    %9301 = stablehlo.add %9300, %cst_48 : tensor<1x1200x1280xf32>
-    %9302 = stablehlo.multiply %9301, %9295 : tensor<1x1200x1280xf32>
-    %9303 = stablehlo.add %9302, %cst_49 : tensor<1x1200x1280xf32>
-    %9304 = stablehlo.multiply %9303, %9295 : tensor<1x1200x1280xf32>
-    %9305 = stablehlo.add %9304, %cst_50 : tensor<1x1200x1280xf32>
-    %9306 = stablehlo.multiply %9305, %9295 : tensor<1x1200x1280xf32>
-    %9307 = stablehlo.add %9306, %cst_51 : tensor<1x1200x1280xf32>
-    %9308 = stablehlo.multiply %cst_52, %9295 : tensor<1x1200x1280xf32>
-    %9309 = stablehlo.add %9308, %cst_53 : tensor<1x1200x1280xf32>
-    %9310 = stablehlo.multiply %9309, %9295 : tensor<1x1200x1280xf32>
-    %9311 = stablehlo.add %9310, %cst_54 : tensor<1x1200x1280xf32>
-    %9312 = stablehlo.multiply %9311, %9295 : tensor<1x1200x1280xf32>
-    %9313 = stablehlo.add %9312, %cst_55 : tensor<1x1200x1280xf32>
-    %9314 = stablehlo.multiply %9313, %9295 : tensor<1x1200x1280xf32>
-    %9315 = stablehlo.add %9314, %cst_56 : tensor<1x1200x1280xf32>
-    %9316 = stablehlo.multiply %9294, %9307 : tensor<1x1200x1280xf32>
-    %9317 = stablehlo.divide %9316, %9315 : tensor<1x1200x1280xf32>
-    %9318 = stablehlo.clamp %cst_57, %9317, %cst_58 : tensor<1x1200x1280xf32>
-    %9319 = stablehlo.convert %9318 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %9320 = stablehlo.add %9319, %cst_40 : tensor<1x1200x1280xbf16>
-    %9321 = stablehlo.multiply %9320, %9291 : tensor<1x1200x1280xbf16>
-    %9322 = stablehlo.reshape %9321 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %9323 = stablehlo.dot_general %9322, %arg887, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %9324 = stablehlo.reshape %9323 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9325 = stablehlo.broadcast_in_dim %9324, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9326 = stablehlo.broadcast_in_dim %arg412, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %9327 = stablehlo.add %9325, %9326 : tensor<1x1200x320xbf16>
-    %9328 = stablehlo.reshape %9327 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9329 = stablehlo.reshape %9328 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9330 = stablehlo.add %9329, %9234 : tensor<1x1200x320xbf16>
-    %9331 = stablehlo.convert %9330 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %9332 = stablehlo.convert %9331 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %9333 = stablehlo.reduce(%9332 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9334 = stablehlo.reshape %9333 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9335 = stablehlo.broadcast_in_dim %9334, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9336 = stablehlo.divide %9335, %2987 : tensor<1x1200x1xf64>
-    %9337 = stablehlo.broadcast_in_dim %9332, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %9338 = stablehlo.broadcast_in_dim %9336, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %9339 = stablehlo.subtract %9337, %9338 : tensor<1x1200x320xf64>
-    %9340 = stablehlo.multiply %9339, %9339 : tensor<1x1200x320xf64>
-    %9341 = stablehlo.reduce(%9340 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9342 = stablehlo.reshape %9341 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9343 = stablehlo.broadcast_in_dim %9342, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9344 = stablehlo.divide %9343, %2987 : tensor<1x1200x1xf64>
-    %9345 = stablehlo.convert %9344 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %9346 = stablehlo.reduce(%9331 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %9347 = stablehlo.reshape %9346 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9348 = stablehlo.broadcast_in_dim %9347, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9349 = stablehlo.divide %9348, %3003 : tensor<1x1200x1xf32>
-    %9350 = stablehlo.broadcast_in_dim %9345, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9351 = stablehlo.add %9350, %3006 : tensor<1x1200x1xf32>
-    %9352 = stablehlo.rsqrt %9351 : tensor<1x1200x1xf32>
-    %9353 = stablehlo.broadcast_in_dim %9331, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9354 = stablehlo.broadcast_in_dim %9349, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9355 = stablehlo.subtract %9353, %9354 : tensor<1x1200x320xf32>
-    %9356 = stablehlo.broadcast_in_dim %9355, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9357 = stablehlo.broadcast_in_dim %9352, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9358 = stablehlo.multiply %9356, %9357 : tensor<1x1200x320xf32>
-    %9359 = stablehlo.convert %arg413 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9360 = stablehlo.broadcast_in_dim %9358, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9361 = stablehlo.broadcast_in_dim %9359, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9362 = stablehlo.multiply %9360, %9361 : tensor<1x1200x320xf32>
-    %9363 = stablehlo.convert %arg414 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9364 = stablehlo.broadcast_in_dim %9362, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9365 = stablehlo.broadcast_in_dim %9363, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9366 = stablehlo.add %9364, %9365 : tensor<1x1200x320xf32>
-    %9367 = stablehlo.convert %9366 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9368 = stablehlo.reshape %9367 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9369 = stablehlo.convert %9368 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9370 = stablehlo.dot_general %9369, %arg888, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %9371 = stablehlo.broadcast_in_dim %9370, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9372 = stablehlo.multiply %9371, %3065 : tensor<1200x320xf32>
-    %9373 = stablehlo.broadcast_in_dim %9372, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9374 = stablehlo.broadcast_in_dim %arg889, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %9375 = stablehlo.add %9373, %9374 : tensor<1200x320xf32>
-    %9376 = stablehlo.convert %9375 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %9377 = stablehlo.reshape %9376 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9378 = stablehlo.reshape %9377 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %9379 = stablehlo.transpose %9378, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %9380 = stablehlo.transpose %9367, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %9381 = stablehlo.reshape %9380 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %9382 = stablehlo.convolution(%9381, %arg415) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %9383 = stablehlo.reshape %arg416 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %9384 = stablehlo.broadcast_in_dim %9382, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %9385 = stablehlo.broadcast_in_dim %9383, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %9386 = stablehlo.add %9384, %9385 : tensor<1x320x15x20xbf16>
-    %9387 = stablehlo.reshape %9386 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %9388 = stablehlo.transpose %9387, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %9389 = stablehlo.convert %9388 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %9390 = stablehlo.convert %9389 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %9391 = stablehlo.reduce(%9390 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9392 = stablehlo.reshape %9391 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9393 = stablehlo.broadcast_in_dim %9392, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9394 = stablehlo.divide %9393, %3088 : tensor<1x300x1xf64>
-    %9395 = stablehlo.broadcast_in_dim %9390, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %9396 = stablehlo.broadcast_in_dim %9394, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %9397 = stablehlo.subtract %9395, %9396 : tensor<1x300x320xf64>
-    %9398 = stablehlo.multiply %9397, %9397 : tensor<1x300x320xf64>
-    %9399 = stablehlo.reduce(%9398 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9400 = stablehlo.reshape %9399 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9401 = stablehlo.broadcast_in_dim %9400, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9402 = stablehlo.divide %9401, %3088 : tensor<1x300x1xf64>
-    %9403 = stablehlo.convert %9402 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %9404 = stablehlo.reduce(%9389 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %9405 = stablehlo.reshape %9404 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %9406 = stablehlo.broadcast_in_dim %9405, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9407 = stablehlo.divide %9406, %3102 : tensor<1x300x1xf32>
-    %9408 = stablehlo.broadcast_in_dim %9403, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9409 = stablehlo.add %9408, %136 : tensor<1x300x1xf32>
-    %9410 = stablehlo.rsqrt %9409 : tensor<1x300x1xf32>
-    %9411 = stablehlo.broadcast_in_dim %9389, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9412 = stablehlo.broadcast_in_dim %9407, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %9413 = stablehlo.subtract %9411, %9412 : tensor<1x300x320xf32>
-    %9414 = stablehlo.broadcast_in_dim %9413, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9415 = stablehlo.broadcast_in_dim %9410, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %9416 = stablehlo.multiply %9414, %9415 : tensor<1x300x320xf32>
-    %9417 = stablehlo.convert %arg417 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9418 = stablehlo.broadcast_in_dim %9416, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9419 = stablehlo.broadcast_in_dim %9417, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %9420 = stablehlo.multiply %9418, %9419 : tensor<1x300x320xf32>
-    %9421 = stablehlo.convert %arg418 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9422 = stablehlo.broadcast_in_dim %9420, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9423 = stablehlo.broadcast_in_dim %9421, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %9424 = stablehlo.add %9422, %9423 : tensor<1x300x320xf32>
-    %9425 = stablehlo.convert %9424 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %9426 = stablehlo.reshape %9425 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %9427 = stablehlo.convert %9426 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %9428 = stablehlo.dot_general %9427, %arg890, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %9429 = stablehlo.broadcast_in_dim %9428, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9430 = stablehlo.multiply %9429, %3126 : tensor<300x320xf32>
-    %9431 = stablehlo.broadcast_in_dim %9430, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9432 = stablehlo.broadcast_in_dim %arg891, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %9433 = stablehlo.add %9431, %9432 : tensor<300x320xf32>
-    %9434 = stablehlo.convert %9433 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %9435 = stablehlo.reshape %9434 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %9436 = stablehlo.reshape %9435 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %9437 = stablehlo.transpose %9436, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %9438 = stablehlo.dot_general %9427, %arg892, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %9439 = stablehlo.broadcast_in_dim %9438, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9440 = stablehlo.multiply %9439, %3126 : tensor<300x320xf32>
-    %9441 = stablehlo.broadcast_in_dim %9440, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9442 = stablehlo.broadcast_in_dim %arg893, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %9443 = stablehlo.add %9441, %9442 : tensor<300x320xf32>
-    %9444 = stablehlo.convert %9443 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %9445 = stablehlo.reshape %9444 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %9446 = stablehlo.reshape %9445 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %9447 = stablehlo.transpose %9446, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %9448 = stablehlo.transpose %9437, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %9449 = stablehlo.reshape %9379 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %9450 = stablehlo.reshape %9448 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %9451 = stablehlo.broadcast_in_dim %9450, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %9452 = stablehlo.dot_general %9449, %9451, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %9453 = stablehlo.reshape %9452 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %9454 = stablehlo.broadcast_in_dim %9453, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %9455 = stablehlo.divide %9454, %3152 : tensor<1x5x1200x300xbf16>
-    %9456 = stablehlo.convert %9455 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %9457 = stablehlo.reduce(%9456 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %9458 = stablehlo.reshape %9457 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %9459 = stablehlo.broadcast_in_dim %9456, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %9460 = stablehlo.broadcast_in_dim %9458, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %9461 = stablehlo.subtract %9459, %9460 : tensor<1x5x1200x300xf32>
-    %9462 = stablehlo.exponential %9461 : tensor<1x5x1200x300xf32>
-    %9463 = stablehlo.reduce(%9462 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %9464 = stablehlo.reshape %9463 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %9465 = stablehlo.broadcast_in_dim %9462, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %9466 = stablehlo.broadcast_in_dim %9464, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %9467 = stablehlo.divide %9465, %9466 : tensor<1x5x1200x300xf32>
-    %9468 = stablehlo.convert %9467 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %9469 = stablehlo.reshape %9468 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %9470 = stablehlo.reshape %9447 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %9471 = stablehlo.broadcast_in_dim %9470, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %9472 = stablehlo.dot_general %9469, %9471, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %9473 = stablehlo.reshape %9472 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %9474 = stablehlo.transpose %9473, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %9475 = stablehlo.reshape %9474 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %9476 = stablehlo.reshape %9475 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9477 = stablehlo.convert %9476 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9478 = stablehlo.dot_general %9477, %arg894, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %9479 = stablehlo.broadcast_in_dim %9478, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9480 = stablehlo.multiply %9479, %3065 : tensor<1200x320xf32>
-    %9481 = stablehlo.broadcast_in_dim %9480, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9482 = stablehlo.broadcast_in_dim %arg895, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %9483 = stablehlo.add %9481, %9482 : tensor<1200x320xf32>
-    %9484 = stablehlo.convert %9483 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %9485 = stablehlo.reshape %9484 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9486 = stablehlo.add %9485, %9330 : tensor<1x1200x320xbf16>
-    %9487 = stablehlo.convert %9486 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %9488 = stablehlo.convert %9487 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %9489 = stablehlo.reduce(%9488 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9490 = stablehlo.reshape %9489 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9491 = stablehlo.broadcast_in_dim %9490, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9492 = stablehlo.divide %9491, %2987 : tensor<1x1200x1xf64>
-    %9493 = stablehlo.broadcast_in_dim %9488, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %9494 = stablehlo.broadcast_in_dim %9492, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %9495 = stablehlo.subtract %9493, %9494 : tensor<1x1200x320xf64>
-    %9496 = stablehlo.multiply %9495, %9495 : tensor<1x1200x320xf64>
-    %9497 = stablehlo.reduce(%9496 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9498 = stablehlo.reshape %9497 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9499 = stablehlo.broadcast_in_dim %9498, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9500 = stablehlo.divide %9499, %2987 : tensor<1x1200x1xf64>
-    %9501 = stablehlo.convert %9500 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %9502 = stablehlo.reduce(%9487 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %9503 = stablehlo.reshape %9502 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9504 = stablehlo.broadcast_in_dim %9503, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9505 = stablehlo.divide %9504, %3003 : tensor<1x1200x1xf32>
-    %9506 = stablehlo.broadcast_in_dim %9501, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9507 = stablehlo.add %9506, %3006 : tensor<1x1200x1xf32>
-    %9508 = stablehlo.rsqrt %9507 : tensor<1x1200x1xf32>
-    %9509 = stablehlo.broadcast_in_dim %9487, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9510 = stablehlo.broadcast_in_dim %9505, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9511 = stablehlo.subtract %9509, %9510 : tensor<1x1200x320xf32>
-    %9512 = stablehlo.broadcast_in_dim %9511, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9513 = stablehlo.broadcast_in_dim %9508, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9514 = stablehlo.multiply %9512, %9513 : tensor<1x1200x320xf32>
-    %9515 = stablehlo.convert %arg419 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9516 = stablehlo.broadcast_in_dim %9514, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9517 = stablehlo.broadcast_in_dim %9515, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9518 = stablehlo.multiply %9516, %9517 : tensor<1x1200x320xf32>
-    %9519 = stablehlo.convert %arg420 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9520 = stablehlo.broadcast_in_dim %9518, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9521 = stablehlo.broadcast_in_dim %9519, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9522 = stablehlo.add %9520, %9521 : tensor<1x1200x320xf32>
-    %9523 = stablehlo.convert %9522 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9524 = stablehlo.reshape %9523 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9525 = stablehlo.convert %9524 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9526 = stablehlo.dot_general %9525, %arg896, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %9527 = stablehlo.broadcast_in_dim %9526, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9528 = stablehlo.multiply %9527, %3226 : tensor<1200x1280xf32>
-    %9529 = stablehlo.broadcast_in_dim %9528, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9530 = stablehlo.broadcast_in_dim %arg897, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %9531 = stablehlo.add %9529, %9530 : tensor<1200x1280xf32>
-    %9532 = stablehlo.convert %9531 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %9533 = stablehlo.reshape %9532 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %9534 = stablehlo.transpose %9533, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %9535 = stablehlo.reshape %9534 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9536 = stablehlo.convolution(%9535, %arg421) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9537 = stablehlo.reshape %arg422 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %9538 = stablehlo.broadcast_in_dim %9536, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9539 = stablehlo.broadcast_in_dim %9537, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9540 = stablehlo.add %9538, %9539 : tensor<1x1280x30x40xbf16>
-    %9541 = stablehlo.reshape %9540 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %9542 = stablehlo.transpose %9541, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %9543 = stablehlo.multiply %9542, %cst_42 : tensor<1x1200x1280xbf16>
-    %9544 = stablehlo.multiply %9542, %3243 : tensor<1x1200x1280xbf16>
-    %9545 = stablehlo.convert %9544 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %9546 = stablehlo.clamp %cst_43, %9545, %cst_44 : tensor<1x1200x1280xf32>
-    %9547 = stablehlo.multiply %9546, %9546 : tensor<1x1200x1280xf32>
-    %9548 = stablehlo.multiply %cst_45, %9547 : tensor<1x1200x1280xf32>
-    %9549 = stablehlo.add %9548, %cst_46 : tensor<1x1200x1280xf32>
-    %9550 = stablehlo.multiply %9549, %9547 : tensor<1x1200x1280xf32>
-    %9551 = stablehlo.add %9550, %cst_47 : tensor<1x1200x1280xf32>
-    %9552 = stablehlo.multiply %9551, %9547 : tensor<1x1200x1280xf32>
-    %9553 = stablehlo.add %9552, %cst_48 : tensor<1x1200x1280xf32>
-    %9554 = stablehlo.multiply %9553, %9547 : tensor<1x1200x1280xf32>
-    %9555 = stablehlo.add %9554, %cst_49 : tensor<1x1200x1280xf32>
-    %9556 = stablehlo.multiply %9555, %9547 : tensor<1x1200x1280xf32>
-    %9557 = stablehlo.add %9556, %cst_50 : tensor<1x1200x1280xf32>
-    %9558 = stablehlo.multiply %9557, %9547 : tensor<1x1200x1280xf32>
-    %9559 = stablehlo.add %9558, %cst_51 : tensor<1x1200x1280xf32>
-    %9560 = stablehlo.multiply %cst_52, %9547 : tensor<1x1200x1280xf32>
-    %9561 = stablehlo.add %9560, %cst_53 : tensor<1x1200x1280xf32>
-    %9562 = stablehlo.multiply %9561, %9547 : tensor<1x1200x1280xf32>
-    %9563 = stablehlo.add %9562, %cst_54 : tensor<1x1200x1280xf32>
-    %9564 = stablehlo.multiply %9563, %9547 : tensor<1x1200x1280xf32>
-    %9565 = stablehlo.add %9564, %cst_55 : tensor<1x1200x1280xf32>
-    %9566 = stablehlo.multiply %9565, %9547 : tensor<1x1200x1280xf32>
-    %9567 = stablehlo.add %9566, %cst_56 : tensor<1x1200x1280xf32>
-    %9568 = stablehlo.multiply %9546, %9559 : tensor<1x1200x1280xf32>
-    %9569 = stablehlo.divide %9568, %9567 : tensor<1x1200x1280xf32>
-    %9570 = stablehlo.clamp %cst_57, %9569, %cst_58 : tensor<1x1200x1280xf32>
-    %9571 = stablehlo.convert %9570 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %9572 = stablehlo.add %9571, %cst_40 : tensor<1x1200x1280xbf16>
-    %9573 = stablehlo.multiply %9572, %9543 : tensor<1x1200x1280xbf16>
-    %9574 = stablehlo.reshape %9573 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %9575 = stablehlo.dot_general %9574, %arg898, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %9576 = stablehlo.reshape %9575 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9577 = stablehlo.broadcast_in_dim %9576, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9578 = stablehlo.broadcast_in_dim %arg423, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %9579 = stablehlo.add %9577, %9578 : tensor<1x1200x320xbf16>
-    %9580 = stablehlo.reshape %9579 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9581 = stablehlo.reshape %9580 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9582 = stablehlo.add %9581, %9486 : tensor<1x1200x320xbf16>
-    %9583 = stablehlo.convert %9582 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %9584 = stablehlo.convert %9583 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %9585 = stablehlo.reduce(%9584 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9586 = stablehlo.reshape %9585 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9587 = stablehlo.broadcast_in_dim %9586, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9588 = stablehlo.divide %9587, %2987 : tensor<1x1200x1xf64>
-    %9589 = stablehlo.broadcast_in_dim %9584, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %9590 = stablehlo.broadcast_in_dim %9588, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %9591 = stablehlo.subtract %9589, %9590 : tensor<1x1200x320xf64>
-    %9592 = stablehlo.multiply %9591, %9591 : tensor<1x1200x320xf64>
-    %9593 = stablehlo.reduce(%9592 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9594 = stablehlo.reshape %9593 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9595 = stablehlo.broadcast_in_dim %9594, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9596 = stablehlo.divide %9595, %2987 : tensor<1x1200x1xf64>
-    %9597 = stablehlo.convert %9596 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %9598 = stablehlo.reduce(%9583 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %9599 = stablehlo.reshape %9598 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9600 = stablehlo.broadcast_in_dim %9599, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9601 = stablehlo.divide %9600, %3003 : tensor<1x1200x1xf32>
-    %9602 = stablehlo.broadcast_in_dim %9597, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9603 = stablehlo.add %9602, %3006 : tensor<1x1200x1xf32>
-    %9604 = stablehlo.rsqrt %9603 : tensor<1x1200x1xf32>
-    %9605 = stablehlo.broadcast_in_dim %9583, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9606 = stablehlo.broadcast_in_dim %9601, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9607 = stablehlo.subtract %9605, %9606 : tensor<1x1200x320xf32>
-    %9608 = stablehlo.broadcast_in_dim %9607, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9609 = stablehlo.broadcast_in_dim %9604, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9610 = stablehlo.multiply %9608, %9609 : tensor<1x1200x320xf32>
-    %9611 = stablehlo.convert %arg424 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9612 = stablehlo.broadcast_in_dim %9610, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9613 = stablehlo.broadcast_in_dim %9611, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9614 = stablehlo.multiply %9612, %9613 : tensor<1x1200x320xf32>
-    %9615 = stablehlo.convert %arg425 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9616 = stablehlo.broadcast_in_dim %9614, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9617 = stablehlo.broadcast_in_dim %9615, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9618 = stablehlo.add %9616, %9617 : tensor<1x1200x320xf32>
-    %9619 = stablehlo.convert %9618 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9620 = stablehlo.reshape %9619 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9621 = stablehlo.convert %9620 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9622 = stablehlo.dot_general %9621, %arg899, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %9623 = stablehlo.broadcast_in_dim %9622, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9624 = stablehlo.multiply %9623, %3065 : tensor<1200x320xf32>
-    %9625 = stablehlo.broadcast_in_dim %9624, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9626 = stablehlo.broadcast_in_dim %arg900, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %9627 = stablehlo.add %9625, %9626 : tensor<1200x320xf32>
-    %9628 = stablehlo.convert %9627 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %9629 = stablehlo.reshape %9628 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9630 = stablehlo.reshape %9629 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x5x64xbf16>
-    %9631 = stablehlo.transpose %9630, dims = [0, 2, 1, 3] : (tensor<1x1200x5x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %9632 = stablehlo.transpose %9619, dims = [0, 2, 1] : (tensor<1x1200x320xbf16>) -> tensor<1x320x1200xbf16>
-    %9633 = stablehlo.reshape %9632 : (tensor<1x320x1200xbf16>) -> tensor<1x320x30x40xbf16>
-    %9634 = stablehlo.convolution(%9633, %arg426) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<320x320x2x2xbf16>) -> tensor<1x320x15x20xbf16>
-    %9635 = stablehlo.reshape %arg427 : (tensor<320xbf16>) -> tensor<320x1x1xbf16>
-    %9636 = stablehlo.broadcast_in_dim %9634, dims = [0, 1, 2, 3] : (tensor<1x320x15x20xbf16>) -> tensor<1x320x15x20xbf16>
-    %9637 = stablehlo.broadcast_in_dim %9635, dims = [1, 2, 3] : (tensor<320x1x1xbf16>) -> tensor<1x320x15x20xbf16>
-    %9638 = stablehlo.add %9636, %9637 : tensor<1x320x15x20xbf16>
-    %9639 = stablehlo.reshape %9638 : (tensor<1x320x15x20xbf16>) -> tensor<1x320x300xbf16>
-    %9640 = stablehlo.transpose %9639, dims = [0, 2, 1] : (tensor<1x320x300xbf16>) -> tensor<1x300x320xbf16>
-    %9641 = stablehlo.convert %9640 : (tensor<1x300x320xbf16>) -> tensor<1x300x320xf32>
-    %9642 = stablehlo.convert %9641 : (tensor<1x300x320xf32>) -> tensor<1x300x320xf64>
-    %9643 = stablehlo.reduce(%9642 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9644 = stablehlo.reshape %9643 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9645 = stablehlo.broadcast_in_dim %9644, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9646 = stablehlo.divide %9645, %3088 : tensor<1x300x1xf64>
-    %9647 = stablehlo.broadcast_in_dim %9642, dims = [0, 1, 2] : (tensor<1x300x320xf64>) -> tensor<1x300x320xf64>
-    %9648 = stablehlo.broadcast_in_dim %9646, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x320xf64>
-    %9649 = stablehlo.subtract %9647, %9648 : tensor<1x300x320xf64>
-    %9650 = stablehlo.multiply %9649, %9649 : tensor<1x300x320xf64>
-    %9651 = stablehlo.reduce(%9650 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9652 = stablehlo.reshape %9651 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9653 = stablehlo.broadcast_in_dim %9652, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9654 = stablehlo.divide %9653, %3088 : tensor<1x300x1xf64>
-    %9655 = stablehlo.convert %9654 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %9656 = stablehlo.reduce(%9641 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x320xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %9657 = stablehlo.reshape %9656 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %9658 = stablehlo.broadcast_in_dim %9657, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9659 = stablehlo.divide %9658, %3102 : tensor<1x300x1xf32>
-    %9660 = stablehlo.broadcast_in_dim %9655, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9661 = stablehlo.add %9660, %136 : tensor<1x300x1xf32>
-    %9662 = stablehlo.rsqrt %9661 : tensor<1x300x1xf32>
-    %9663 = stablehlo.broadcast_in_dim %9641, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9664 = stablehlo.broadcast_in_dim %9659, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %9665 = stablehlo.subtract %9663, %9664 : tensor<1x300x320xf32>
-    %9666 = stablehlo.broadcast_in_dim %9665, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9667 = stablehlo.broadcast_in_dim %9662, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x320xf32>
-    %9668 = stablehlo.multiply %9666, %9667 : tensor<1x300x320xf32>
-    %9669 = stablehlo.convert %arg428 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9670 = stablehlo.broadcast_in_dim %9668, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9671 = stablehlo.broadcast_in_dim %9669, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %9672 = stablehlo.multiply %9670, %9671 : tensor<1x300x320xf32>
-    %9673 = stablehlo.convert %arg429 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9674 = stablehlo.broadcast_in_dim %9672, dims = [0, 1, 2] : (tensor<1x300x320xf32>) -> tensor<1x300x320xf32>
-    %9675 = stablehlo.broadcast_in_dim %9673, dims = [2] : (tensor<320xf32>) -> tensor<1x300x320xf32>
-    %9676 = stablehlo.add %9674, %9675 : tensor<1x300x320xf32>
-    %9677 = stablehlo.convert %9676 : (tensor<1x300x320xf32>) -> tensor<1x300x320xbf16>
-    %9678 = stablehlo.reshape %9677 : (tensor<1x300x320xbf16>) -> tensor<300x320xbf16>
-    %9679 = stablehlo.convert %9678 : (tensor<300x320xbf16>) -> tensor<300x320xf32>
-    %9680 = stablehlo.dot_general %9679, %arg901, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %9681 = stablehlo.broadcast_in_dim %9680, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9682 = stablehlo.multiply %9681, %3126 : tensor<300x320xf32>
-    %9683 = stablehlo.broadcast_in_dim %9682, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9684 = stablehlo.broadcast_in_dim %arg902, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %9685 = stablehlo.add %9683, %9684 : tensor<300x320xf32>
-    %9686 = stablehlo.convert %9685 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %9687 = stablehlo.reshape %9686 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %9688 = stablehlo.reshape %9687 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %9689 = stablehlo.transpose %9688, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %9690 = stablehlo.dot_general %9679, %arg903, contracting_dims = [1] x [0] : (tensor<300x320xf32>, tensor<320x320xf32>) -> tensor<300x320xf32>
-    %9691 = stablehlo.broadcast_in_dim %9690, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9692 = stablehlo.multiply %9691, %3126 : tensor<300x320xf32>
-    %9693 = stablehlo.broadcast_in_dim %9692, dims = [0, 1] : (tensor<300x320xf32>) -> tensor<300x320xf32>
-    %9694 = stablehlo.broadcast_in_dim %arg904, dims = [1] : (tensor<320xf32>) -> tensor<300x320xf32>
-    %9695 = stablehlo.add %9693, %9694 : tensor<300x320xf32>
-    %9696 = stablehlo.convert %9695 : (tensor<300x320xf32>) -> tensor<300x320xbf16>
-    %9697 = stablehlo.reshape %9696 : (tensor<300x320xbf16>) -> tensor<1x300x320xbf16>
-    %9698 = stablehlo.reshape %9697 : (tensor<1x300x320xbf16>) -> tensor<1x300x5x64xbf16>
-    %9699 = stablehlo.transpose %9698, dims = [0, 2, 1, 3] : (tensor<1x300x5x64xbf16>) -> tensor<1x5x300x64xbf16>
-    %9700 = stablehlo.transpose %9689, dims = [0, 1, 3, 2] : (tensor<1x5x300x64xbf16>) -> tensor<1x5x64x300xbf16>
-    %9701 = stablehlo.reshape %9631 : (tensor<1x5x1200x64xbf16>) -> tensor<5x1200x64xbf16>
-    %9702 = stablehlo.reshape %9700 : (tensor<1x5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %9703 = stablehlo.broadcast_in_dim %9702, dims = [0, 1, 2] : (tensor<5x64x300xbf16>) -> tensor<5x64x300xbf16>
-    %9704 = stablehlo.dot_general %9701, %9703, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x64xbf16>, tensor<5x64x300xbf16>) -> tensor<5x1200x300xbf16>
-    %9705 = stablehlo.reshape %9704 : (tensor<5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %9706 = stablehlo.broadcast_in_dim %9705, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xbf16>
-    %9707 = stablehlo.divide %9706, %3152 : tensor<1x5x1200x300xbf16>
-    %9708 = stablehlo.convert %9707 : (tensor<1x5x1200x300xbf16>) -> tensor<1x5x1200x300xf32>
-    %9709 = stablehlo.reduce(%9708 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %9710 = stablehlo.reshape %9709 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %9711 = stablehlo.broadcast_in_dim %9708, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %9712 = stablehlo.broadcast_in_dim %9710, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %9713 = stablehlo.subtract %9711, %9712 : tensor<1x5x1200x300xf32>
-    %9714 = stablehlo.exponential %9713 : tensor<1x5x1200x300xf32>
-    %9715 = stablehlo.reduce(%9714 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1200x300xf32>, tensor<f32>) -> tensor<1x5x1200xf32>
-    %9716 = stablehlo.reshape %9715 : (tensor<1x5x1200xf32>) -> tensor<1x5x1200x1xf32>
-    %9717 = stablehlo.broadcast_in_dim %9714, dims = [0, 1, 2, 3] : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xf32>
-    %9718 = stablehlo.broadcast_in_dim %9716, dims = [0, 1, 2, 3] : (tensor<1x5x1200x1xf32>) -> tensor<1x5x1200x300xf32>
-    %9719 = stablehlo.divide %9717, %9718 : tensor<1x5x1200x300xf32>
-    %9720 = stablehlo.convert %9719 : (tensor<1x5x1200x300xf32>) -> tensor<1x5x1200x300xbf16>
-    %9721 = stablehlo.reshape %9720 : (tensor<1x5x1200x300xbf16>) -> tensor<5x1200x300xbf16>
-    %9722 = stablehlo.reshape %9699 : (tensor<1x5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %9723 = stablehlo.broadcast_in_dim %9722, dims = [0, 1, 2] : (tensor<5x300x64xbf16>) -> tensor<5x300x64xbf16>
-    %9724 = stablehlo.dot_general %9721, %9723, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1200x300xbf16>, tensor<5x300x64xbf16>) -> tensor<5x1200x64xbf16>
-    %9725 = stablehlo.reshape %9724 : (tensor<5x1200x64xbf16>) -> tensor<1x5x1200x64xbf16>
-    %9726 = stablehlo.transpose %9725, dims = [0, 2, 1, 3] : (tensor<1x5x1200x64xbf16>) -> tensor<1x1200x5x64xbf16>
-    %9727 = stablehlo.reshape %9726 : (tensor<1x1200x5x64xbf16>) -> tensor<1x1200x320xbf16>
-    %9728 = stablehlo.reshape %9727 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9729 = stablehlo.convert %9728 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9730 = stablehlo.dot_general %9729, %arg905, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x320xf32>) -> tensor<1200x320xf32>
-    %9731 = stablehlo.broadcast_in_dim %9730, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9732 = stablehlo.multiply %9731, %3065 : tensor<1200x320xf32>
-    %9733 = stablehlo.broadcast_in_dim %9732, dims = [0, 1] : (tensor<1200x320xf32>) -> tensor<1200x320xf32>
-    %9734 = stablehlo.broadcast_in_dim %arg906, dims = [1] : (tensor<320xf32>) -> tensor<1200x320xf32>
-    %9735 = stablehlo.add %9733, %9734 : tensor<1200x320xf32>
-    %9736 = stablehlo.convert %9735 : (tensor<1200x320xf32>) -> tensor<1200x320xbf16>
-    %9737 = stablehlo.reshape %9736 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9738 = stablehlo.add %9737, %9582 : tensor<1x1200x320xbf16>
-    %9739 = stablehlo.convert %9738 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %9740 = stablehlo.convert %9739 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %9741 = stablehlo.reduce(%9740 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9742 = stablehlo.reshape %9741 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9743 = stablehlo.broadcast_in_dim %9742, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9744 = stablehlo.divide %9743, %2987 : tensor<1x1200x1xf64>
-    %9745 = stablehlo.broadcast_in_dim %9740, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %9746 = stablehlo.broadcast_in_dim %9744, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %9747 = stablehlo.subtract %9745, %9746 : tensor<1x1200x320xf64>
-    %9748 = stablehlo.multiply %9747, %9747 : tensor<1x1200x320xf64>
-    %9749 = stablehlo.reduce(%9748 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9750 = stablehlo.reshape %9749 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9751 = stablehlo.broadcast_in_dim %9750, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9752 = stablehlo.divide %9751, %2987 : tensor<1x1200x1xf64>
-    %9753 = stablehlo.convert %9752 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %9754 = stablehlo.reduce(%9739 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %9755 = stablehlo.reshape %9754 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9756 = stablehlo.broadcast_in_dim %9755, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9757 = stablehlo.divide %9756, %3003 : tensor<1x1200x1xf32>
-    %9758 = stablehlo.broadcast_in_dim %9753, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9759 = stablehlo.add %9758, %3006 : tensor<1x1200x1xf32>
-    %9760 = stablehlo.rsqrt %9759 : tensor<1x1200x1xf32>
-    %9761 = stablehlo.broadcast_in_dim %9739, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9762 = stablehlo.broadcast_in_dim %9757, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9763 = stablehlo.subtract %9761, %9762 : tensor<1x1200x320xf32>
-    %9764 = stablehlo.broadcast_in_dim %9763, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9765 = stablehlo.broadcast_in_dim %9760, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9766 = stablehlo.multiply %9764, %9765 : tensor<1x1200x320xf32>
-    %9767 = stablehlo.convert %arg430 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9768 = stablehlo.broadcast_in_dim %9766, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9769 = stablehlo.broadcast_in_dim %9767, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9770 = stablehlo.multiply %9768, %9769 : tensor<1x1200x320xf32>
-    %9771 = stablehlo.convert %arg431 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9772 = stablehlo.broadcast_in_dim %9770, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9773 = stablehlo.broadcast_in_dim %9771, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9774 = stablehlo.add %9772, %9773 : tensor<1x1200x320xf32>
-    %9775 = stablehlo.convert %9774 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9776 = stablehlo.reshape %9775 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9777 = stablehlo.convert %9776 : (tensor<1200x320xbf16>) -> tensor<1200x320xf32>
-    %9778 = stablehlo.dot_general %9777, %arg907, contracting_dims = [1] x [0] : (tensor<1200x320xf32>, tensor<320x1280xf32>) -> tensor<1200x1280xf32>
-    %9779 = stablehlo.broadcast_in_dim %9778, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9780 = stablehlo.multiply %9779, %3226 : tensor<1200x1280xf32>
-    %9781 = stablehlo.broadcast_in_dim %9780, dims = [0, 1] : (tensor<1200x1280xf32>) -> tensor<1200x1280xf32>
-    %9782 = stablehlo.broadcast_in_dim %arg908, dims = [1] : (tensor<1280xf32>) -> tensor<1200x1280xf32>
-    %9783 = stablehlo.add %9781, %9782 : tensor<1200x1280xf32>
-    %9784 = stablehlo.convert %9783 : (tensor<1200x1280xf32>) -> tensor<1200x1280xbf16>
-    %9785 = stablehlo.reshape %9784 : (tensor<1200x1280xbf16>) -> tensor<1x1200x1280xbf16>
-    %9786 = stablehlo.transpose %9785, dims = [0, 2, 1] : (tensor<1x1200x1280xbf16>) -> tensor<1x1280x1200xbf16>
-    %9787 = stablehlo.reshape %9786 : (tensor<1x1280x1200xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9788 = stablehlo.convolution(%9787, %arg432) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1280 : i64} : (tensor<1x1280x30x40xbf16>, tensor<1280x1x3x3xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9789 = stablehlo.reshape %arg433 : (tensor<1280xbf16>) -> tensor<1280x1x1xbf16>
-    %9790 = stablehlo.broadcast_in_dim %9788, dims = [0, 1, 2, 3] : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9791 = stablehlo.broadcast_in_dim %9789, dims = [1, 2, 3] : (tensor<1280x1x1xbf16>) -> tensor<1x1280x30x40xbf16>
-    %9792 = stablehlo.add %9790, %9791 : tensor<1x1280x30x40xbf16>
-    %9793 = stablehlo.reshape %9792 : (tensor<1x1280x30x40xbf16>) -> tensor<1x1280x1200xbf16>
-    %9794 = stablehlo.transpose %9793, dims = [0, 2, 1] : (tensor<1x1280x1200xbf16>) -> tensor<1x1200x1280xbf16>
-    %9795 = stablehlo.multiply %9794, %cst_42 : tensor<1x1200x1280xbf16>
-    %9796 = stablehlo.multiply %9794, %3243 : tensor<1x1200x1280xbf16>
-    %9797 = stablehlo.convert %9796 : (tensor<1x1200x1280xbf16>) -> tensor<1x1200x1280xf32>
-    %9798 = stablehlo.clamp %cst_43, %9797, %cst_44 : tensor<1x1200x1280xf32>
-    %9799 = stablehlo.multiply %9798, %9798 : tensor<1x1200x1280xf32>
-    %9800 = stablehlo.multiply %cst_45, %9799 : tensor<1x1200x1280xf32>
-    %9801 = stablehlo.add %9800, %cst_46 : tensor<1x1200x1280xf32>
-    %9802 = stablehlo.multiply %9801, %9799 : tensor<1x1200x1280xf32>
-    %9803 = stablehlo.add %9802, %cst_47 : tensor<1x1200x1280xf32>
-    %9804 = stablehlo.multiply %9803, %9799 : tensor<1x1200x1280xf32>
-    %9805 = stablehlo.add %9804, %cst_48 : tensor<1x1200x1280xf32>
-    %9806 = stablehlo.multiply %9805, %9799 : tensor<1x1200x1280xf32>
-    %9807 = stablehlo.add %9806, %cst_49 : tensor<1x1200x1280xf32>
-    %9808 = stablehlo.multiply %9807, %9799 : tensor<1x1200x1280xf32>
-    %9809 = stablehlo.add %9808, %cst_50 : tensor<1x1200x1280xf32>
-    %9810 = stablehlo.multiply %9809, %9799 : tensor<1x1200x1280xf32>
-    %9811 = stablehlo.add %9810, %cst_51 : tensor<1x1200x1280xf32>
-    %9812 = stablehlo.multiply %cst_52, %9799 : tensor<1x1200x1280xf32>
-    %9813 = stablehlo.add %9812, %cst_53 : tensor<1x1200x1280xf32>
-    %9814 = stablehlo.multiply %9813, %9799 : tensor<1x1200x1280xf32>
-    %9815 = stablehlo.add %9814, %cst_54 : tensor<1x1200x1280xf32>
-    %9816 = stablehlo.multiply %9815, %9799 : tensor<1x1200x1280xf32>
-    %9817 = stablehlo.add %9816, %cst_55 : tensor<1x1200x1280xf32>
-    %9818 = stablehlo.multiply %9817, %9799 : tensor<1x1200x1280xf32>
-    %9819 = stablehlo.add %9818, %cst_56 : tensor<1x1200x1280xf32>
-    %9820 = stablehlo.multiply %9798, %9811 : tensor<1x1200x1280xf32>
-    %9821 = stablehlo.divide %9820, %9819 : tensor<1x1200x1280xf32>
-    %9822 = stablehlo.clamp %cst_57, %9821, %cst_58 : tensor<1x1200x1280xf32>
-    %9823 = stablehlo.convert %9822 : (tensor<1x1200x1280xf32>) -> tensor<1x1200x1280xbf16>
-    %9824 = stablehlo.add %9823, %cst_40 : tensor<1x1200x1280xbf16>
-    %9825 = stablehlo.multiply %9824, %9795 : tensor<1x1200x1280xbf16>
-    %9826 = stablehlo.reshape %9825 : (tensor<1x1200x1280xbf16>) -> tensor<1200x1280xbf16>
-    %9827 = stablehlo.dot_general %9826, %arg909, contracting_dims = [1] x [0] : (tensor<1200x1280xbf16>, tensor<1280x320xbf16>) -> tensor<1200x320xbf16>
-    %9828 = stablehlo.reshape %9827 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9829 = stablehlo.broadcast_in_dim %9828, dims = [0, 1, 2] : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9830 = stablehlo.broadcast_in_dim %arg434, dims = [2] : (tensor<320xbf16>) -> tensor<1x1200x320xbf16>
-    %9831 = stablehlo.add %9829, %9830 : tensor<1x1200x320xbf16>
-    %9832 = stablehlo.reshape %9831 : (tensor<1x1200x320xbf16>) -> tensor<1200x320xbf16>
-    %9833 = stablehlo.reshape %9832 : (tensor<1200x320xbf16>) -> tensor<1x1200x320xbf16>
-    %9834 = stablehlo.add %9833, %9738 : tensor<1x1200x320xbf16>
-    %9835 = stablehlo.convert %9834 : (tensor<1x1200x320xbf16>) -> tensor<1x1200x320xf32>
-    %9836 = stablehlo.convert %9835 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf64>
-    %9837 = stablehlo.reduce(%9836 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9838 = stablehlo.reshape %9837 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9839 = stablehlo.broadcast_in_dim %9838, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9840 = stablehlo.divide %9839, %2987 : tensor<1x1200x1xf64>
-    %9841 = stablehlo.broadcast_in_dim %9836, dims = [0, 1, 2] : (tensor<1x1200x320xf64>) -> tensor<1x1200x320xf64>
-    %9842 = stablehlo.broadcast_in_dim %9840, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x320xf64>
-    %9843 = stablehlo.subtract %9841, %9842 : tensor<1x1200x320xf64>
-    %9844 = stablehlo.multiply %9843, %9843 : tensor<1x1200x320xf64>
-    %9845 = stablehlo.reduce(%9844 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf64>, tensor<f64>) -> tensor<1x1200xf64>
-    %9846 = stablehlo.reshape %9845 : (tensor<1x1200xf64>) -> tensor<1x1200x1xf64>
-    %9847 = stablehlo.broadcast_in_dim %9846, dims = [0, 1, 2] : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf64>
-    %9848 = stablehlo.divide %9847, %2987 : tensor<1x1200x1xf64>
-    %9849 = stablehlo.convert %9848 : (tensor<1x1200x1xf64>) -> tensor<1x1200x1xf32>
-    %9850 = stablehlo.reduce(%9835 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1200x320xf32>, tensor<f32>) -> tensor<1x1200xf32>
-    %9851 = stablehlo.reshape %9850 : (tensor<1x1200xf32>) -> tensor<1x1200x1xf32>
-    %9852 = stablehlo.broadcast_in_dim %9851, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9853 = stablehlo.divide %9852, %3003 : tensor<1x1200x1xf32>
-    %9854 = stablehlo.broadcast_in_dim %9849, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x1xf32>
-    %9855 = stablehlo.add %9854, %3006 : tensor<1x1200x1xf32>
-    %9856 = stablehlo.rsqrt %9855 : tensor<1x1200x1xf32>
-    %9857 = stablehlo.broadcast_in_dim %9835, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9858 = stablehlo.broadcast_in_dim %9853, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9859 = stablehlo.subtract %9857, %9858 : tensor<1x1200x320xf32>
-    %9860 = stablehlo.broadcast_in_dim %9859, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9861 = stablehlo.broadcast_in_dim %9856, dims = [0, 1, 2] : (tensor<1x1200x1xf32>) -> tensor<1x1200x320xf32>
-    %9862 = stablehlo.multiply %9860, %9861 : tensor<1x1200x320xf32>
-    %9863 = stablehlo.convert %arg435 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9864 = stablehlo.broadcast_in_dim %9862, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9865 = stablehlo.broadcast_in_dim %9863, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9866 = stablehlo.multiply %9864, %9865 : tensor<1x1200x320xf32>
-    %9867 = stablehlo.convert %arg436 : (tensor<320xbf16>) -> tensor<320xf32>
-    %9868 = stablehlo.broadcast_in_dim %9866, dims = [0, 1, 2] : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xf32>
-    %9869 = stablehlo.broadcast_in_dim %9867, dims = [2] : (tensor<320xf32>) -> tensor<1x1200x320xf32>
-    %9870 = stablehlo.add %9868, %9869 : tensor<1x1200x320xf32>
-    %9871 = stablehlo.convert %9870 : (tensor<1x1200x320xf32>) -> tensor<1x1200x320xbf16>
-    %9872 = stablehlo.reshape %9871 : (tensor<1x1200x320xbf16>) -> tensor<1x30x40x320xbf16>
-    %9873 = stablehlo.transpose %9872, dims = [0, 3, 1, 2] : (tensor<1x30x40x320xbf16>) -> tensor<1x320x30x40xbf16>
-    %9874 = stablehlo.convolution(%9873, %arg437) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<512x320x3x3xbf16>) -> tensor<1x512x15x20xbf16>
-    %9875 = stablehlo.reshape %arg438 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %9876 = stablehlo.broadcast_in_dim %9874, dims = [0, 1, 2, 3] : (tensor<1x512x15x20xbf16>) -> tensor<1x512x15x20xbf16>
-    %9877 = stablehlo.broadcast_in_dim %9875, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x15x20xbf16>
-    %9878 = stablehlo.add %9876, %9877 : tensor<1x512x15x20xbf16>
-    %9879 = stablehlo.reshape %9878 : (tensor<1x512x15x20xbf16>) -> tensor<1x512x300xbf16>
-    %9880 = stablehlo.transpose %9879, dims = [0, 2, 1] : (tensor<1x512x300xbf16>) -> tensor<1x300x512xbf16>
-    %9881 = stablehlo.convert %9880 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %9882 = stablehlo.convert %9881 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %9883 = stablehlo.reduce(%9882 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9884 = stablehlo.reshape %9883 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9885 = stablehlo.convert %cst_91 : (tensor<1xi64>) -> tensor<1xf64>
-    %9886 = stablehlo.reshape %9885 : (tensor<1xf64>) -> tensor<f64>
-    %9887 = stablehlo.broadcast_in_dim %9884, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9888 = stablehlo.broadcast_in_dim %9886, dims = [] : (tensor<f64>) -> tensor<1x300x1xf64>
-    %9889 = stablehlo.divide %9887, %9888 : tensor<1x300x1xf64>
-    %9890 = stablehlo.broadcast_in_dim %9882, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %9891 = stablehlo.broadcast_in_dim %9889, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %9892 = stablehlo.subtract %9890, %9891 : tensor<1x300x512xf64>
-    %9893 = stablehlo.multiply %9892, %9892 : tensor<1x300x512xf64>
-    %9894 = stablehlo.reduce(%9893 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9895 = stablehlo.reshape %9894 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9896 = stablehlo.broadcast_in_dim %9895, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9897 = stablehlo.divide %9896, %9888 : tensor<1x300x1xf64>
-    %9898 = stablehlo.convert %9897 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %9899 = stablehlo.reduce(%9881 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %9900 = stablehlo.reshape %9899 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %9901 = stablehlo.convert %cst_91 : (tensor<1xi64>) -> tensor<1xf32>
-    %9902 = stablehlo.reshape %9901 : (tensor<1xf32>) -> tensor<f32>
-    %9903 = stablehlo.broadcast_in_dim %9900, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9904 = stablehlo.broadcast_in_dim %9902, dims = [] : (tensor<f32>) -> tensor<1x300x1xf32>
-    %9905 = stablehlo.divide %9903, %9904 : tensor<1x300x1xf32>
-    %9906 = stablehlo.broadcast_in_dim %9898, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9907 = stablehlo.add %9906, %136 : tensor<1x300x1xf32>
-    %9908 = stablehlo.rsqrt %9907 : tensor<1x300x1xf32>
-    %9909 = stablehlo.broadcast_in_dim %9881, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9910 = stablehlo.broadcast_in_dim %9905, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %9911 = stablehlo.subtract %9909, %9910 : tensor<1x300x512xf32>
-    %9912 = stablehlo.broadcast_in_dim %9911, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9913 = stablehlo.broadcast_in_dim %9908, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %9914 = stablehlo.multiply %9912, %9913 : tensor<1x300x512xf32>
-    %9915 = stablehlo.convert %arg439 : (tensor<512xbf16>) -> tensor<512xf32>
-    %9916 = stablehlo.broadcast_in_dim %9914, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9917 = stablehlo.broadcast_in_dim %9915, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %9918 = stablehlo.multiply %9916, %9917 : tensor<1x300x512xf32>
-    %9919 = stablehlo.convert %arg440 : (tensor<512xbf16>) -> tensor<512xf32>
-    %9920 = stablehlo.broadcast_in_dim %9918, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9921 = stablehlo.broadcast_in_dim %9919, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %9922 = stablehlo.add %9920, %9921 : tensor<1x300x512xf32>
-    %9923 = stablehlo.convert %9922 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %9924 = stablehlo.convert %9923 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %9925 = stablehlo.convert %9924 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %9926 = stablehlo.reduce(%9925 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9927 = stablehlo.reshape %9926 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9928 = stablehlo.broadcast_in_dim %9927, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9929 = stablehlo.divide %9928, %9888 : tensor<1x300x1xf64>
-    %9930 = stablehlo.broadcast_in_dim %9925, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %9931 = stablehlo.broadcast_in_dim %9929, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %9932 = stablehlo.subtract %9930, %9931 : tensor<1x300x512xf64>
-    %9933 = stablehlo.multiply %9932, %9932 : tensor<1x300x512xf64>
-    %9934 = stablehlo.reduce(%9933 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %9935 = stablehlo.reshape %9934 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %9936 = stablehlo.broadcast_in_dim %9935, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %9937 = stablehlo.divide %9936, %9888 : tensor<1x300x1xf64>
-    %9938 = stablehlo.convert %9937 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %9939 = stablehlo.reduce(%9924 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %9940 = stablehlo.reshape %9939 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %9941 = stablehlo.broadcast_in_dim %9940, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9942 = stablehlo.divide %9941, %9904 : tensor<1x300x1xf32>
-    %9943 = stablehlo.broadcast_in_dim %9938, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %9944 = stablehlo.add %9943, %136 : tensor<1x300x1xf32>
-    %9945 = stablehlo.rsqrt %9944 : tensor<1x300x1xf32>
-    %9946 = stablehlo.broadcast_in_dim %9924, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9947 = stablehlo.broadcast_in_dim %9942, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %9948 = stablehlo.subtract %9946, %9947 : tensor<1x300x512xf32>
-    %9949 = stablehlo.broadcast_in_dim %9948, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9950 = stablehlo.broadcast_in_dim %9945, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %9951 = stablehlo.multiply %9949, %9950 : tensor<1x300x512xf32>
-    %9952 = stablehlo.convert %arg441 : (tensor<512xbf16>) -> tensor<512xf32>
-    %9953 = stablehlo.broadcast_in_dim %9951, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9954 = stablehlo.broadcast_in_dim %9952, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %9955 = stablehlo.multiply %9953, %9954 : tensor<1x300x512xf32>
-    %9956 = stablehlo.convert %arg442 : (tensor<512xbf16>) -> tensor<512xf32>
-    %9957 = stablehlo.broadcast_in_dim %9955, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %9958 = stablehlo.broadcast_in_dim %9956, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %9959 = stablehlo.add %9957, %9958 : tensor<1x300x512xf32>
-    %9960 = stablehlo.convert %9959 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %9961 = stablehlo.reshape %9960 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %9962 = stablehlo.convert %9961 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %9963 = stablehlo.dot_general %9962, %arg910, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %9964 = stablehlo.broadcast_in_dim %9963, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %9965 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<300x512xf32>
-    %9966 = stablehlo.multiply %9964, %9965 : tensor<300x512xf32>
-    %9967 = stablehlo.broadcast_in_dim %9966, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %9968 = stablehlo.broadcast_in_dim %arg911, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %9969 = stablehlo.add %9967, %9968 : tensor<300x512xf32>
-    %9970 = stablehlo.convert %9969 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %9971 = stablehlo.reshape %9970 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %9972 = stablehlo.reshape %9971 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %9973 = stablehlo.transpose %9972, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %9974 = stablehlo.dot_general %9962, %arg912, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %9975 = stablehlo.broadcast_in_dim %9974, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %9976 = stablehlo.multiply %9975, %9965 : tensor<300x512xf32>
-    %9977 = stablehlo.broadcast_in_dim %9976, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %9978 = stablehlo.broadcast_in_dim %arg913, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %9979 = stablehlo.add %9977, %9978 : tensor<300x512xf32>
-    %9980 = stablehlo.convert %9979 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %9981 = stablehlo.reshape %9980 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %9982 = stablehlo.reshape %9981 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %9983 = stablehlo.transpose %9982, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %9984 = stablehlo.dot_general %9962, %arg914, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %9985 = stablehlo.broadcast_in_dim %9984, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %9986 = stablehlo.multiply %9985, %9965 : tensor<300x512xf32>
-    %9987 = stablehlo.broadcast_in_dim %9986, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %9988 = stablehlo.broadcast_in_dim %arg915, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %9989 = stablehlo.add %9987, %9988 : tensor<300x512xf32>
-    %9990 = stablehlo.convert %9989 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %9991 = stablehlo.reshape %9990 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %9992 = stablehlo.reshape %9991 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %9993 = stablehlo.transpose %9992, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %9994 = stablehlo.transpose %9983, dims = [0, 1, 3, 2] : (tensor<1x8x300x64xbf16>) -> tensor<1x8x64x300xbf16>
-    %9995 = stablehlo.reshape %9973 : (tensor<1x8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %9996 = stablehlo.reshape %9994 : (tensor<1x8x64x300xbf16>) -> tensor<8x64x300xbf16>
-    %9997 = stablehlo.broadcast_in_dim %9996, dims = [0, 1, 2] : (tensor<8x64x300xbf16>) -> tensor<8x64x300xbf16>
-    %9998 = stablehlo.dot_general %9995, %9997, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x300x64xbf16>, tensor<8x64x300xbf16>) -> tensor<8x300x300xbf16>
-    %9999 = stablehlo.reshape %9998 : (tensor<8x300x300xbf16>) -> tensor<1x8x300x300xbf16>
-    %10000 = stablehlo.broadcast_in_dim %9999, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xbf16>) -> tensor<1x8x300x300xbf16>
-    %10001 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x8x300x300xbf16>
-    %10002 = stablehlo.divide %10000, %10001 : tensor<1x8x300x300xbf16>
-    %10003 = stablehlo.convert %10002 : (tensor<1x8x300x300xbf16>) -> tensor<1x8x300x300xf32>
-    %10004 = stablehlo.reduce(%10003 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x300x300xf32>, tensor<f32>) -> tensor<1x8x300xf32>
-    %10005 = stablehlo.reshape %10004 : (tensor<1x8x300xf32>) -> tensor<1x8x300x1xf32>
-    %10006 = stablehlo.broadcast_in_dim %10003, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xf32>
-    %10007 = stablehlo.broadcast_in_dim %10005, dims = [0, 1, 2, 3] : (tensor<1x8x300x1xf32>) -> tensor<1x8x300x300xf32>
-    %10008 = stablehlo.subtract %10006, %10007 : tensor<1x8x300x300xf32>
-    %10009 = stablehlo.exponential %10008 : tensor<1x8x300x300xf32>
-    %10010 = stablehlo.reduce(%10009 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x300x300xf32>, tensor<f32>) -> tensor<1x8x300xf32>
-    %10011 = stablehlo.reshape %10010 : (tensor<1x8x300xf32>) -> tensor<1x8x300x1xf32>
-    %10012 = stablehlo.broadcast_in_dim %10009, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xf32>
-    %10013 = stablehlo.broadcast_in_dim %10011, dims = [0, 1, 2, 3] : (tensor<1x8x300x1xf32>) -> tensor<1x8x300x300xf32>
-    %10014 = stablehlo.divide %10012, %10013 : tensor<1x8x300x300xf32>
-    %10015 = stablehlo.convert %10014 : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xbf16>
-    %10016 = stablehlo.reshape %10015 : (tensor<1x8x300x300xbf16>) -> tensor<8x300x300xbf16>
-    %10017 = stablehlo.reshape %9993 : (tensor<1x8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10018 = stablehlo.broadcast_in_dim %10017, dims = [0, 1, 2] : (tensor<8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10019 = stablehlo.dot_general %10016, %10018, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x300x300xbf16>, tensor<8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10020 = stablehlo.reshape %10019 : (tensor<8x300x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10021 = stablehlo.transpose %10020, dims = [0, 2, 1, 3] : (tensor<1x8x300x64xbf16>) -> tensor<1x300x8x64xbf16>
-    %10022 = stablehlo.reshape %10021 : (tensor<1x300x8x64xbf16>) -> tensor<1x300x512xbf16>
-    %10023 = stablehlo.reshape %10022 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10024 = stablehlo.convert %10023 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10025 = stablehlo.dot_general %10024, %arg916, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10026 = stablehlo.broadcast_in_dim %10025, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10027 = stablehlo.multiply %10026, %9965 : tensor<300x512xf32>
-    %10028 = stablehlo.broadcast_in_dim %10027, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10029 = stablehlo.broadcast_in_dim %arg917, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10030 = stablehlo.add %10028, %10029 : tensor<300x512xf32>
-    %10031 = stablehlo.convert %10030 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10032 = stablehlo.reshape %10031 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10033 = stablehlo.add %10032, %9923 : tensor<1x300x512xbf16>
-    %10034 = stablehlo.convert %10033 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %10035 = stablehlo.convert %10034 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %10036 = stablehlo.reduce(%10035 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10037 = stablehlo.reshape %10036 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10038 = stablehlo.broadcast_in_dim %10037, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10039 = stablehlo.divide %10038, %9888 : tensor<1x300x1xf64>
-    %10040 = stablehlo.broadcast_in_dim %10035, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %10041 = stablehlo.broadcast_in_dim %10039, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %10042 = stablehlo.subtract %10040, %10041 : tensor<1x300x512xf64>
-    %10043 = stablehlo.multiply %10042, %10042 : tensor<1x300x512xf64>
-    %10044 = stablehlo.reduce(%10043 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10045 = stablehlo.reshape %10044 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10046 = stablehlo.broadcast_in_dim %10045, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10047 = stablehlo.divide %10046, %9888 : tensor<1x300x1xf64>
-    %10048 = stablehlo.convert %10047 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %10049 = stablehlo.reduce(%10034 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %10050 = stablehlo.reshape %10049 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %10051 = stablehlo.broadcast_in_dim %10050, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10052 = stablehlo.divide %10051, %9904 : tensor<1x300x1xf32>
-    %10053 = stablehlo.broadcast_in_dim %10048, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10054 = stablehlo.add %10053, %136 : tensor<1x300x1xf32>
-    %10055 = stablehlo.rsqrt %10054 : tensor<1x300x1xf32>
-    %10056 = stablehlo.broadcast_in_dim %10034, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10057 = stablehlo.broadcast_in_dim %10052, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10058 = stablehlo.subtract %10056, %10057 : tensor<1x300x512xf32>
-    %10059 = stablehlo.broadcast_in_dim %10058, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10060 = stablehlo.broadcast_in_dim %10055, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10061 = stablehlo.multiply %10059, %10060 : tensor<1x300x512xf32>
-    %10062 = stablehlo.convert %arg443 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10063 = stablehlo.broadcast_in_dim %10061, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10064 = stablehlo.broadcast_in_dim %10062, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10065 = stablehlo.multiply %10063, %10064 : tensor<1x300x512xf32>
-    %10066 = stablehlo.convert %arg444 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10067 = stablehlo.broadcast_in_dim %10065, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10068 = stablehlo.broadcast_in_dim %10066, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10069 = stablehlo.add %10067, %10068 : tensor<1x300x512xf32>
-    %10070 = stablehlo.convert %10069 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %10071 = stablehlo.reshape %10070 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10072 = stablehlo.convert %10071 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10073 = stablehlo.dot_general %10072, %arg918, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x2048xf32>) -> tensor<300x2048xf32>
-    %10074 = stablehlo.broadcast_in_dim %10073, dims = [0, 1] : (tensor<300x2048xf32>) -> tensor<300x2048xf32>
-    %10075 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<300x2048xf32>
-    %10076 = stablehlo.multiply %10074, %10075 : tensor<300x2048xf32>
-    %10077 = stablehlo.broadcast_in_dim %10076, dims = [0, 1] : (tensor<300x2048xf32>) -> tensor<300x2048xf32>
-    %10078 = stablehlo.broadcast_in_dim %arg919, dims = [1] : (tensor<2048xf32>) -> tensor<300x2048xf32>
-    %10079 = stablehlo.add %10077, %10078 : tensor<300x2048xf32>
-    %10080 = stablehlo.convert %10079 : (tensor<300x2048xf32>) -> tensor<300x2048xbf16>
-    %10081 = stablehlo.reshape %10080 : (tensor<300x2048xbf16>) -> tensor<1x300x2048xbf16>
-    %10082 = stablehlo.transpose %10081, dims = [0, 2, 1] : (tensor<1x300x2048xbf16>) -> tensor<1x2048x300xbf16>
-    %10083 = stablehlo.reshape %10082 : (tensor<1x2048x300xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10084 = stablehlo.convolution(%10083, %arg445) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 2048 : i64} : (tensor<1x2048x15x20xbf16>, tensor<2048x1x3x3xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10085 = stablehlo.reshape %arg446 : (tensor<2048xbf16>) -> tensor<2048x1x1xbf16>
-    %10086 = stablehlo.broadcast_in_dim %10084, dims = [0, 1, 2, 3] : (tensor<1x2048x15x20xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10087 = stablehlo.broadcast_in_dim %10085, dims = [1, 2, 3] : (tensor<2048x1x1xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10088 = stablehlo.add %10086, %10087 : tensor<1x2048x15x20xbf16>
-    %10089 = stablehlo.reshape %10088 : (tensor<1x2048x15x20xbf16>) -> tensor<1x2048x300xbf16>
-    %10090 = stablehlo.transpose %10089, dims = [0, 2, 1] : (tensor<1x2048x300xbf16>) -> tensor<1x300x2048xbf16>
-    %10091 = stablehlo.multiply %10090, %cst_61 : tensor<1x300x2048xbf16>
-    %10092 = stablehlo.rsqrt %cst_60 : tensor<1x300x2048xbf16>
-    %10093 = stablehlo.multiply %10090, %10092 : tensor<1x300x2048xbf16>
-    %10094 = stablehlo.convert %10093 : (tensor<1x300x2048xbf16>) -> tensor<1x300x2048xf32>
-    %10095 = stablehlo.clamp %cst_62, %10094, %cst_63 : tensor<1x300x2048xf32>
-    %10096 = stablehlo.multiply %10095, %10095 : tensor<1x300x2048xf32>
-    %10097 = stablehlo.multiply %cst_64, %10096 : tensor<1x300x2048xf32>
-    %10098 = stablehlo.add %10097, %cst_65 : tensor<1x300x2048xf32>
-    %10099 = stablehlo.multiply %10098, %10096 : tensor<1x300x2048xf32>
-    %10100 = stablehlo.add %10099, %cst_66 : tensor<1x300x2048xf32>
-    %10101 = stablehlo.multiply %10100, %10096 : tensor<1x300x2048xf32>
-    %10102 = stablehlo.add %10101, %cst_67 : tensor<1x300x2048xf32>
-    %10103 = stablehlo.multiply %10102, %10096 : tensor<1x300x2048xf32>
-    %10104 = stablehlo.add %10103, %cst_68 : tensor<1x300x2048xf32>
-    %10105 = stablehlo.multiply %10104, %10096 : tensor<1x300x2048xf32>
-    %10106 = stablehlo.add %10105, %cst_69 : tensor<1x300x2048xf32>
-    %10107 = stablehlo.multiply %10106, %10096 : tensor<1x300x2048xf32>
-    %10108 = stablehlo.add %10107, %cst_70 : tensor<1x300x2048xf32>
-    %10109 = stablehlo.multiply %cst_71, %10096 : tensor<1x300x2048xf32>
-    %10110 = stablehlo.add %10109, %cst_72 : tensor<1x300x2048xf32>
-    %10111 = stablehlo.multiply %10110, %10096 : tensor<1x300x2048xf32>
-    %10112 = stablehlo.add %10111, %cst_73 : tensor<1x300x2048xf32>
-    %10113 = stablehlo.multiply %10112, %10096 : tensor<1x300x2048xf32>
-    %10114 = stablehlo.add %10113, %cst_74 : tensor<1x300x2048xf32>
-    %10115 = stablehlo.multiply %10114, %10096 : tensor<1x300x2048xf32>
-    %10116 = stablehlo.add %10115, %cst_75 : tensor<1x300x2048xf32>
-    %10117 = stablehlo.multiply %10095, %10108 : tensor<1x300x2048xf32>
-    %10118 = stablehlo.divide %10117, %10116 : tensor<1x300x2048xf32>
-    %10119 = stablehlo.clamp %cst_76, %10118, %cst_77 : tensor<1x300x2048xf32>
-    %10120 = stablehlo.convert %10119 : (tensor<1x300x2048xf32>) -> tensor<1x300x2048xbf16>
-    %10121 = stablehlo.add %10120, %cst_59 : tensor<1x300x2048xbf16>
-    %10122 = stablehlo.multiply %10121, %10091 : tensor<1x300x2048xbf16>
-    %10123 = stablehlo.reshape %10122 : (tensor<1x300x2048xbf16>) -> tensor<300x2048xbf16>
-    %10124 = stablehlo.dot_general %10123, %arg920, contracting_dims = [1] x [0] : (tensor<300x2048xbf16>, tensor<2048x512xbf16>) -> tensor<300x512xbf16>
-    %10125 = stablehlo.reshape %10124 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10126 = stablehlo.broadcast_in_dim %10125, dims = [0, 1, 2] : (tensor<1x300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10127 = stablehlo.broadcast_in_dim %arg447, dims = [2] : (tensor<512xbf16>) -> tensor<1x300x512xbf16>
-    %10128 = stablehlo.add %10126, %10127 : tensor<1x300x512xbf16>
-    %10129 = stablehlo.reshape %10128 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10130 = stablehlo.reshape %10129 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10131 = stablehlo.add %10130, %10033 : tensor<1x300x512xbf16>
-    %10132 = stablehlo.convert %10131 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %10133 = stablehlo.convert %10132 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %10134 = stablehlo.reduce(%10133 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10135 = stablehlo.reshape %10134 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10136 = stablehlo.broadcast_in_dim %10135, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10137 = stablehlo.divide %10136, %9888 : tensor<1x300x1xf64>
-    %10138 = stablehlo.broadcast_in_dim %10133, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %10139 = stablehlo.broadcast_in_dim %10137, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %10140 = stablehlo.subtract %10138, %10139 : tensor<1x300x512xf64>
-    %10141 = stablehlo.multiply %10140, %10140 : tensor<1x300x512xf64>
-    %10142 = stablehlo.reduce(%10141 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10143 = stablehlo.reshape %10142 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10144 = stablehlo.broadcast_in_dim %10143, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10145 = stablehlo.divide %10144, %9888 : tensor<1x300x1xf64>
-    %10146 = stablehlo.convert %10145 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %10147 = stablehlo.reduce(%10132 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %10148 = stablehlo.reshape %10147 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %10149 = stablehlo.broadcast_in_dim %10148, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10150 = stablehlo.divide %10149, %9904 : tensor<1x300x1xf32>
-    %10151 = stablehlo.broadcast_in_dim %10146, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10152 = stablehlo.add %10151, %136 : tensor<1x300x1xf32>
-    %10153 = stablehlo.rsqrt %10152 : tensor<1x300x1xf32>
-    %10154 = stablehlo.broadcast_in_dim %10132, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10155 = stablehlo.broadcast_in_dim %10150, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10156 = stablehlo.subtract %10154, %10155 : tensor<1x300x512xf32>
-    %10157 = stablehlo.broadcast_in_dim %10156, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10158 = stablehlo.broadcast_in_dim %10153, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10159 = stablehlo.multiply %10157, %10158 : tensor<1x300x512xf32>
-    %10160 = stablehlo.convert %arg448 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10161 = stablehlo.broadcast_in_dim %10159, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10162 = stablehlo.broadcast_in_dim %10160, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10163 = stablehlo.multiply %10161, %10162 : tensor<1x300x512xf32>
-    %10164 = stablehlo.convert %arg449 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10165 = stablehlo.broadcast_in_dim %10163, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10166 = stablehlo.broadcast_in_dim %10164, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10167 = stablehlo.add %10165, %10166 : tensor<1x300x512xf32>
-    %10168 = stablehlo.convert %10167 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %10169 = stablehlo.reshape %10168 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10170 = stablehlo.convert %10169 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10171 = stablehlo.dot_general %10170, %arg921, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10172 = stablehlo.broadcast_in_dim %10171, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10173 = stablehlo.multiply %10172, %9965 : tensor<300x512xf32>
-    %10174 = stablehlo.broadcast_in_dim %10173, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10175 = stablehlo.broadcast_in_dim %arg922, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10176 = stablehlo.add %10174, %10175 : tensor<300x512xf32>
-    %10177 = stablehlo.convert %10176 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10178 = stablehlo.reshape %10177 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10179 = stablehlo.reshape %10178 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %10180 = stablehlo.transpose %10179, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10181 = stablehlo.dot_general %10170, %arg923, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10182 = stablehlo.broadcast_in_dim %10181, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10183 = stablehlo.multiply %10182, %9965 : tensor<300x512xf32>
-    %10184 = stablehlo.broadcast_in_dim %10183, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10185 = stablehlo.broadcast_in_dim %arg924, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10186 = stablehlo.add %10184, %10185 : tensor<300x512xf32>
-    %10187 = stablehlo.convert %10186 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10188 = stablehlo.reshape %10187 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10189 = stablehlo.reshape %10188 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %10190 = stablehlo.transpose %10189, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10191 = stablehlo.dot_general %10170, %arg925, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10192 = stablehlo.broadcast_in_dim %10191, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10193 = stablehlo.multiply %10192, %9965 : tensor<300x512xf32>
-    %10194 = stablehlo.broadcast_in_dim %10193, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10195 = stablehlo.broadcast_in_dim %arg926, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10196 = stablehlo.add %10194, %10195 : tensor<300x512xf32>
-    %10197 = stablehlo.convert %10196 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10198 = stablehlo.reshape %10197 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10199 = stablehlo.reshape %10198 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %10200 = stablehlo.transpose %10199, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10201 = stablehlo.transpose %10190, dims = [0, 1, 3, 2] : (tensor<1x8x300x64xbf16>) -> tensor<1x8x64x300xbf16>
-    %10202 = stablehlo.reshape %10180 : (tensor<1x8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10203 = stablehlo.reshape %10201 : (tensor<1x8x64x300xbf16>) -> tensor<8x64x300xbf16>
-    %10204 = stablehlo.broadcast_in_dim %10203, dims = [0, 1, 2] : (tensor<8x64x300xbf16>) -> tensor<8x64x300xbf16>
-    %10205 = stablehlo.dot_general %10202, %10204, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x300x64xbf16>, tensor<8x64x300xbf16>) -> tensor<8x300x300xbf16>
-    %10206 = stablehlo.reshape %10205 : (tensor<8x300x300xbf16>) -> tensor<1x8x300x300xbf16>
-    %10207 = stablehlo.broadcast_in_dim %10206, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xbf16>) -> tensor<1x8x300x300xbf16>
-    %10208 = stablehlo.divide %10207, %10001 : tensor<1x8x300x300xbf16>
-    %10209 = stablehlo.convert %10208 : (tensor<1x8x300x300xbf16>) -> tensor<1x8x300x300xf32>
-    %10210 = stablehlo.reduce(%10209 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x300x300xf32>, tensor<f32>) -> tensor<1x8x300xf32>
-    %10211 = stablehlo.reshape %10210 : (tensor<1x8x300xf32>) -> tensor<1x8x300x1xf32>
-    %10212 = stablehlo.broadcast_in_dim %10209, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xf32>
-    %10213 = stablehlo.broadcast_in_dim %10211, dims = [0, 1, 2, 3] : (tensor<1x8x300x1xf32>) -> tensor<1x8x300x300xf32>
-    %10214 = stablehlo.subtract %10212, %10213 : tensor<1x8x300x300xf32>
-    %10215 = stablehlo.exponential %10214 : tensor<1x8x300x300xf32>
-    %10216 = stablehlo.reduce(%10215 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x300x300xf32>, tensor<f32>) -> tensor<1x8x300xf32>
-    %10217 = stablehlo.reshape %10216 : (tensor<1x8x300xf32>) -> tensor<1x8x300x1xf32>
-    %10218 = stablehlo.broadcast_in_dim %10215, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xf32>
-    %10219 = stablehlo.broadcast_in_dim %10217, dims = [0, 1, 2, 3] : (tensor<1x8x300x1xf32>) -> tensor<1x8x300x300xf32>
-    %10220 = stablehlo.divide %10218, %10219 : tensor<1x8x300x300xf32>
-    %10221 = stablehlo.convert %10220 : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xbf16>
-    %10222 = stablehlo.reshape %10221 : (tensor<1x8x300x300xbf16>) -> tensor<8x300x300xbf16>
-    %10223 = stablehlo.reshape %10200 : (tensor<1x8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10224 = stablehlo.broadcast_in_dim %10223, dims = [0, 1, 2] : (tensor<8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10225 = stablehlo.dot_general %10222, %10224, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x300x300xbf16>, tensor<8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10226 = stablehlo.reshape %10225 : (tensor<8x300x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10227 = stablehlo.transpose %10226, dims = [0, 2, 1, 3] : (tensor<1x8x300x64xbf16>) -> tensor<1x300x8x64xbf16>
-    %10228 = stablehlo.reshape %10227 : (tensor<1x300x8x64xbf16>) -> tensor<1x300x512xbf16>
-    %10229 = stablehlo.reshape %10228 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10230 = stablehlo.convert %10229 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10231 = stablehlo.dot_general %10230, %arg927, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10232 = stablehlo.broadcast_in_dim %10231, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10233 = stablehlo.multiply %10232, %9965 : tensor<300x512xf32>
-    %10234 = stablehlo.broadcast_in_dim %10233, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10235 = stablehlo.broadcast_in_dim %arg928, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10236 = stablehlo.add %10234, %10235 : tensor<300x512xf32>
-    %10237 = stablehlo.convert %10236 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10238 = stablehlo.reshape %10237 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10239 = stablehlo.add %10238, %10131 : tensor<1x300x512xbf16>
-    %10240 = stablehlo.convert %10239 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %10241 = stablehlo.convert %10240 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %10242 = stablehlo.reduce(%10241 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10243 = stablehlo.reshape %10242 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10244 = stablehlo.broadcast_in_dim %10243, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10245 = stablehlo.divide %10244, %9888 : tensor<1x300x1xf64>
-    %10246 = stablehlo.broadcast_in_dim %10241, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %10247 = stablehlo.broadcast_in_dim %10245, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %10248 = stablehlo.subtract %10246, %10247 : tensor<1x300x512xf64>
-    %10249 = stablehlo.multiply %10248, %10248 : tensor<1x300x512xf64>
-    %10250 = stablehlo.reduce(%10249 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10251 = stablehlo.reshape %10250 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10252 = stablehlo.broadcast_in_dim %10251, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10253 = stablehlo.divide %10252, %9888 : tensor<1x300x1xf64>
-    %10254 = stablehlo.convert %10253 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %10255 = stablehlo.reduce(%10240 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %10256 = stablehlo.reshape %10255 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %10257 = stablehlo.broadcast_in_dim %10256, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10258 = stablehlo.divide %10257, %9904 : tensor<1x300x1xf32>
-    %10259 = stablehlo.broadcast_in_dim %10254, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10260 = stablehlo.add %10259, %136 : tensor<1x300x1xf32>
-    %10261 = stablehlo.rsqrt %10260 : tensor<1x300x1xf32>
-    %10262 = stablehlo.broadcast_in_dim %10240, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10263 = stablehlo.broadcast_in_dim %10258, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10264 = stablehlo.subtract %10262, %10263 : tensor<1x300x512xf32>
-    %10265 = stablehlo.broadcast_in_dim %10264, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10266 = stablehlo.broadcast_in_dim %10261, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10267 = stablehlo.multiply %10265, %10266 : tensor<1x300x512xf32>
-    %10268 = stablehlo.convert %arg450 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10269 = stablehlo.broadcast_in_dim %10267, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10270 = stablehlo.broadcast_in_dim %10268, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10271 = stablehlo.multiply %10269, %10270 : tensor<1x300x512xf32>
-    %10272 = stablehlo.convert %arg451 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10273 = stablehlo.broadcast_in_dim %10271, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10274 = stablehlo.broadcast_in_dim %10272, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10275 = stablehlo.add %10273, %10274 : tensor<1x300x512xf32>
-    %10276 = stablehlo.convert %10275 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %10277 = stablehlo.reshape %10276 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10278 = stablehlo.convert %10277 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10279 = stablehlo.dot_general %10278, %arg929, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x2048xf32>) -> tensor<300x2048xf32>
-    %10280 = stablehlo.broadcast_in_dim %10279, dims = [0, 1] : (tensor<300x2048xf32>) -> tensor<300x2048xf32>
-    %10281 = stablehlo.multiply %10280, %10075 : tensor<300x2048xf32>
-    %10282 = stablehlo.broadcast_in_dim %10281, dims = [0, 1] : (tensor<300x2048xf32>) -> tensor<300x2048xf32>
-    %10283 = stablehlo.broadcast_in_dim %arg930, dims = [1] : (tensor<2048xf32>) -> tensor<300x2048xf32>
-    %10284 = stablehlo.add %10282, %10283 : tensor<300x2048xf32>
-    %10285 = stablehlo.convert %10284 : (tensor<300x2048xf32>) -> tensor<300x2048xbf16>
-    %10286 = stablehlo.reshape %10285 : (tensor<300x2048xbf16>) -> tensor<1x300x2048xbf16>
-    %10287 = stablehlo.transpose %10286, dims = [0, 2, 1] : (tensor<1x300x2048xbf16>) -> tensor<1x2048x300xbf16>
-    %10288 = stablehlo.reshape %10287 : (tensor<1x2048x300xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10289 = stablehlo.convolution(%10288, %arg452) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 2048 : i64} : (tensor<1x2048x15x20xbf16>, tensor<2048x1x3x3xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10290 = stablehlo.reshape %arg453 : (tensor<2048xbf16>) -> tensor<2048x1x1xbf16>
-    %10291 = stablehlo.broadcast_in_dim %10289, dims = [0, 1, 2, 3] : (tensor<1x2048x15x20xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10292 = stablehlo.broadcast_in_dim %10290, dims = [1, 2, 3] : (tensor<2048x1x1xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10293 = stablehlo.add %10291, %10292 : tensor<1x2048x15x20xbf16>
-    %10294 = stablehlo.reshape %10293 : (tensor<1x2048x15x20xbf16>) -> tensor<1x2048x300xbf16>
-    %10295 = stablehlo.transpose %10294, dims = [0, 2, 1] : (tensor<1x2048x300xbf16>) -> tensor<1x300x2048xbf16>
-    %10296 = stablehlo.multiply %10295, %cst_61 : tensor<1x300x2048xbf16>
-    %10297 = stablehlo.multiply %10295, %10092 : tensor<1x300x2048xbf16>
-    %10298 = stablehlo.convert %10297 : (tensor<1x300x2048xbf16>) -> tensor<1x300x2048xf32>
-    %10299 = stablehlo.clamp %cst_62, %10298, %cst_63 : tensor<1x300x2048xf32>
-    %10300 = stablehlo.multiply %10299, %10299 : tensor<1x300x2048xf32>
-    %10301 = stablehlo.multiply %cst_64, %10300 : tensor<1x300x2048xf32>
-    %10302 = stablehlo.add %10301, %cst_65 : tensor<1x300x2048xf32>
-    %10303 = stablehlo.multiply %10302, %10300 : tensor<1x300x2048xf32>
-    %10304 = stablehlo.add %10303, %cst_66 : tensor<1x300x2048xf32>
-    %10305 = stablehlo.multiply %10304, %10300 : tensor<1x300x2048xf32>
-    %10306 = stablehlo.add %10305, %cst_67 : tensor<1x300x2048xf32>
-    %10307 = stablehlo.multiply %10306, %10300 : tensor<1x300x2048xf32>
-    %10308 = stablehlo.add %10307, %cst_68 : tensor<1x300x2048xf32>
-    %10309 = stablehlo.multiply %10308, %10300 : tensor<1x300x2048xf32>
-    %10310 = stablehlo.add %10309, %cst_69 : tensor<1x300x2048xf32>
-    %10311 = stablehlo.multiply %10310, %10300 : tensor<1x300x2048xf32>
-    %10312 = stablehlo.add %10311, %cst_70 : tensor<1x300x2048xf32>
-    %10313 = stablehlo.multiply %cst_71, %10300 : tensor<1x300x2048xf32>
-    %10314 = stablehlo.add %10313, %cst_72 : tensor<1x300x2048xf32>
-    %10315 = stablehlo.multiply %10314, %10300 : tensor<1x300x2048xf32>
-    %10316 = stablehlo.add %10315, %cst_73 : tensor<1x300x2048xf32>
-    %10317 = stablehlo.multiply %10316, %10300 : tensor<1x300x2048xf32>
-    %10318 = stablehlo.add %10317, %cst_74 : tensor<1x300x2048xf32>
-    %10319 = stablehlo.multiply %10318, %10300 : tensor<1x300x2048xf32>
-    %10320 = stablehlo.add %10319, %cst_75 : tensor<1x300x2048xf32>
-    %10321 = stablehlo.multiply %10299, %10312 : tensor<1x300x2048xf32>
-    %10322 = stablehlo.divide %10321, %10320 : tensor<1x300x2048xf32>
-    %10323 = stablehlo.clamp %cst_76, %10322, %cst_77 : tensor<1x300x2048xf32>
-    %10324 = stablehlo.convert %10323 : (tensor<1x300x2048xf32>) -> tensor<1x300x2048xbf16>
-    %10325 = stablehlo.add %10324, %cst_59 : tensor<1x300x2048xbf16>
-    %10326 = stablehlo.multiply %10325, %10296 : tensor<1x300x2048xbf16>
-    %10327 = stablehlo.reshape %10326 : (tensor<1x300x2048xbf16>) -> tensor<300x2048xbf16>
-    %10328 = stablehlo.dot_general %10327, %arg931, contracting_dims = [1] x [0] : (tensor<300x2048xbf16>, tensor<2048x512xbf16>) -> tensor<300x512xbf16>
-    %10329 = stablehlo.reshape %10328 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10330 = stablehlo.broadcast_in_dim %10329, dims = [0, 1, 2] : (tensor<1x300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10331 = stablehlo.broadcast_in_dim %arg454, dims = [2] : (tensor<512xbf16>) -> tensor<1x300x512xbf16>
-    %10332 = stablehlo.add %10330, %10331 : tensor<1x300x512xbf16>
-    %10333 = stablehlo.reshape %10332 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10334 = stablehlo.reshape %10333 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10335 = stablehlo.add %10334, %10239 : tensor<1x300x512xbf16>
-    %10336 = stablehlo.convert %10335 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %10337 = stablehlo.convert %10336 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %10338 = stablehlo.reduce(%10337 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10339 = stablehlo.reshape %10338 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10340 = stablehlo.broadcast_in_dim %10339, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10341 = stablehlo.divide %10340, %9888 : tensor<1x300x1xf64>
-    %10342 = stablehlo.broadcast_in_dim %10337, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %10343 = stablehlo.broadcast_in_dim %10341, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %10344 = stablehlo.subtract %10342, %10343 : tensor<1x300x512xf64>
-    %10345 = stablehlo.multiply %10344, %10344 : tensor<1x300x512xf64>
-    %10346 = stablehlo.reduce(%10345 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10347 = stablehlo.reshape %10346 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10348 = stablehlo.broadcast_in_dim %10347, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10349 = stablehlo.divide %10348, %9888 : tensor<1x300x1xf64>
-    %10350 = stablehlo.convert %10349 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %10351 = stablehlo.reduce(%10336 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %10352 = stablehlo.reshape %10351 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %10353 = stablehlo.broadcast_in_dim %10352, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10354 = stablehlo.divide %10353, %9904 : tensor<1x300x1xf32>
-    %10355 = stablehlo.broadcast_in_dim %10350, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10356 = stablehlo.add %10355, %136 : tensor<1x300x1xf32>
-    %10357 = stablehlo.rsqrt %10356 : tensor<1x300x1xf32>
-    %10358 = stablehlo.broadcast_in_dim %10336, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10359 = stablehlo.broadcast_in_dim %10354, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10360 = stablehlo.subtract %10358, %10359 : tensor<1x300x512xf32>
-    %10361 = stablehlo.broadcast_in_dim %10360, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10362 = stablehlo.broadcast_in_dim %10357, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10363 = stablehlo.multiply %10361, %10362 : tensor<1x300x512xf32>
-    %10364 = stablehlo.convert %arg455 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10365 = stablehlo.broadcast_in_dim %10363, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10366 = stablehlo.broadcast_in_dim %10364, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10367 = stablehlo.multiply %10365, %10366 : tensor<1x300x512xf32>
-    %10368 = stablehlo.convert %arg456 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10369 = stablehlo.broadcast_in_dim %10367, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10370 = stablehlo.broadcast_in_dim %10368, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10371 = stablehlo.add %10369, %10370 : tensor<1x300x512xf32>
-    %10372 = stablehlo.convert %10371 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %10373 = stablehlo.reshape %10372 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10374 = stablehlo.convert %10373 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10375 = stablehlo.dot_general %10374, %arg932, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10376 = stablehlo.broadcast_in_dim %10375, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10377 = stablehlo.multiply %10376, %9965 : tensor<300x512xf32>
-    %10378 = stablehlo.broadcast_in_dim %10377, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10379 = stablehlo.broadcast_in_dim %arg933, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10380 = stablehlo.add %10378, %10379 : tensor<300x512xf32>
-    %10381 = stablehlo.convert %10380 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10382 = stablehlo.reshape %10381 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10383 = stablehlo.reshape %10382 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %10384 = stablehlo.transpose %10383, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10385 = stablehlo.dot_general %10374, %arg934, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10386 = stablehlo.broadcast_in_dim %10385, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10387 = stablehlo.multiply %10386, %9965 : tensor<300x512xf32>
-    %10388 = stablehlo.broadcast_in_dim %10387, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10389 = stablehlo.broadcast_in_dim %arg935, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10390 = stablehlo.add %10388, %10389 : tensor<300x512xf32>
-    %10391 = stablehlo.convert %10390 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10392 = stablehlo.reshape %10391 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10393 = stablehlo.reshape %10392 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %10394 = stablehlo.transpose %10393, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10395 = stablehlo.dot_general %10374, %arg936, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10396 = stablehlo.broadcast_in_dim %10395, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10397 = stablehlo.multiply %10396, %9965 : tensor<300x512xf32>
-    %10398 = stablehlo.broadcast_in_dim %10397, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10399 = stablehlo.broadcast_in_dim %arg937, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10400 = stablehlo.add %10398, %10399 : tensor<300x512xf32>
-    %10401 = stablehlo.convert %10400 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10402 = stablehlo.reshape %10401 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10403 = stablehlo.reshape %10402 : (tensor<1x300x512xbf16>) -> tensor<1x300x8x64xbf16>
-    %10404 = stablehlo.transpose %10403, dims = [0, 2, 1, 3] : (tensor<1x300x8x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10405 = stablehlo.transpose %10394, dims = [0, 1, 3, 2] : (tensor<1x8x300x64xbf16>) -> tensor<1x8x64x300xbf16>
-    %10406 = stablehlo.reshape %10384 : (tensor<1x8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10407 = stablehlo.reshape %10405 : (tensor<1x8x64x300xbf16>) -> tensor<8x64x300xbf16>
-    %10408 = stablehlo.broadcast_in_dim %10407, dims = [0, 1, 2] : (tensor<8x64x300xbf16>) -> tensor<8x64x300xbf16>
-    %10409 = stablehlo.dot_general %10406, %10408, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x300x64xbf16>, tensor<8x64x300xbf16>) -> tensor<8x300x300xbf16>
-    %10410 = stablehlo.reshape %10409 : (tensor<8x300x300xbf16>) -> tensor<1x8x300x300xbf16>
-    %10411 = stablehlo.broadcast_in_dim %10410, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xbf16>) -> tensor<1x8x300x300xbf16>
-    %10412 = stablehlo.divide %10411, %10001 : tensor<1x8x300x300xbf16>
-    %10413 = stablehlo.convert %10412 : (tensor<1x8x300x300xbf16>) -> tensor<1x8x300x300xf32>
-    %10414 = stablehlo.reduce(%10413 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x300x300xf32>, tensor<f32>) -> tensor<1x8x300xf32>
-    %10415 = stablehlo.reshape %10414 : (tensor<1x8x300xf32>) -> tensor<1x8x300x1xf32>
-    %10416 = stablehlo.broadcast_in_dim %10413, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xf32>
-    %10417 = stablehlo.broadcast_in_dim %10415, dims = [0, 1, 2, 3] : (tensor<1x8x300x1xf32>) -> tensor<1x8x300x300xf32>
-    %10418 = stablehlo.subtract %10416, %10417 : tensor<1x8x300x300xf32>
-    %10419 = stablehlo.exponential %10418 : tensor<1x8x300x300xf32>
-    %10420 = stablehlo.reduce(%10419 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x300x300xf32>, tensor<f32>) -> tensor<1x8x300xf32>
-    %10421 = stablehlo.reshape %10420 : (tensor<1x8x300xf32>) -> tensor<1x8x300x1xf32>
-    %10422 = stablehlo.broadcast_in_dim %10419, dims = [0, 1, 2, 3] : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xf32>
-    %10423 = stablehlo.broadcast_in_dim %10421, dims = [0, 1, 2, 3] : (tensor<1x8x300x1xf32>) -> tensor<1x8x300x300xf32>
-    %10424 = stablehlo.divide %10422, %10423 : tensor<1x8x300x300xf32>
-    %10425 = stablehlo.convert %10424 : (tensor<1x8x300x300xf32>) -> tensor<1x8x300x300xbf16>
-    %10426 = stablehlo.reshape %10425 : (tensor<1x8x300x300xbf16>) -> tensor<8x300x300xbf16>
-    %10427 = stablehlo.reshape %10404 : (tensor<1x8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10428 = stablehlo.broadcast_in_dim %10427, dims = [0, 1, 2] : (tensor<8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10429 = stablehlo.dot_general %10426, %10428, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x300x300xbf16>, tensor<8x300x64xbf16>) -> tensor<8x300x64xbf16>
-    %10430 = stablehlo.reshape %10429 : (tensor<8x300x64xbf16>) -> tensor<1x8x300x64xbf16>
-    %10431 = stablehlo.transpose %10430, dims = [0, 2, 1, 3] : (tensor<1x8x300x64xbf16>) -> tensor<1x300x8x64xbf16>
-    %10432 = stablehlo.reshape %10431 : (tensor<1x300x8x64xbf16>) -> tensor<1x300x512xbf16>
-    %10433 = stablehlo.reshape %10432 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10434 = stablehlo.convert %10433 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10435 = stablehlo.dot_general %10434, %arg938, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x512xf32>) -> tensor<300x512xf32>
-    %10436 = stablehlo.broadcast_in_dim %10435, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10437 = stablehlo.multiply %10436, %9965 : tensor<300x512xf32>
-    %10438 = stablehlo.broadcast_in_dim %10437, dims = [0, 1] : (tensor<300x512xf32>) -> tensor<300x512xf32>
-    %10439 = stablehlo.broadcast_in_dim %arg939, dims = [1] : (tensor<512xf32>) -> tensor<300x512xf32>
-    %10440 = stablehlo.add %10438, %10439 : tensor<300x512xf32>
-    %10441 = stablehlo.convert %10440 : (tensor<300x512xf32>) -> tensor<300x512xbf16>
-    %10442 = stablehlo.reshape %10441 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10443 = stablehlo.add %10442, %10335 : tensor<1x300x512xbf16>
-    %10444 = stablehlo.convert %10443 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %10445 = stablehlo.convert %10444 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %10446 = stablehlo.reduce(%10445 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10447 = stablehlo.reshape %10446 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10448 = stablehlo.broadcast_in_dim %10447, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10449 = stablehlo.divide %10448, %9888 : tensor<1x300x1xf64>
-    %10450 = stablehlo.broadcast_in_dim %10445, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %10451 = stablehlo.broadcast_in_dim %10449, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %10452 = stablehlo.subtract %10450, %10451 : tensor<1x300x512xf64>
-    %10453 = stablehlo.multiply %10452, %10452 : tensor<1x300x512xf64>
-    %10454 = stablehlo.reduce(%10453 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10455 = stablehlo.reshape %10454 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10456 = stablehlo.broadcast_in_dim %10455, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10457 = stablehlo.divide %10456, %9888 : tensor<1x300x1xf64>
-    %10458 = stablehlo.convert %10457 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %10459 = stablehlo.reduce(%10444 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %10460 = stablehlo.reshape %10459 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %10461 = stablehlo.broadcast_in_dim %10460, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10462 = stablehlo.divide %10461, %9904 : tensor<1x300x1xf32>
-    %10463 = stablehlo.broadcast_in_dim %10458, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10464 = stablehlo.add %10463, %136 : tensor<1x300x1xf32>
-    %10465 = stablehlo.rsqrt %10464 : tensor<1x300x1xf32>
-    %10466 = stablehlo.broadcast_in_dim %10444, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10467 = stablehlo.broadcast_in_dim %10462, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10468 = stablehlo.subtract %10466, %10467 : tensor<1x300x512xf32>
-    %10469 = stablehlo.broadcast_in_dim %10468, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10470 = stablehlo.broadcast_in_dim %10465, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10471 = stablehlo.multiply %10469, %10470 : tensor<1x300x512xf32>
-    %10472 = stablehlo.convert %arg457 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10473 = stablehlo.broadcast_in_dim %10471, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10474 = stablehlo.broadcast_in_dim %10472, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10475 = stablehlo.multiply %10473, %10474 : tensor<1x300x512xf32>
-    %10476 = stablehlo.convert %arg458 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10477 = stablehlo.broadcast_in_dim %10475, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10478 = stablehlo.broadcast_in_dim %10476, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10479 = stablehlo.add %10477, %10478 : tensor<1x300x512xf32>
-    %10480 = stablehlo.convert %10479 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %10481 = stablehlo.reshape %10480 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10482 = stablehlo.convert %10481 : (tensor<300x512xbf16>) -> tensor<300x512xf32>
-    %10483 = stablehlo.dot_general %10482, %arg940, contracting_dims = [1] x [0] : (tensor<300x512xf32>, tensor<512x2048xf32>) -> tensor<300x2048xf32>
-    %10484 = stablehlo.broadcast_in_dim %10483, dims = [0, 1] : (tensor<300x2048xf32>) -> tensor<300x2048xf32>
-    %10485 = stablehlo.multiply %10484, %10075 : tensor<300x2048xf32>
-    %10486 = stablehlo.broadcast_in_dim %10485, dims = [0, 1] : (tensor<300x2048xf32>) -> tensor<300x2048xf32>
-    %10487 = stablehlo.broadcast_in_dim %arg941, dims = [1] : (tensor<2048xf32>) -> tensor<300x2048xf32>
-    %10488 = stablehlo.add %10486, %10487 : tensor<300x2048xf32>
-    %10489 = stablehlo.convert %10488 : (tensor<300x2048xf32>) -> tensor<300x2048xbf16>
-    %10490 = stablehlo.reshape %10489 : (tensor<300x2048xbf16>) -> tensor<1x300x2048xbf16>
-    %10491 = stablehlo.transpose %10490, dims = [0, 2, 1] : (tensor<1x300x2048xbf16>) -> tensor<1x2048x300xbf16>
-    %10492 = stablehlo.reshape %10491 : (tensor<1x2048x300xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10493 = stablehlo.convolution(%10492, %arg459) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 2048 : i64} : (tensor<1x2048x15x20xbf16>, tensor<2048x1x3x3xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10494 = stablehlo.reshape %arg460 : (tensor<2048xbf16>) -> tensor<2048x1x1xbf16>
-    %10495 = stablehlo.broadcast_in_dim %10493, dims = [0, 1, 2, 3] : (tensor<1x2048x15x20xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10496 = stablehlo.broadcast_in_dim %10494, dims = [1, 2, 3] : (tensor<2048x1x1xbf16>) -> tensor<1x2048x15x20xbf16>
-    %10497 = stablehlo.add %10495, %10496 : tensor<1x2048x15x20xbf16>
-    %10498 = stablehlo.reshape %10497 : (tensor<1x2048x15x20xbf16>) -> tensor<1x2048x300xbf16>
-    %10499 = stablehlo.transpose %10498, dims = [0, 2, 1] : (tensor<1x2048x300xbf16>) -> tensor<1x300x2048xbf16>
-    %10500 = stablehlo.multiply %10499, %cst_61 : tensor<1x300x2048xbf16>
-    %10501 = stablehlo.multiply %10499, %10092 : tensor<1x300x2048xbf16>
-    %10502 = stablehlo.convert %10501 : (tensor<1x300x2048xbf16>) -> tensor<1x300x2048xf32>
-    %10503 = stablehlo.clamp %cst_62, %10502, %cst_63 : tensor<1x300x2048xf32>
-    %10504 = stablehlo.multiply %10503, %10503 : tensor<1x300x2048xf32>
-    %10505 = stablehlo.multiply %cst_64, %10504 : tensor<1x300x2048xf32>
-    %10506 = stablehlo.add %10505, %cst_65 : tensor<1x300x2048xf32>
-    %10507 = stablehlo.multiply %10506, %10504 : tensor<1x300x2048xf32>
-    %10508 = stablehlo.add %10507, %cst_66 : tensor<1x300x2048xf32>
-    %10509 = stablehlo.multiply %10508, %10504 : tensor<1x300x2048xf32>
-    %10510 = stablehlo.add %10509, %cst_67 : tensor<1x300x2048xf32>
-    %10511 = stablehlo.multiply %10510, %10504 : tensor<1x300x2048xf32>
-    %10512 = stablehlo.add %10511, %cst_68 : tensor<1x300x2048xf32>
-    %10513 = stablehlo.multiply %10512, %10504 : tensor<1x300x2048xf32>
-    %10514 = stablehlo.add %10513, %cst_69 : tensor<1x300x2048xf32>
-    %10515 = stablehlo.multiply %10514, %10504 : tensor<1x300x2048xf32>
-    %10516 = stablehlo.add %10515, %cst_70 : tensor<1x300x2048xf32>
-    %10517 = stablehlo.multiply %cst_71, %10504 : tensor<1x300x2048xf32>
-    %10518 = stablehlo.add %10517, %cst_72 : tensor<1x300x2048xf32>
-    %10519 = stablehlo.multiply %10518, %10504 : tensor<1x300x2048xf32>
-    %10520 = stablehlo.add %10519, %cst_73 : tensor<1x300x2048xf32>
-    %10521 = stablehlo.multiply %10520, %10504 : tensor<1x300x2048xf32>
-    %10522 = stablehlo.add %10521, %cst_74 : tensor<1x300x2048xf32>
-    %10523 = stablehlo.multiply %10522, %10504 : tensor<1x300x2048xf32>
-    %10524 = stablehlo.add %10523, %cst_75 : tensor<1x300x2048xf32>
-    %10525 = stablehlo.multiply %10503, %10516 : tensor<1x300x2048xf32>
-    %10526 = stablehlo.divide %10525, %10524 : tensor<1x300x2048xf32>
-    %10527 = stablehlo.clamp %cst_76, %10526, %cst_77 : tensor<1x300x2048xf32>
-    %10528 = stablehlo.convert %10527 : (tensor<1x300x2048xf32>) -> tensor<1x300x2048xbf16>
-    %10529 = stablehlo.add %10528, %cst_59 : tensor<1x300x2048xbf16>
-    %10530 = stablehlo.multiply %10529, %10500 : tensor<1x300x2048xbf16>
-    %10531 = stablehlo.reshape %10530 : (tensor<1x300x2048xbf16>) -> tensor<300x2048xbf16>
-    %10532 = stablehlo.dot_general %10531, %arg942, contracting_dims = [1] x [0] : (tensor<300x2048xbf16>, tensor<2048x512xbf16>) -> tensor<300x512xbf16>
-    %10533 = stablehlo.reshape %10532 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10534 = stablehlo.broadcast_in_dim %10533, dims = [0, 1, 2] : (tensor<1x300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10535 = stablehlo.broadcast_in_dim %arg461, dims = [2] : (tensor<512xbf16>) -> tensor<1x300x512xbf16>
-    %10536 = stablehlo.add %10534, %10535 : tensor<1x300x512xbf16>
-    %10537 = stablehlo.reshape %10536 : (tensor<1x300x512xbf16>) -> tensor<300x512xbf16>
-    %10538 = stablehlo.reshape %10537 : (tensor<300x512xbf16>) -> tensor<1x300x512xbf16>
-    %10539 = stablehlo.add %10538, %10443 : tensor<1x300x512xbf16>
-    %10540 = stablehlo.convert %10539 : (tensor<1x300x512xbf16>) -> tensor<1x300x512xf32>
-    %10541 = stablehlo.convert %10540 : (tensor<1x300x512xf32>) -> tensor<1x300x512xf64>
-    %10542 = stablehlo.reduce(%10541 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10543 = stablehlo.reshape %10542 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10544 = stablehlo.broadcast_in_dim %10543, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10545 = stablehlo.divide %10544, %9888 : tensor<1x300x1xf64>
-    %10546 = stablehlo.broadcast_in_dim %10541, dims = [0, 1, 2] : (tensor<1x300x512xf64>) -> tensor<1x300x512xf64>
-    %10547 = stablehlo.broadcast_in_dim %10545, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x512xf64>
-    %10548 = stablehlo.subtract %10546, %10547 : tensor<1x300x512xf64>
-    %10549 = stablehlo.multiply %10548, %10548 : tensor<1x300x512xf64>
-    %10550 = stablehlo.reduce(%10549 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf64>, tensor<f64>) -> tensor<1x300xf64>
-    %10551 = stablehlo.reshape %10550 : (tensor<1x300xf64>) -> tensor<1x300x1xf64>
-    %10552 = stablehlo.broadcast_in_dim %10551, dims = [0, 1, 2] : (tensor<1x300x1xf64>) -> tensor<1x300x1xf64>
-    %10553 = stablehlo.divide %10552, %9888 : tensor<1x300x1xf64>
-    %10554 = stablehlo.convert %10553 : (tensor<1x300x1xf64>) -> tensor<1x300x1xf32>
-    %10555 = stablehlo.reduce(%10540 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x300x512xf32>, tensor<f32>) -> tensor<1x300xf32>
-    %10556 = stablehlo.reshape %10555 : (tensor<1x300xf32>) -> tensor<1x300x1xf32>
-    %10557 = stablehlo.broadcast_in_dim %10556, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10558 = stablehlo.divide %10557, %9904 : tensor<1x300x1xf32>
-    %10559 = stablehlo.broadcast_in_dim %10554, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x1xf32>
-    %10560 = stablehlo.add %10559, %136 : tensor<1x300x1xf32>
-    %10561 = stablehlo.rsqrt %10560 : tensor<1x300x1xf32>
-    %10562 = stablehlo.broadcast_in_dim %10540, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10563 = stablehlo.broadcast_in_dim %10558, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10564 = stablehlo.subtract %10562, %10563 : tensor<1x300x512xf32>
-    %10565 = stablehlo.broadcast_in_dim %10564, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10566 = stablehlo.broadcast_in_dim %10561, dims = [0, 1, 2] : (tensor<1x300x1xf32>) -> tensor<1x300x512xf32>
-    %10567 = stablehlo.multiply %10565, %10566 : tensor<1x300x512xf32>
-    %10568 = stablehlo.convert %arg462 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10569 = stablehlo.broadcast_in_dim %10567, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10570 = stablehlo.broadcast_in_dim %10568, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10571 = stablehlo.multiply %10569, %10570 : tensor<1x300x512xf32>
-    %10572 = stablehlo.convert %arg463 : (tensor<512xbf16>) -> tensor<512xf32>
-    %10573 = stablehlo.broadcast_in_dim %10571, dims = [0, 1, 2] : (tensor<1x300x512xf32>) -> tensor<1x300x512xf32>
-    %10574 = stablehlo.broadcast_in_dim %10572, dims = [2] : (tensor<512xf32>) -> tensor<1x300x512xf32>
-    %10575 = stablehlo.add %10573, %10574 : tensor<1x300x512xf32>
-    %10576 = stablehlo.convert %10575 : (tensor<1x300x512xf32>) -> tensor<1x300x512xbf16>
-    %10577 = stablehlo.reshape %10576 : (tensor<1x300x512xbf16>) -> tensor<1x15x20x512xbf16>
-    %10578 = stablehlo.transpose %10577, dims = [0, 3, 1, 2] : (tensor<1x15x20x512xbf16>) -> tensor<1x512x15x20xbf16>
-    %10579 = stablehlo.convolution(%10578, %arg464) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x15x20xbf16>, tensor<64x512x1x1xbf16>) -> tensor<1x64x15x20xbf16>
-    %10580 = stablehlo.reshape %arg465 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %10581 = stablehlo.broadcast_in_dim %10579, dims = [0, 1, 2, 3] : (tensor<1x64x15x20xbf16>) -> tensor<1x64x15x20xbf16>
-    %10582 = stablehlo.broadcast_in_dim %10580, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x15x20xbf16>
-    %10583 = stablehlo.add %10581, %10582 : tensor<1x64x15x20xbf16>
-    %10584 = stablehlo.transpose %10583, dims = [0, 1, 3, 2] : (tensor<1x64x15x20xbf16>) -> tensor<1x64x20x15xbf16>
-    %10585 = stablehlo.reshape %10584 : (tensor<1x64x20x15xbf16>) -> tensor<64x20x15xbf16>
-    %10586 = stablehlo.broadcast_in_dim %arg943, dims = [0, 1, 2] : (tensor<64x15x30xbf16>) -> tensor<64x15x30xbf16>
-    %10587 = stablehlo.dot_general %10585, %10586, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x20x15xbf16>, tensor<64x15x30xbf16>) -> tensor<64x20x30xbf16>
-    %10588 = stablehlo.reshape %10587 : (tensor<64x20x30xbf16>) -> tensor<1x64x20x30xbf16>
-    %10589 = stablehlo.transpose %10588, dims = [0, 1, 3, 2] : (tensor<1x64x20x30xbf16>) -> tensor<1x64x30x20xbf16>
-    %10590 = stablehlo.reshape %10589 : (tensor<1x64x30x20xbf16>) -> tensor<64x30x20xbf16>
-    %10591 = stablehlo.broadcast_in_dim %arg944, dims = [0, 1, 2] : (tensor<64x20x40xbf16>) -> tensor<64x20x40xbf16>
-    %10592 = stablehlo.dot_general %10590, %10591, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x30x20xbf16>, tensor<64x20x40xbf16>) -> tensor<64x30x40xbf16>
-    %10593 = stablehlo.reshape %10592 : (tensor<64x30x40xbf16>) -> tensor<1x64x30x40xbf16>
-    %10594 = stablehlo.convolution(%9873, %arg466) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x30x40xbf16>, tensor<64x320x1x1xbf16>) -> tensor<1x64x30x40xbf16>
-    %10595 = stablehlo.reshape %arg467 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %10596 = stablehlo.broadcast_in_dim %10594, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xbf16>) -> tensor<1x64x30x40xbf16>
-    %10597 = stablehlo.broadcast_in_dim %10595, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x30x40xbf16>
-    %10598 = stablehlo.add %10596, %10597 : tensor<1x64x30x40xbf16>
-    %10599 = stablehlo.concatenate %10598, %10593, dim = 1 : (tensor<1x64x30x40xbf16>, tensor<1x64x30x40xbf16>) -> tensor<1x128x30x40xbf16>
-    %10600 = stablehlo.convolution(%10599, %arg468) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x30x40xbf16>, tensor<64x128x3x3xbf16>) -> tensor<1x64x30x40xbf16>
-    %10601 = stablehlo.reshape %arg469 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %10602 = stablehlo.broadcast_in_dim %10600, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xbf16>) -> tensor<1x64x30x40xbf16>
-    %10603 = stablehlo.broadcast_in_dim %10601, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x30x40xbf16>
-    %10604 = stablehlo.add %10602, %10603 : tensor<1x64x30x40xbf16>
-    %10605 = stablehlo.convert %10604 : (tensor<1x64x30x40xbf16>) -> tensor<1x64x30x40xf32>
-    %10606 = stablehlo.broadcast_in_dim %10605, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xf32>) -> tensor<1x64x30x40xf32>
-    %10607 = stablehlo.broadcast_in_dim %arg945, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x30x40xf32>
-    %10608 = stablehlo.subtract %10606, %10607 : tensor<1x64x30x40xf32>
-    %10609 = stablehlo.broadcast_in_dim %10608, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xf32>) -> tensor<1x64x30x40xf32>
-    %10610 = stablehlo.broadcast_in_dim %arg946, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x30x40xf32>
-    %10611 = stablehlo.multiply %10609, %10610 : tensor<1x64x30x40xf32>
-    %10612 = stablehlo.convert %arg947 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %10613 = stablehlo.broadcast_in_dim %10611, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xf32>) -> tensor<1x64x30x40xf32>
-    %10614 = stablehlo.broadcast_in_dim %10612, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x30x40xf32>
-    %10615 = stablehlo.multiply %10613, %10614 : tensor<1x64x30x40xf32>
-    %10616 = stablehlo.convert %arg948 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %10617 = stablehlo.broadcast_in_dim %10615, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xf32>) -> tensor<1x64x30x40xf32>
-    %10618 = stablehlo.broadcast_in_dim %10616, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x30x40xf32>
-    %10619 = stablehlo.add %10617, %10618 : tensor<1x64x30x40xf32>
-    %10620 = stablehlo.convert %10619 : (tensor<1x64x30x40xf32>) -> tensor<1x64x30x40xbf16>
-    %10621 = stablehlo.maximum %10620, %cst_78 : tensor<1x64x30x40xbf16>
-    %10622 = stablehlo.convolution(%10621, %arg470) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x30x40xbf16>, tensor<32x64x3x3xbf16>) -> tensor<1x32x30x40xbf16>
-    %10623 = stablehlo.reshape %arg471 : (tensor<32xbf16>) -> tensor<32x1x1xbf16>
-    %10624 = stablehlo.broadcast_in_dim %10622, dims = [0, 1, 2, 3] : (tensor<1x32x30x40xbf16>) -> tensor<1x32x30x40xbf16>
-    %10625 = stablehlo.broadcast_in_dim %10623, dims = [1, 2, 3] : (tensor<32x1x1xbf16>) -> tensor<1x32x30x40xbf16>
-    %10626 = stablehlo.add %10624, %10625 : tensor<1x32x30x40xbf16>
-    %10627 = stablehlo.convert %10626 : (tensor<1x32x30x40xbf16>) -> tensor<1x32x30x40xf32>
-    %10628 = stablehlo.broadcast_in_dim %10627, dims = [0, 1, 2, 3] : (tensor<1x32x30x40xf32>) -> tensor<1x32x30x40xf32>
-    %10629 = stablehlo.broadcast_in_dim %arg949, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x30x40xf32>
-    %10630 = stablehlo.subtract %10628, %10629 : tensor<1x32x30x40xf32>
-    %10631 = stablehlo.broadcast_in_dim %10630, dims = [0, 1, 2, 3] : (tensor<1x32x30x40xf32>) -> tensor<1x32x30x40xf32>
-    %10632 = stablehlo.broadcast_in_dim %arg950, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x30x40xf32>
-    %10633 = stablehlo.multiply %10631, %10632 : tensor<1x32x30x40xf32>
-    %10634 = stablehlo.convert %arg951 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %10635 = stablehlo.broadcast_in_dim %10633, dims = [0, 1, 2, 3] : (tensor<1x32x30x40xf32>) -> tensor<1x32x30x40xf32>
-    %10636 = stablehlo.broadcast_in_dim %10634, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x30x40xf32>
-    %10637 = stablehlo.multiply %10635, %10636 : tensor<1x32x30x40xf32>
-    %10638 = stablehlo.convert %arg952 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %10639 = stablehlo.broadcast_in_dim %10637, dims = [0, 1, 2, 3] : (tensor<1x32x30x40xf32>) -> tensor<1x32x30x40xf32>
-    %10640 = stablehlo.broadcast_in_dim %10638, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x30x40xf32>
-    %10641 = stablehlo.add %10639, %10640 : tensor<1x32x30x40xf32>
-    %10642 = stablehlo.convert %10641 : (tensor<1x32x30x40xf32>) -> tensor<1x32x30x40xbf16>
-    %10643 = stablehlo.maximum %10642, %cst_79 : tensor<1x32x30x40xbf16>
-    %10644 = stablehlo.convolution(%10643, %arg472) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x30x40xbf16>, tensor<2x32x3x3xbf16>) -> tensor<1x2x30x40xbf16>
-    %10645 = stablehlo.reshape %arg473 : (tensor<2xbf16>) -> tensor<2x1x1xbf16>
-    %10646 = stablehlo.broadcast_in_dim %10644, dims = [0, 1, 2, 3] : (tensor<1x2x30x40xbf16>) -> tensor<1x2x30x40xbf16>
-    %10647 = stablehlo.broadcast_in_dim %10645, dims = [1, 2, 3] : (tensor<2x1x1xbf16>) -> tensor<1x2x30x40xbf16>
-    %10648 = stablehlo.add %10646, %10647 : tensor<1x2x30x40xbf16>
-    %10649 = stablehlo.logistic %10648 : tensor<1x2x30x40xbf16>
-    %10650 = stablehlo.slice %10649 [0:1, 0:1, 0:30, 0:40] : (tensor<1x2x30x40xbf16>) -> tensor<1x1x30x40xbf16>
-    %10651 = stablehlo.reshape %10650 : (tensor<1x1x30x40xbf16>) -> tensor<1x30x40xbf16>
-    %10652 = stablehlo.reshape %10651 : (tensor<1x30x40xbf16>) -> tensor<1x1x30x40xbf16>
-    %10653 = stablehlo.broadcast_in_dim %10598, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xbf16>) -> tensor<1x64x30x40xbf16>
-    %10654 = stablehlo.broadcast_in_dim %10652, dims = [0, 1, 2, 3] : (tensor<1x1x30x40xbf16>) -> tensor<1x64x30x40xbf16>
-    %10655 = stablehlo.multiply %10653, %10654 : tensor<1x64x30x40xbf16>
-    %10656 = stablehlo.slice %10649 [0:1, 1:2, 0:30, 0:40] : (tensor<1x2x30x40xbf16>) -> tensor<1x1x30x40xbf16>
-    %10657 = stablehlo.reshape %10656 : (tensor<1x1x30x40xbf16>) -> tensor<1x30x40xbf16>
-    %10658 = stablehlo.reshape %10657 : (tensor<1x30x40xbf16>) -> tensor<1x1x30x40xbf16>
-    %10659 = stablehlo.broadcast_in_dim %10593, dims = [0, 1, 2, 3] : (tensor<1x64x30x40xbf16>) -> tensor<1x64x30x40xbf16>
-    %10660 = stablehlo.broadcast_in_dim %10658, dims = [0, 1, 2, 3] : (tensor<1x1x30x40xbf16>) -> tensor<1x64x30x40xbf16>
-    %10661 = stablehlo.multiply %10659, %10660 : tensor<1x64x30x40xbf16>
-    %10662 = stablehlo.add %10655, %10661 : tensor<1x64x30x40xbf16>
-    %10663 = stablehlo.transpose %10662, dims = [0, 1, 3, 2] : (tensor<1x64x30x40xbf16>) -> tensor<1x64x40x30xbf16>
-    %10664 = stablehlo.reshape %10663 : (tensor<1x64x40x30xbf16>) -> tensor<64x40x30xbf16>
-    %10665 = stablehlo.broadcast_in_dim %arg953, dims = [0, 1, 2] : (tensor<64x30x60xbf16>) -> tensor<64x30x60xbf16>
-    %10666 = stablehlo.dot_general %10664, %10665, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x40x30xbf16>, tensor<64x30x60xbf16>) -> tensor<64x40x60xbf16>
-    %10667 = stablehlo.reshape %10666 : (tensor<64x40x60xbf16>) -> tensor<1x64x40x60xbf16>
-    %10668 = stablehlo.transpose %10667, dims = [0, 1, 3, 2] : (tensor<1x64x40x60xbf16>) -> tensor<1x64x60x40xbf16>
-    %10669 = stablehlo.reshape %10668 : (tensor<1x64x60x40xbf16>) -> tensor<64x60x40xbf16>
-    %10670 = stablehlo.broadcast_in_dim %arg954, dims = [0, 1, 2] : (tensor<64x40x80xbf16>) -> tensor<64x40x80xbf16>
-    %10671 = stablehlo.dot_general %10669, %10670, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x60x40xbf16>, tensor<64x40x80xbf16>) -> tensor<64x60x80xbf16>
-    %10672 = stablehlo.reshape %10671 : (tensor<64x60x80xbf16>) -> tensor<1x64x60x80xbf16>
-    %10673 = stablehlo.convolution(%2972, %arg474) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<64x128x1x1xbf16>) -> tensor<1x64x60x80xbf16>
-    %10674 = stablehlo.reshape %arg475 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %10675 = stablehlo.broadcast_in_dim %10673, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xbf16>) -> tensor<1x64x60x80xbf16>
-    %10676 = stablehlo.broadcast_in_dim %10674, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x60x80xbf16>
-    %10677 = stablehlo.add %10675, %10676 : tensor<1x64x60x80xbf16>
-    %10678 = stablehlo.concatenate %10677, %10672, dim = 1 : (tensor<1x64x60x80xbf16>, tensor<1x64x60x80xbf16>) -> tensor<1x128x60x80xbf16>
-    %10679 = stablehlo.convolution(%10678, %arg476) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x60x80xbf16>, tensor<64x128x3x3xbf16>) -> tensor<1x64x60x80xbf16>
-    %10680 = stablehlo.reshape %arg477 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %10681 = stablehlo.broadcast_in_dim %10679, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xbf16>) -> tensor<1x64x60x80xbf16>
-    %10682 = stablehlo.broadcast_in_dim %10680, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x60x80xbf16>
-    %10683 = stablehlo.add %10681, %10682 : tensor<1x64x60x80xbf16>
-    %10684 = stablehlo.convert %10683 : (tensor<1x64x60x80xbf16>) -> tensor<1x64x60x80xf32>
-    %10685 = stablehlo.broadcast_in_dim %10684, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xf32>) -> tensor<1x64x60x80xf32>
-    %10686 = stablehlo.broadcast_in_dim %arg955, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x60x80xf32>
-    %10687 = stablehlo.subtract %10685, %10686 : tensor<1x64x60x80xf32>
-    %10688 = stablehlo.broadcast_in_dim %10687, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xf32>) -> tensor<1x64x60x80xf32>
-    %10689 = stablehlo.broadcast_in_dim %arg956, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x60x80xf32>
-    %10690 = stablehlo.multiply %10688, %10689 : tensor<1x64x60x80xf32>
-    %10691 = stablehlo.convert %arg957 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %10692 = stablehlo.broadcast_in_dim %10690, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xf32>) -> tensor<1x64x60x80xf32>
-    %10693 = stablehlo.broadcast_in_dim %10691, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x60x80xf32>
-    %10694 = stablehlo.multiply %10692, %10693 : tensor<1x64x60x80xf32>
-    %10695 = stablehlo.convert %arg958 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %10696 = stablehlo.broadcast_in_dim %10694, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xf32>) -> tensor<1x64x60x80xf32>
-    %10697 = stablehlo.broadcast_in_dim %10695, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x60x80xf32>
-    %10698 = stablehlo.add %10696, %10697 : tensor<1x64x60x80xf32>
-    %10699 = stablehlo.convert %10698 : (tensor<1x64x60x80xf32>) -> tensor<1x64x60x80xbf16>
-    %10700 = stablehlo.maximum %10699, %cst_80 : tensor<1x64x60x80xbf16>
-    %10701 = stablehlo.convolution(%10700, %arg478) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x60x80xbf16>, tensor<32x64x3x3xbf16>) -> tensor<1x32x60x80xbf16>
-    %10702 = stablehlo.reshape %arg479 : (tensor<32xbf16>) -> tensor<32x1x1xbf16>
-    %10703 = stablehlo.broadcast_in_dim %10701, dims = [0, 1, 2, 3] : (tensor<1x32x60x80xbf16>) -> tensor<1x32x60x80xbf16>
-    %10704 = stablehlo.broadcast_in_dim %10702, dims = [1, 2, 3] : (tensor<32x1x1xbf16>) -> tensor<1x32x60x80xbf16>
-    %10705 = stablehlo.add %10703, %10704 : tensor<1x32x60x80xbf16>
-    %10706 = stablehlo.convert %10705 : (tensor<1x32x60x80xbf16>) -> tensor<1x32x60x80xf32>
-    %10707 = stablehlo.broadcast_in_dim %10706, dims = [0, 1, 2, 3] : (tensor<1x32x60x80xf32>) -> tensor<1x32x60x80xf32>
-    %10708 = stablehlo.broadcast_in_dim %arg959, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x60x80xf32>
-    %10709 = stablehlo.subtract %10707, %10708 : tensor<1x32x60x80xf32>
-    %10710 = stablehlo.broadcast_in_dim %10709, dims = [0, 1, 2, 3] : (tensor<1x32x60x80xf32>) -> tensor<1x32x60x80xf32>
-    %10711 = stablehlo.broadcast_in_dim %arg960, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x60x80xf32>
-    %10712 = stablehlo.multiply %10710, %10711 : tensor<1x32x60x80xf32>
-    %10713 = stablehlo.convert %arg961 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %10714 = stablehlo.broadcast_in_dim %10712, dims = [0, 1, 2, 3] : (tensor<1x32x60x80xf32>) -> tensor<1x32x60x80xf32>
-    %10715 = stablehlo.broadcast_in_dim %10713, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x60x80xf32>
-    %10716 = stablehlo.multiply %10714, %10715 : tensor<1x32x60x80xf32>
-    %10717 = stablehlo.convert %arg962 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %10718 = stablehlo.broadcast_in_dim %10716, dims = [0, 1, 2, 3] : (tensor<1x32x60x80xf32>) -> tensor<1x32x60x80xf32>
-    %10719 = stablehlo.broadcast_in_dim %10717, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x60x80xf32>
-    %10720 = stablehlo.add %10718, %10719 : tensor<1x32x60x80xf32>
-    %10721 = stablehlo.convert %10720 : (tensor<1x32x60x80xf32>) -> tensor<1x32x60x80xbf16>
-    %10722 = stablehlo.maximum %10721, %cst_81 : tensor<1x32x60x80xbf16>
-    %10723 = stablehlo.convolution(%10722, %arg480) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x60x80xbf16>, tensor<2x32x3x3xbf16>) -> tensor<1x2x60x80xbf16>
-    %10724 = stablehlo.reshape %arg481 : (tensor<2xbf16>) -> tensor<2x1x1xbf16>
-    %10725 = stablehlo.broadcast_in_dim %10723, dims = [0, 1, 2, 3] : (tensor<1x2x60x80xbf16>) -> tensor<1x2x60x80xbf16>
-    %10726 = stablehlo.broadcast_in_dim %10724, dims = [1, 2, 3] : (tensor<2x1x1xbf16>) -> tensor<1x2x60x80xbf16>
-    %10727 = stablehlo.add %10725, %10726 : tensor<1x2x60x80xbf16>
-    %10728 = stablehlo.logistic %10727 : tensor<1x2x60x80xbf16>
-    %10729 = stablehlo.slice %10728 [0:1, 0:1, 0:60, 0:80] : (tensor<1x2x60x80xbf16>) -> tensor<1x1x60x80xbf16>
-    %10730 = stablehlo.reshape %10729 : (tensor<1x1x60x80xbf16>) -> tensor<1x60x80xbf16>
-    %10731 = stablehlo.reshape %10730 : (tensor<1x60x80xbf16>) -> tensor<1x1x60x80xbf16>
-    %10732 = stablehlo.broadcast_in_dim %10677, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xbf16>) -> tensor<1x64x60x80xbf16>
-    %10733 = stablehlo.broadcast_in_dim %10731, dims = [0, 1, 2, 3] : (tensor<1x1x60x80xbf16>) -> tensor<1x64x60x80xbf16>
-    %10734 = stablehlo.multiply %10732, %10733 : tensor<1x64x60x80xbf16>
-    %10735 = stablehlo.slice %10728 [0:1, 1:2, 0:60, 0:80] : (tensor<1x2x60x80xbf16>) -> tensor<1x1x60x80xbf16>
-    %10736 = stablehlo.reshape %10735 : (tensor<1x1x60x80xbf16>) -> tensor<1x60x80xbf16>
-    %10737 = stablehlo.reshape %10736 : (tensor<1x60x80xbf16>) -> tensor<1x1x60x80xbf16>
-    %10738 = stablehlo.broadcast_in_dim %10672, dims = [0, 1, 2, 3] : (tensor<1x64x60x80xbf16>) -> tensor<1x64x60x80xbf16>
-    %10739 = stablehlo.broadcast_in_dim %10737, dims = [0, 1, 2, 3] : (tensor<1x1x60x80xbf16>) -> tensor<1x64x60x80xbf16>
-    %10740 = stablehlo.multiply %10738, %10739 : tensor<1x64x60x80xbf16>
-    %10741 = stablehlo.add %10734, %10740 : tensor<1x64x60x80xbf16>
-    %10742 = stablehlo.transpose %10741, dims = [0, 1, 3, 2] : (tensor<1x64x60x80xbf16>) -> tensor<1x64x80x60xbf16>
-    %10743 = stablehlo.reshape %10742 : (tensor<1x64x80x60xbf16>) -> tensor<64x80x60xbf16>
-    %10744 = stablehlo.broadcast_in_dim %arg963, dims = [0, 1, 2] : (tensor<64x60x120xbf16>) -> tensor<64x60x120xbf16>
-    %10745 = stablehlo.dot_general %10743, %10744, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x80x60xbf16>, tensor<64x60x120xbf16>) -> tensor<64x80x120xbf16>
-    %10746 = stablehlo.reshape %10745 : (tensor<64x80x120xbf16>) -> tensor<1x64x80x120xbf16>
-    %10747 = stablehlo.transpose %10746, dims = [0, 1, 3, 2] : (tensor<1x64x80x120xbf16>) -> tensor<1x64x120x80xbf16>
-    %10748 = stablehlo.reshape %10747 : (tensor<1x64x120x80xbf16>) -> tensor<64x120x80xbf16>
-    %10749 = stablehlo.broadcast_in_dim %arg964, dims = [0, 1, 2] : (tensor<64x80x160xbf16>) -> tensor<64x80x160xbf16>
-    %10750 = stablehlo.dot_general %10748, %10749, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x120x80xbf16>, tensor<64x80x160xbf16>) -> tensor<64x120x160xbf16>
-    %10751 = stablehlo.reshape %10750 : (tensor<64x120x160xbf16>) -> tensor<1x64x120x160xbf16>
-    %10752 = stablehlo.concatenate %859, %10751, dim = 1 : (tensor<1x64x120x160xbf16>, tensor<1x64x120x160xbf16>) -> tensor<1x128x120x160xbf16>
-    %10753 = stablehlo.convolution(%10752, %arg482) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x120x160xbf16>, tensor<64x128x3x3xbf16>) -> tensor<1x64x120x160xbf16>
-    %10754 = stablehlo.reshape %arg483 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %10755 = stablehlo.broadcast_in_dim %10753, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xbf16>) -> tensor<1x64x120x160xbf16>
-    %10756 = stablehlo.broadcast_in_dim %10754, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x120x160xbf16>
-    %10757 = stablehlo.add %10755, %10756 : tensor<1x64x120x160xbf16>
-    %10758 = stablehlo.convert %10757 : (tensor<1x64x120x160xbf16>) -> tensor<1x64x120x160xf32>
-    %10759 = stablehlo.broadcast_in_dim %10758, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xf32>) -> tensor<1x64x120x160xf32>
-    %10760 = stablehlo.broadcast_in_dim %arg965, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x120x160xf32>
-    %10761 = stablehlo.subtract %10759, %10760 : tensor<1x64x120x160xf32>
-    %10762 = stablehlo.broadcast_in_dim %10761, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xf32>) -> tensor<1x64x120x160xf32>
-    %10763 = stablehlo.broadcast_in_dim %arg966, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x120x160xf32>
-    %10764 = stablehlo.multiply %10762, %10763 : tensor<1x64x120x160xf32>
-    %10765 = stablehlo.convert %arg967 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %10766 = stablehlo.broadcast_in_dim %10764, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xf32>) -> tensor<1x64x120x160xf32>
-    %10767 = stablehlo.broadcast_in_dim %10765, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x120x160xf32>
-    %10768 = stablehlo.multiply %10766, %10767 : tensor<1x64x120x160xf32>
-    %10769 = stablehlo.convert %arg968 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %10770 = stablehlo.broadcast_in_dim %10768, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xf32>) -> tensor<1x64x120x160xf32>
-    %10771 = stablehlo.broadcast_in_dim %10769, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x120x160xf32>
-    %10772 = stablehlo.add %10770, %10771 : tensor<1x64x120x160xf32>
-    %10773 = stablehlo.convert %10772 : (tensor<1x64x120x160xf32>) -> tensor<1x64x120x160xbf16>
-    %10774 = stablehlo.maximum %10773, %cst_82 : tensor<1x64x120x160xbf16>
-    %10775 = stablehlo.convolution(%10774, %arg484) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x120x160xbf16>, tensor<32x64x3x3xbf16>) -> tensor<1x32x120x160xbf16>
-    %10776 = stablehlo.reshape %arg485 : (tensor<32xbf16>) -> tensor<32x1x1xbf16>
-    %10777 = stablehlo.broadcast_in_dim %10775, dims = [0, 1, 2, 3] : (tensor<1x32x120x160xbf16>) -> tensor<1x32x120x160xbf16>
-    %10778 = stablehlo.broadcast_in_dim %10776, dims = [1, 2, 3] : (tensor<32x1x1xbf16>) -> tensor<1x32x120x160xbf16>
-    %10779 = stablehlo.add %10777, %10778 : tensor<1x32x120x160xbf16>
-    %10780 = stablehlo.convert %10779 : (tensor<1x32x120x160xbf16>) -> tensor<1x32x120x160xf32>
-    %10781 = stablehlo.broadcast_in_dim %10780, dims = [0, 1, 2, 3] : (tensor<1x32x120x160xf32>) -> tensor<1x32x120x160xf32>
-    %10782 = stablehlo.broadcast_in_dim %arg969, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x120x160xf32>
-    %10783 = stablehlo.subtract %10781, %10782 : tensor<1x32x120x160xf32>
-    %10784 = stablehlo.broadcast_in_dim %10783, dims = [0, 1, 2, 3] : (tensor<1x32x120x160xf32>) -> tensor<1x32x120x160xf32>
-    %10785 = stablehlo.broadcast_in_dim %arg970, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x120x160xf32>
-    %10786 = stablehlo.multiply %10784, %10785 : tensor<1x32x120x160xf32>
-    %10787 = stablehlo.convert %arg971 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %10788 = stablehlo.broadcast_in_dim %10786, dims = [0, 1, 2, 3] : (tensor<1x32x120x160xf32>) -> tensor<1x32x120x160xf32>
-    %10789 = stablehlo.broadcast_in_dim %10787, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x120x160xf32>
-    %10790 = stablehlo.multiply %10788, %10789 : tensor<1x32x120x160xf32>
-    %10791 = stablehlo.convert %arg972 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %10792 = stablehlo.broadcast_in_dim %10790, dims = [0, 1, 2, 3] : (tensor<1x32x120x160xf32>) -> tensor<1x32x120x160xf32>
-    %10793 = stablehlo.broadcast_in_dim %10791, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x120x160xf32>
-    %10794 = stablehlo.add %10792, %10793 : tensor<1x32x120x160xf32>
-    %10795 = stablehlo.convert %10794 : (tensor<1x32x120x160xf32>) -> tensor<1x32x120x160xbf16>
-    %10796 = stablehlo.maximum %10795, %cst_83 : tensor<1x32x120x160xbf16>
-    %10797 = stablehlo.convolution(%10796, %arg486) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x120x160xbf16>, tensor<2x32x3x3xbf16>) -> tensor<1x2x120x160xbf16>
-    %10798 = stablehlo.reshape %arg487 : (tensor<2xbf16>) -> tensor<2x1x1xbf16>
-    %10799 = stablehlo.broadcast_in_dim %10797, dims = [0, 1, 2, 3] : (tensor<1x2x120x160xbf16>) -> tensor<1x2x120x160xbf16>
-    %10800 = stablehlo.broadcast_in_dim %10798, dims = [1, 2, 3] : (tensor<2x1x1xbf16>) -> tensor<1x2x120x160xbf16>
-    %10801 = stablehlo.add %10799, %10800 : tensor<1x2x120x160xbf16>
-    %10802 = stablehlo.logistic %10801 : tensor<1x2x120x160xbf16>
-    %10803 = stablehlo.slice %10802 [0:1, 0:1, 0:120, 0:160] : (tensor<1x2x120x160xbf16>) -> tensor<1x1x120x160xbf16>
-    %10804 = stablehlo.reshape %10803 : (tensor<1x1x120x160xbf16>) -> tensor<1x120x160xbf16>
-    %10805 = stablehlo.reshape %10804 : (tensor<1x120x160xbf16>) -> tensor<1x1x120x160xbf16>
-    %10806 = stablehlo.broadcast_in_dim %859, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xbf16>) -> tensor<1x64x120x160xbf16>
-    %10807 = stablehlo.broadcast_in_dim %10805, dims = [0, 1, 2, 3] : (tensor<1x1x120x160xbf16>) -> tensor<1x64x120x160xbf16>
-    %10808 = stablehlo.multiply %10806, %10807 : tensor<1x64x120x160xbf16>
-    %10809 = stablehlo.slice %10802 [0:1, 1:2, 0:120, 0:160] : (tensor<1x2x120x160xbf16>) -> tensor<1x1x120x160xbf16>
-    %10810 = stablehlo.reshape %10809 : (tensor<1x1x120x160xbf16>) -> tensor<1x120x160xbf16>
-    %10811 = stablehlo.reshape %10810 : (tensor<1x120x160xbf16>) -> tensor<1x1x120x160xbf16>
-    %10812 = stablehlo.broadcast_in_dim %10751, dims = [0, 1, 2, 3] : (tensor<1x64x120x160xbf16>) -> tensor<1x64x120x160xbf16>
-    %10813 = stablehlo.broadcast_in_dim %10811, dims = [0, 1, 2, 3] : (tensor<1x1x120x160xbf16>) -> tensor<1x64x120x160xbf16>
-    %10814 = stablehlo.multiply %10812, %10813 : tensor<1x64x120x160xbf16>
-    %10815 = stablehlo.add %10808, %10814 : tensor<1x64x120x160xbf16>
-    %10816 = stablehlo.transpose %10815, dims = [0, 1, 3, 2] : (tensor<1x64x120x160xbf16>) -> tensor<1x64x160x120xbf16>
-    %10817 = stablehlo.reshape %10816 : (tensor<1x64x160x120xbf16>) -> tensor<64x160x120xbf16>
-    %10818 = stablehlo.broadcast_in_dim %arg973, dims = [0, 1, 2] : (tensor<64x120x240xbf16>) -> tensor<64x120x240xbf16>
-    %10819 = stablehlo.dot_general %10817, %10818, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x160x120xbf16>, tensor<64x120x240xbf16>) -> tensor<64x160x240xbf16>
-    %10820 = stablehlo.reshape %10819 : (tensor<64x160x240xbf16>) -> tensor<1x64x160x240xbf16>
-    %10821 = stablehlo.transpose %10820, dims = [0, 1, 3, 2] : (tensor<1x64x160x240xbf16>) -> tensor<1x64x240x160xbf16>
-    %10822 = stablehlo.reshape %10821 : (tensor<1x64x240x160xbf16>) -> tensor<64x240x160xbf16>
-    %10823 = stablehlo.broadcast_in_dim %arg974, dims = [0, 1, 2] : (tensor<64x160x320xbf16>) -> tensor<64x160x320xbf16>
-    %10824 = stablehlo.dot_general %10822, %10823, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x240x160xbf16>, tensor<64x160x320xbf16>) -> tensor<64x240x320xbf16>
-    %10825 = stablehlo.reshape %10824 : (tensor<64x240x320xbf16>) -> tensor<1x64x240x320xbf16>
-    %10826 = stablehlo.transpose %10825, dims = [0, 1, 3, 2] : (tensor<1x64x240x320xbf16>) -> tensor<1x64x320x240xbf16>
-    %10827 = stablehlo.reshape %10826 : (tensor<1x64x320x240xbf16>) -> tensor<64x320x240xbf16>
-    %10828 = stablehlo.broadcast_in_dim %arg975, dims = [0, 1, 2] : (tensor<64x240x480xbf16>) -> tensor<64x240x480xbf16>
-    %10829 = stablehlo.dot_general %10827, %10828, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x320x240xbf16>, tensor<64x240x480xbf16>) -> tensor<64x320x480xbf16>
-    %10830 = stablehlo.reshape %10829 : (tensor<64x320x480xbf16>) -> tensor<1x64x320x480xbf16>
-    %10831 = stablehlo.transpose %10830, dims = [0, 1, 3, 2] : (tensor<1x64x320x480xbf16>) -> tensor<1x64x480x320xbf16>
-    %10832 = stablehlo.reshape %10831 : (tensor<1x64x480x320xbf16>) -> tensor<64x480x320xbf16>
-    %10833 = stablehlo.broadcast_in_dim %arg976, dims = [0, 1, 2] : (tensor<64x320x640xbf16>) -> tensor<64x320x640xbf16>
-    %10834 = stablehlo.dot_general %10832, %10833, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<64x480x320xbf16>, tensor<64x320x640xbf16>) -> tensor<64x480x640xbf16>
-    %10835 = stablehlo.reshape %10834 : (tensor<64x480x640xbf16>) -> tensor<1x64x480x640xbf16>
-    %10836 = stablehlo.convolution(%10835, %arg488) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x480x640xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x480x640xbf16>
-    %10837 = stablehlo.reshape %arg489 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %10838 = stablehlo.broadcast_in_dim %10836, dims = [0, 1, 2, 3] : (tensor<1x64x480x640xbf16>) -> tensor<1x64x480x640xbf16>
-    %10839 = stablehlo.broadcast_in_dim %10837, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x480x640xbf16>
-    %10840 = stablehlo.add %10838, %10839 : tensor<1x64x480x640xbf16>
-    %10841 = stablehlo.maximum %10840, %cst_84 : tensor<1x64x480x640xbf16>
-    %10842 = stablehlo.convolution(%10841, %arg490) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x480x640xbf16>, tensor<1x64x3x3xbf16>) -> tensor<1x1x480x640xbf16>
-    %10843 = stablehlo.reshape %arg491 : (tensor<1xbf16>) -> tensor<1x1x1xbf16>
-    %10844 = stablehlo.broadcast_in_dim %10842, dims = [0, 1, 2, 3] : (tensor<1x1x480x640xbf16>) -> tensor<1x1x480x640xbf16>
-    %10845 = stablehlo.broadcast_in_dim %10843, dims = [1, 2, 3] : (tensor<1x1x1xbf16>) -> tensor<1x1x480x640xbf16>
-    %10846 = stablehlo.add %10844, %10845 : tensor<1x1x480x640xbf16>
-    %10847 = stablehlo.logistic %10846 : tensor<1x1x480x640xbf16>
-    %10848 = stablehlo.convert %cst_92 : (tensor<1xi64>) -> tensor<1xbf16>
-    %10849 = stablehlo.reshape %10848 : (tensor<1xbf16>) -> tensor<bf16>
-    %10850 = stablehlo.broadcast_in_dim %10847, dims = [0, 1, 2, 3] : (tensor<1x1x480x640xbf16>) -> tensor<1x1x480x640xbf16>
-    %10851 = stablehlo.broadcast_in_dim %10849, dims = [] : (tensor<bf16>) -> tensor<1x1x480x640xbf16>
-    %10852 = stablehlo.multiply %10850, %10851 : tensor<1x1x480x640xbf16>
-    %10853 = stablehlo.reshape %10852 : (tensor<1x1x480x640xbf16>) -> tensor<1x480x640xbf16>
-    return %10853 : tensor<1x480x640xbf16>
-  }
-}
diff --git a/mlir_tests/MLPMixer.mlir b/mlir_tests/MLPMixer.mlir
deleted file mode 100644
index 0d2e329a..00000000
--- a/mlir_tests/MLPMixer.mlir
+++ /dev/null
@@ -1,2147 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x256x256xbf16>, %arg1: tensor<512xbf16>, %arg2: tensor<512xbf16>, %arg3: tensor<1024x256x1xbf16>, %arg4: tensor<1024xbf16>, %arg5: tensor<256x1024x1xbf16>, %arg6: tensor<256xbf16>, %arg7: tensor<512xbf16>, %arg8: tensor<512xbf16>, %arg9: tensor<512xbf16>, %arg10: tensor<512xbf16>, %arg11: tensor<1024x256x1xbf16>, %arg12: tensor<1024xbf16>, %arg13: tensor<256x1024x1xbf16>, %arg14: tensor<256xbf16>, %arg15: tensor<512xbf16>, %arg16: tensor<512xbf16>, %arg17: tensor<512xbf16>, %arg18: tensor<512xbf16>, %arg19: tensor<1024x256x1xbf16>, %arg20: tensor<1024xbf16>, %arg21: tensor<256x1024x1xbf16>, %arg22: tensor<256xbf16>, %arg23: tensor<512xbf16>, %arg24: tensor<512xbf16>, %arg25: tensor<512xbf16>, %arg26: tensor<512xbf16>, %arg27: tensor<1024x256x1xbf16>, %arg28: tensor<1024xbf16>, %arg29: tensor<256x1024x1xbf16>, %arg30: tensor<256xbf16>, %arg31: tensor<512xbf16>, %arg32: tensor<512xbf16>, %arg33: tensor<512xbf16>, %arg34: tensor<512xbf16>, %arg35: tensor<1024x256x1xbf16>, %arg36: tensor<1024xbf16>, %arg37: tensor<256x1024x1xbf16>, %arg38: tensor<256xbf16>, %arg39: tensor<512xbf16>, %arg40: tensor<512xbf16>, %arg41: tensor<512xbf16>, %arg42: tensor<512xbf16>, %arg43: tensor<1024x256x1xbf16>, %arg44: tensor<1024xbf16>, %arg45: tensor<256x1024x1xbf16>, %arg46: tensor<256xbf16>, %arg47: tensor<512xbf16>, %arg48: tensor<512xbf16>, %arg49: tensor<512xbf16>, %arg50: tensor<512xbf16>, %arg51: tensor<1024x256x1xbf16>, %arg52: tensor<1024xbf16>, %arg53: tensor<256x1024x1xbf16>, %arg54: tensor<256xbf16>, %arg55: tensor<512xbf16>, %arg56: tensor<512xbf16>, %arg57: tensor<512xbf16>, %arg58: tensor<512xbf16>, %arg59: tensor<1024x256x1xbf16>, %arg60: tensor<1024xbf16>, %arg61: tensor<256x1024x1xbf16>, %arg62: tensor<256xbf16>, %arg63: tensor<512xbf16>, %arg64: tensor<512xbf16>, %arg65: tensor<512xbf16>, %arg66: tensor<512xbf16>, %arg67: tensor<1024x256x1xbf16>, %arg68: tensor<1024xbf16>, %arg69: tensor<256x1024x1xbf16>, %arg70: tensor<256xbf16>, %arg71: tensor<512xbf16>, %arg72: tensor<512xbf16>, %arg73: tensor<512xbf16>, %arg74: tensor<512xbf16>, %arg75: tensor<1024x256x1xbf16>, %arg76: tensor<1024xbf16>, %arg77: tensor<256x1024x1xbf16>, %arg78: tensor<256xbf16>, %arg79: tensor<512xbf16>, %arg80: tensor<512xbf16>, %arg81: tensor<512xbf16>, %arg82: tensor<512xbf16>, %arg83: tensor<1024x256x1xbf16>, %arg84: tensor<1024xbf16>, %arg85: tensor<256x1024x1xbf16>, %arg86: tensor<256xbf16>, %arg87: tensor<512xbf16>, %arg88: tensor<512xbf16>, %arg89: tensor<512xbf16>, %arg90: tensor<512xbf16>, %arg91: tensor<1024x256x1xbf16>, %arg92: tensor<1024xbf16>, %arg93: tensor<256x1024x1xbf16>, %arg94: tensor<256xbf16>, %arg95: tensor<512xbf16>, %arg96: tensor<512xbf16>, %arg97: tensor<512xbf16>, %arg98: tensor<512xbf16>, %arg99: tensor<768x512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512x256xf32>, %arg102: tensor<256xf32>, %arg103: tensor<256x512xf32>, %arg104: tensor<512xf32>, %arg105: tensor<512x256xf32>, %arg106: tensor<256xf32>, %arg107: tensor<256x512xf32>, %arg108: tensor<512xf32>, %arg109: tensor<512x256xf32>, %arg110: tensor<256xf32>, %arg111: tensor<256x512xf32>, %arg112: tensor<512xf32>, %arg113: tensor<512x256xf32>, %arg114: tensor<256xf32>, %arg115: tensor<256x512xf32>, %arg116: tensor<512xf32>, %arg117: tensor<512x256xf32>, %arg118: tensor<256xf32>, %arg119: tensor<256x512xf32>, %arg120: tensor<512xf32>, %arg121: tensor<512x256xf32>, %arg122: tensor<256xf32>, %arg123: tensor<256x512xf32>, %arg124: tensor<512xf32>, %arg125: tensor<512x256xf32>, %arg126: tensor<256xf32>, %arg127: tensor<256x512xf32>, %arg128: tensor<512xf32>, %arg129: tensor<512x256xf32>, %arg130: tensor<256xf32>, %arg131: tensor<256x512xf32>, %arg132: tensor<512xf32>, %arg133: tensor<512x256xf32>, %arg134: tensor<256xf32>, %arg135: tensor<256x512xf32>, %arg136: tensor<512xf32>, %arg137: tensor<512x256xf32>, %arg138: tensor<256xf32>, %arg139: tensor<256x512xf32>, %arg140: tensor<512xf32>, %arg141: tensor<512x256xf32>, %arg142: tensor<256xf32>, %arg143: tensor<256x512xf32>, %arg144: tensor<512xf32>, %arg145: tensor<512x256xf32>, %arg146: tensor<256xf32>, %arg147: tensor<256x512xf32>, %arg148: tensor<512xf32>, %arg149: tensor<512x1000xf32>, %arg150: tensor<1000xf32>) -> tensor<1x1000xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<1.000000e+00> : tensor<1x1024x512xbf16>
-    %cst_2 = stablehlo.constant dense<2.000000e+00> : tensor<1x1024x512xbf16>
-    %cst_3 = stablehlo.constant dense<5.000000e-01> : tensor<1x1024x512xbf16>
-    %cst_4 = stablehlo.constant dense<-4.000000e+00> : tensor<1x1024x512xf32>
-    %cst_5 = stablehlo.constant dense<4.000000e+00> : tensor<1x1024x512xf32>
-    %cst_6 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x1024x512xf32>
-    %cst_7 = stablehlo.constant dense<2.77068146E-8> : tensor<1x1024x512xf32>
-    %cst_8 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x1024x512xf32>
-    %cst_9 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x1024x512xf32>
-    %cst_10 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x1024x512xf32>
-    %cst_11 = stablehlo.constant dense<-2.954600e-03> : tensor<1x1024x512xf32>
-    %cst_12 = stablehlo.constant dense<-0.0160960332> : tensor<1x1024x512xf32>
-    %cst_13 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x1024x512xf32>
-    %cst_14 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x1024x512xf32>
-    %cst_15 = stablehlo.constant dense<-0.00168282702> : tensor<1x1024x512xf32>
-    %cst_16 = stablehlo.constant dense<-0.00737332925> : tensor<1x1024x512xf32>
-    %cst_17 = stablehlo.constant dense<-0.0142647391> : tensor<1x1024x512xf32>
-    %cst_18 = stablehlo.constant dense<-1.000000e+00> : tensor<1x1024x512xf32>
-    %cst_19 = stablehlo.constant dense<1.000000e+00> : tensor<1x1024x512xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x256x256xbf16>
-    %cst_21 = stablehlo.constant dense<2.000000e+00> : tensor<1x256x256xbf16>
-    %cst_22 = stablehlo.constant dense<5.000000e-01> : tensor<1x256x256xbf16>
-    %cst_23 = stablehlo.constant dense<-4.000000e+00> : tensor<1x256x256xf32>
-    %cst_24 = stablehlo.constant dense<4.000000e+00> : tensor<1x256x256xf32>
-    %cst_25 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x256x256xf32>
-    %cst_26 = stablehlo.constant dense<2.77068146E-8> : tensor<1x256x256xf32>
-    %cst_27 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x256x256xf32>
-    %cst_28 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x256x256xf32>
-    %cst_29 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x256x256xf32>
-    %cst_30 = stablehlo.constant dense<-2.954600e-03> : tensor<1x256x256xf32>
-    %cst_31 = stablehlo.constant dense<-0.0160960332> : tensor<1x256x256xf32>
-    %cst_32 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x256x256xf32>
-    %cst_33 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x256x256xf32>
-    %cst_34 = stablehlo.constant dense<-0.00168282702> : tensor<1x256x256xf32>
-    %cst_35 = stablehlo.constant dense<-0.00737332925> : tensor<1x256x256xf32>
-    %cst_36 = stablehlo.constant dense<-0.0142647391> : tensor<1x256x256xf32>
-    %cst_37 = stablehlo.constant dense<-1.000000e+00> : tensor<1x256x256xf32>
-    %cst_38 = stablehlo.constant dense<1.000000e+00> : tensor<1x256x256xf32>
-    %cst_39 = stablehlo.constant dense<0.000000e+00> : tensor<bf16>
-    %cst_40 = arith.constant dense<1> : tensor<1xi64>
-    %cst_41 = arith.constant dense<512> : tensor<1xi64>
-    %cst_42 = arith.constant dense<1.000000e-05> : tensor<1xf64>
-    %cst_43 = arith.constant dense<256> : tensor<1xi64>
-    %0 = stablehlo.reshape %arg0 : (tensor<1x3x256x256xbf16>) -> tensor<1x3x16x16x16x16xbf16>
-    %1 = stablehlo.transpose %0, dims = [0, 2, 4, 3, 5, 1] : (tensor<1x3x16x16x16x16xbf16>) -> tensor<1x16x16x16x16x3xbf16>
-    %2 = stablehlo.reshape %1 : (tensor<1x16x16x16x16x3xbf16>) -> tensor<1x256x768xbf16>
-    %3 = stablehlo.reshape %2 : (tensor<1x256x768xbf16>) -> tensor<256x768xbf16>
-    %4 = stablehlo.convert %3 : (tensor<256x768xbf16>) -> tensor<256x768xf32>
-    %5 = stablehlo.dot_general %4, %arg99, contracting_dims = [1] x [0] : (tensor<256x768xf32>, tensor<768x512xf32>) -> tensor<256x512xf32>
-    %6 = stablehlo.convert %cst_40 : (tensor<1xi64>) -> tensor<1xf32>
-    %7 = stablehlo.reshape %6 : (tensor<1xf32>) -> tensor<f32>
-    %8 = stablehlo.broadcast_in_dim %5, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %9 = stablehlo.broadcast_in_dim %7, dims = [] : (tensor<f32>) -> tensor<256x512xf32>
-    %10 = stablehlo.multiply %8, %9 : tensor<256x512xf32>
-    %11 = stablehlo.broadcast_in_dim %10, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %12 = stablehlo.broadcast_in_dim %arg100, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %13 = stablehlo.add %11, %12 : tensor<256x512xf32>
-    %14 = stablehlo.convert %13 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %15 = stablehlo.reshape %14 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %16 = stablehlo.convert %15 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %17 = stablehlo.convert %16 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %18 = stablehlo.reduce(%17 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %19 = stablehlo.reshape %18 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %20 = stablehlo.convert %cst_41 : (tensor<1xi64>) -> tensor<1xf64>
-    %21 = stablehlo.reshape %20 : (tensor<1xf64>) -> tensor<f64>
-    %22 = stablehlo.broadcast_in_dim %19, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %23 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<f64>) -> tensor<1x256x1xf64>
-    %24 = stablehlo.divide %22, %23 : tensor<1x256x1xf64>
-    %25 = stablehlo.broadcast_in_dim %17, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %26 = stablehlo.broadcast_in_dim %24, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %27 = stablehlo.subtract %25, %26 : tensor<1x256x512xf64>
-    %28 = stablehlo.multiply %27, %27 : tensor<1x256x512xf64>
-    %29 = stablehlo.reduce(%28 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %30 = stablehlo.reshape %29 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %31 = stablehlo.broadcast_in_dim %30, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %32 = stablehlo.divide %31, %23 : tensor<1x256x1xf64>
-    %33 = stablehlo.convert %32 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %34 = stablehlo.reduce(%16 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %35 = stablehlo.reshape %34 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %36 = stablehlo.convert %cst_41 : (tensor<1xi64>) -> tensor<1xf32>
-    %37 = stablehlo.reshape %36 : (tensor<1xf32>) -> tensor<f32>
-    %38 = stablehlo.broadcast_in_dim %35, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %39 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %40 = stablehlo.divide %38, %39 : tensor<1x256x1xf32>
-    %41 = stablehlo.convert %cst_42 : (tensor<1xf64>) -> tensor<1xf32>
-    %42 = stablehlo.reshape %41 : (tensor<1xf32>) -> tensor<f32>
-    %43 = stablehlo.broadcast_in_dim %33, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %44 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %45 = stablehlo.add %43, %44 : tensor<1x256x1xf32>
-    %46 = stablehlo.rsqrt %45 : tensor<1x256x1xf32>
-    %47 = stablehlo.broadcast_in_dim %16, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %48 = stablehlo.broadcast_in_dim %40, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %49 = stablehlo.subtract %47, %48 : tensor<1x256x512xf32>
-    %50 = stablehlo.broadcast_in_dim %49, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %51 = stablehlo.broadcast_in_dim %46, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %52 = stablehlo.multiply %50, %51 : tensor<1x256x512xf32>
-    %53 = stablehlo.convert %arg1 : (tensor<512xbf16>) -> tensor<512xf32>
-    %54 = stablehlo.broadcast_in_dim %52, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %55 = stablehlo.broadcast_in_dim %53, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %56 = stablehlo.multiply %54, %55 : tensor<1x256x512xf32>
-    %57 = stablehlo.convert %arg2 : (tensor<512xbf16>) -> tensor<512xf32>
-    %58 = stablehlo.broadcast_in_dim %56, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %59 = stablehlo.broadcast_in_dim %57, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %60 = stablehlo.add %58, %59 : tensor<1x256x512xf32>
-    %61 = stablehlo.convert %60 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %62 = stablehlo.convolution(%61, %arg3) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %63 = stablehlo.reshape %arg4 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %64 = stablehlo.broadcast_in_dim %62, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %65 = stablehlo.broadcast_in_dim %63, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %66 = stablehlo.add %64, %65 : tensor<1x1024x512xbf16>
-    %67 = stablehlo.multiply %66, %cst_3 : tensor<1x1024x512xbf16>
-    %68 = stablehlo.rsqrt %cst_2 : tensor<1x1024x512xbf16>
-    %69 = stablehlo.multiply %66, %68 : tensor<1x1024x512xbf16>
-    %70 = stablehlo.convert %69 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %71 = stablehlo.clamp %cst_4, %70, %cst_5 : tensor<1x1024x512xf32>
-    %72 = stablehlo.multiply %71, %71 : tensor<1x1024x512xf32>
-    %73 = stablehlo.multiply %cst_6, %72 : tensor<1x1024x512xf32>
-    %74 = stablehlo.add %73, %cst_7 : tensor<1x1024x512xf32>
-    %75 = stablehlo.multiply %74, %72 : tensor<1x1024x512xf32>
-    %76 = stablehlo.add %75, %cst_8 : tensor<1x1024x512xf32>
-    %77 = stablehlo.multiply %76, %72 : tensor<1x1024x512xf32>
-    %78 = stablehlo.add %77, %cst_9 : tensor<1x1024x512xf32>
-    %79 = stablehlo.multiply %78, %72 : tensor<1x1024x512xf32>
-    %80 = stablehlo.add %79, %cst_10 : tensor<1x1024x512xf32>
-    %81 = stablehlo.multiply %80, %72 : tensor<1x1024x512xf32>
-    %82 = stablehlo.add %81, %cst_11 : tensor<1x1024x512xf32>
-    %83 = stablehlo.multiply %82, %72 : tensor<1x1024x512xf32>
-    %84 = stablehlo.add %83, %cst_12 : tensor<1x1024x512xf32>
-    %85 = stablehlo.multiply %cst_13, %72 : tensor<1x1024x512xf32>
-    %86 = stablehlo.add %85, %cst_14 : tensor<1x1024x512xf32>
-    %87 = stablehlo.multiply %86, %72 : tensor<1x1024x512xf32>
-    %88 = stablehlo.add %87, %cst_15 : tensor<1x1024x512xf32>
-    %89 = stablehlo.multiply %88, %72 : tensor<1x1024x512xf32>
-    %90 = stablehlo.add %89, %cst_16 : tensor<1x1024x512xf32>
-    %91 = stablehlo.multiply %90, %72 : tensor<1x1024x512xf32>
-    %92 = stablehlo.add %91, %cst_17 : tensor<1x1024x512xf32>
-    %93 = stablehlo.multiply %71, %84 : tensor<1x1024x512xf32>
-    %94 = stablehlo.divide %93, %92 : tensor<1x1024x512xf32>
-    %95 = stablehlo.clamp %cst_18, %94, %cst_19 : tensor<1x1024x512xf32>
-    %96 = stablehlo.convert %95 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %97 = stablehlo.add %96, %cst_1 : tensor<1x1024x512xbf16>
-    %98 = stablehlo.multiply %97, %67 : tensor<1x1024x512xbf16>
-    %99 = stablehlo.convolution(%98, %arg5) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %100 = stablehlo.reshape %arg6 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %101 = stablehlo.broadcast_in_dim %99, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %102 = stablehlo.broadcast_in_dim %100, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %103 = stablehlo.add %101, %102 : tensor<1x256x512xbf16>
-    %104 = stablehlo.add %103, %15 : tensor<1x256x512xbf16>
-    %105 = stablehlo.convert %104 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %106 = stablehlo.convert %105 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %107 = stablehlo.reduce(%106 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %108 = stablehlo.reshape %107 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %109 = stablehlo.broadcast_in_dim %108, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %110 = stablehlo.divide %109, %23 : tensor<1x256x1xf64>
-    %111 = stablehlo.broadcast_in_dim %106, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %112 = stablehlo.broadcast_in_dim %110, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %113 = stablehlo.subtract %111, %112 : tensor<1x256x512xf64>
-    %114 = stablehlo.multiply %113, %113 : tensor<1x256x512xf64>
-    %115 = stablehlo.reduce(%114 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %116 = stablehlo.reshape %115 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %117 = stablehlo.broadcast_in_dim %116, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %118 = stablehlo.divide %117, %23 : tensor<1x256x1xf64>
-    %119 = stablehlo.convert %118 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %120 = stablehlo.reduce(%105 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %121 = stablehlo.reshape %120 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %122 = stablehlo.broadcast_in_dim %121, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %123 = stablehlo.divide %122, %39 : tensor<1x256x1xf32>
-    %124 = stablehlo.broadcast_in_dim %119, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %125 = stablehlo.add %124, %44 : tensor<1x256x1xf32>
-    %126 = stablehlo.rsqrt %125 : tensor<1x256x1xf32>
-    %127 = stablehlo.broadcast_in_dim %105, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %128 = stablehlo.broadcast_in_dim %123, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %129 = stablehlo.subtract %127, %128 : tensor<1x256x512xf32>
-    %130 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %131 = stablehlo.broadcast_in_dim %126, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %132 = stablehlo.multiply %130, %131 : tensor<1x256x512xf32>
-    %133 = stablehlo.convert %arg7 : (tensor<512xbf16>) -> tensor<512xf32>
-    %134 = stablehlo.broadcast_in_dim %132, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %135 = stablehlo.broadcast_in_dim %133, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %136 = stablehlo.multiply %134, %135 : tensor<1x256x512xf32>
-    %137 = stablehlo.convert %arg8 : (tensor<512xbf16>) -> tensor<512xf32>
-    %138 = stablehlo.broadcast_in_dim %136, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %139 = stablehlo.broadcast_in_dim %137, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %140 = stablehlo.add %138, %139 : tensor<1x256x512xf32>
-    %141 = stablehlo.convert %140 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %142 = stablehlo.reshape %141 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %143 = stablehlo.convert %142 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %144 = stablehlo.dot_general %143, %arg101, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %145 = stablehlo.broadcast_in_dim %144, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %146 = stablehlo.broadcast_in_dim %7, dims = [] : (tensor<f32>) -> tensor<256x256xf32>
-    %147 = stablehlo.multiply %145, %146 : tensor<256x256xf32>
-    %148 = stablehlo.broadcast_in_dim %147, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %149 = stablehlo.broadcast_in_dim %arg102, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %150 = stablehlo.add %148, %149 : tensor<256x256xf32>
-    %151 = stablehlo.convert %150 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %152 = stablehlo.reshape %151 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %153 = stablehlo.multiply %152, %cst_22 : tensor<1x256x256xbf16>
-    %154 = stablehlo.rsqrt %cst_21 : tensor<1x256x256xbf16>
-    %155 = stablehlo.multiply %152, %154 : tensor<1x256x256xbf16>
-    %156 = stablehlo.convert %155 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %157 = stablehlo.clamp %cst_23, %156, %cst_24 : tensor<1x256x256xf32>
-    %158 = stablehlo.multiply %157, %157 : tensor<1x256x256xf32>
-    %159 = stablehlo.multiply %cst_25, %158 : tensor<1x256x256xf32>
-    %160 = stablehlo.add %159, %cst_26 : tensor<1x256x256xf32>
-    %161 = stablehlo.multiply %160, %158 : tensor<1x256x256xf32>
-    %162 = stablehlo.add %161, %cst_27 : tensor<1x256x256xf32>
-    %163 = stablehlo.multiply %162, %158 : tensor<1x256x256xf32>
-    %164 = stablehlo.add %163, %cst_28 : tensor<1x256x256xf32>
-    %165 = stablehlo.multiply %164, %158 : tensor<1x256x256xf32>
-    %166 = stablehlo.add %165, %cst_29 : tensor<1x256x256xf32>
-    %167 = stablehlo.multiply %166, %158 : tensor<1x256x256xf32>
-    %168 = stablehlo.add %167, %cst_30 : tensor<1x256x256xf32>
-    %169 = stablehlo.multiply %168, %158 : tensor<1x256x256xf32>
-    %170 = stablehlo.add %169, %cst_31 : tensor<1x256x256xf32>
-    %171 = stablehlo.multiply %cst_32, %158 : tensor<1x256x256xf32>
-    %172 = stablehlo.add %171, %cst_33 : tensor<1x256x256xf32>
-    %173 = stablehlo.multiply %172, %158 : tensor<1x256x256xf32>
-    %174 = stablehlo.add %173, %cst_34 : tensor<1x256x256xf32>
-    %175 = stablehlo.multiply %174, %158 : tensor<1x256x256xf32>
-    %176 = stablehlo.add %175, %cst_35 : tensor<1x256x256xf32>
-    %177 = stablehlo.multiply %176, %158 : tensor<1x256x256xf32>
-    %178 = stablehlo.add %177, %cst_36 : tensor<1x256x256xf32>
-    %179 = stablehlo.multiply %157, %170 : tensor<1x256x256xf32>
-    %180 = stablehlo.divide %179, %178 : tensor<1x256x256xf32>
-    %181 = stablehlo.clamp %cst_37, %180, %cst_38 : tensor<1x256x256xf32>
-    %182 = stablehlo.convert %181 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %183 = stablehlo.add %182, %cst_20 : tensor<1x256x256xbf16>
-    %184 = stablehlo.multiply %183, %153 : tensor<1x256x256xbf16>
-    %185 = stablehlo.reshape %184 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %186 = stablehlo.convert %185 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %187 = stablehlo.dot_general %186, %arg103, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %188 = stablehlo.broadcast_in_dim %187, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %189 = stablehlo.multiply %188, %9 : tensor<256x512xf32>
-    %190 = stablehlo.broadcast_in_dim %189, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %191 = stablehlo.broadcast_in_dim %arg104, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %192 = stablehlo.add %190, %191 : tensor<256x512xf32>
-    %193 = stablehlo.convert %192 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %194 = stablehlo.reshape %193 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %195 = stablehlo.add %194, %104 : tensor<1x256x512xbf16>
-    %196 = stablehlo.convert %195 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %197 = stablehlo.convert %196 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %198 = stablehlo.reduce(%197 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %199 = stablehlo.reshape %198 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %200 = stablehlo.broadcast_in_dim %199, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %201 = stablehlo.divide %200, %23 : tensor<1x256x1xf64>
-    %202 = stablehlo.broadcast_in_dim %197, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %203 = stablehlo.broadcast_in_dim %201, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %204 = stablehlo.subtract %202, %203 : tensor<1x256x512xf64>
-    %205 = stablehlo.multiply %204, %204 : tensor<1x256x512xf64>
-    %206 = stablehlo.reduce(%205 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %207 = stablehlo.reshape %206 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %208 = stablehlo.broadcast_in_dim %207, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %209 = stablehlo.divide %208, %23 : tensor<1x256x1xf64>
-    %210 = stablehlo.convert %209 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %211 = stablehlo.reduce(%196 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %212 = stablehlo.reshape %211 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %213 = stablehlo.broadcast_in_dim %212, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %214 = stablehlo.divide %213, %39 : tensor<1x256x1xf32>
-    %215 = stablehlo.broadcast_in_dim %210, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %216 = stablehlo.add %215, %44 : tensor<1x256x1xf32>
-    %217 = stablehlo.rsqrt %216 : tensor<1x256x1xf32>
-    %218 = stablehlo.broadcast_in_dim %196, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %219 = stablehlo.broadcast_in_dim %214, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %220 = stablehlo.subtract %218, %219 : tensor<1x256x512xf32>
-    %221 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %222 = stablehlo.broadcast_in_dim %217, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %223 = stablehlo.multiply %221, %222 : tensor<1x256x512xf32>
-    %224 = stablehlo.convert %arg9 : (tensor<512xbf16>) -> tensor<512xf32>
-    %225 = stablehlo.broadcast_in_dim %223, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %226 = stablehlo.broadcast_in_dim %224, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %227 = stablehlo.multiply %225, %226 : tensor<1x256x512xf32>
-    %228 = stablehlo.convert %arg10 : (tensor<512xbf16>) -> tensor<512xf32>
-    %229 = stablehlo.broadcast_in_dim %227, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %230 = stablehlo.broadcast_in_dim %228, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %231 = stablehlo.add %229, %230 : tensor<1x256x512xf32>
-    %232 = stablehlo.convert %231 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %233 = stablehlo.convolution(%232, %arg11) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %234 = stablehlo.reshape %arg12 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %235 = stablehlo.broadcast_in_dim %233, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %236 = stablehlo.broadcast_in_dim %234, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %237 = stablehlo.add %235, %236 : tensor<1x1024x512xbf16>
-    %238 = stablehlo.multiply %237, %cst_3 : tensor<1x1024x512xbf16>
-    %239 = stablehlo.multiply %237, %68 : tensor<1x1024x512xbf16>
-    %240 = stablehlo.convert %239 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %241 = stablehlo.clamp %cst_4, %240, %cst_5 : tensor<1x1024x512xf32>
-    %242 = stablehlo.multiply %241, %241 : tensor<1x1024x512xf32>
-    %243 = stablehlo.multiply %cst_6, %242 : tensor<1x1024x512xf32>
-    %244 = stablehlo.add %243, %cst_7 : tensor<1x1024x512xf32>
-    %245 = stablehlo.multiply %244, %242 : tensor<1x1024x512xf32>
-    %246 = stablehlo.add %245, %cst_8 : tensor<1x1024x512xf32>
-    %247 = stablehlo.multiply %246, %242 : tensor<1x1024x512xf32>
-    %248 = stablehlo.add %247, %cst_9 : tensor<1x1024x512xf32>
-    %249 = stablehlo.multiply %248, %242 : tensor<1x1024x512xf32>
-    %250 = stablehlo.add %249, %cst_10 : tensor<1x1024x512xf32>
-    %251 = stablehlo.multiply %250, %242 : tensor<1x1024x512xf32>
-    %252 = stablehlo.add %251, %cst_11 : tensor<1x1024x512xf32>
-    %253 = stablehlo.multiply %252, %242 : tensor<1x1024x512xf32>
-    %254 = stablehlo.add %253, %cst_12 : tensor<1x1024x512xf32>
-    %255 = stablehlo.multiply %cst_13, %242 : tensor<1x1024x512xf32>
-    %256 = stablehlo.add %255, %cst_14 : tensor<1x1024x512xf32>
-    %257 = stablehlo.multiply %256, %242 : tensor<1x1024x512xf32>
-    %258 = stablehlo.add %257, %cst_15 : tensor<1x1024x512xf32>
-    %259 = stablehlo.multiply %258, %242 : tensor<1x1024x512xf32>
-    %260 = stablehlo.add %259, %cst_16 : tensor<1x1024x512xf32>
-    %261 = stablehlo.multiply %260, %242 : tensor<1x1024x512xf32>
-    %262 = stablehlo.add %261, %cst_17 : tensor<1x1024x512xf32>
-    %263 = stablehlo.multiply %241, %254 : tensor<1x1024x512xf32>
-    %264 = stablehlo.divide %263, %262 : tensor<1x1024x512xf32>
-    %265 = stablehlo.clamp %cst_18, %264, %cst_19 : tensor<1x1024x512xf32>
-    %266 = stablehlo.convert %265 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %267 = stablehlo.add %266, %cst_1 : tensor<1x1024x512xbf16>
-    %268 = stablehlo.multiply %267, %238 : tensor<1x1024x512xbf16>
-    %269 = stablehlo.convolution(%268, %arg13) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %270 = stablehlo.reshape %arg14 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %271 = stablehlo.broadcast_in_dim %269, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %272 = stablehlo.broadcast_in_dim %270, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %273 = stablehlo.add %271, %272 : tensor<1x256x512xbf16>
-    %274 = stablehlo.add %273, %195 : tensor<1x256x512xbf16>
-    %275 = stablehlo.convert %274 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %276 = stablehlo.convert %275 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %277 = stablehlo.reduce(%276 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %278 = stablehlo.reshape %277 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %279 = stablehlo.broadcast_in_dim %278, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %280 = stablehlo.divide %279, %23 : tensor<1x256x1xf64>
-    %281 = stablehlo.broadcast_in_dim %276, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %282 = stablehlo.broadcast_in_dim %280, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %283 = stablehlo.subtract %281, %282 : tensor<1x256x512xf64>
-    %284 = stablehlo.multiply %283, %283 : tensor<1x256x512xf64>
-    %285 = stablehlo.reduce(%284 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %286 = stablehlo.reshape %285 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %287 = stablehlo.broadcast_in_dim %286, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %288 = stablehlo.divide %287, %23 : tensor<1x256x1xf64>
-    %289 = stablehlo.convert %288 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %290 = stablehlo.reduce(%275 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %291 = stablehlo.reshape %290 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %292 = stablehlo.broadcast_in_dim %291, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %293 = stablehlo.divide %292, %39 : tensor<1x256x1xf32>
-    %294 = stablehlo.broadcast_in_dim %289, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %295 = stablehlo.add %294, %44 : tensor<1x256x1xf32>
-    %296 = stablehlo.rsqrt %295 : tensor<1x256x1xf32>
-    %297 = stablehlo.broadcast_in_dim %275, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %298 = stablehlo.broadcast_in_dim %293, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %299 = stablehlo.subtract %297, %298 : tensor<1x256x512xf32>
-    %300 = stablehlo.broadcast_in_dim %299, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %301 = stablehlo.broadcast_in_dim %296, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %302 = stablehlo.multiply %300, %301 : tensor<1x256x512xf32>
-    %303 = stablehlo.convert %arg15 : (tensor<512xbf16>) -> tensor<512xf32>
-    %304 = stablehlo.broadcast_in_dim %302, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %305 = stablehlo.broadcast_in_dim %303, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %306 = stablehlo.multiply %304, %305 : tensor<1x256x512xf32>
-    %307 = stablehlo.convert %arg16 : (tensor<512xbf16>) -> tensor<512xf32>
-    %308 = stablehlo.broadcast_in_dim %306, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %309 = stablehlo.broadcast_in_dim %307, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %310 = stablehlo.add %308, %309 : tensor<1x256x512xf32>
-    %311 = stablehlo.convert %310 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %312 = stablehlo.reshape %311 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %313 = stablehlo.convert %312 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %314 = stablehlo.dot_general %313, %arg105, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %315 = stablehlo.broadcast_in_dim %314, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %316 = stablehlo.multiply %315, %146 : tensor<256x256xf32>
-    %317 = stablehlo.broadcast_in_dim %316, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %318 = stablehlo.broadcast_in_dim %arg106, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %319 = stablehlo.add %317, %318 : tensor<256x256xf32>
-    %320 = stablehlo.convert %319 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %321 = stablehlo.reshape %320 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %322 = stablehlo.multiply %321, %cst_22 : tensor<1x256x256xbf16>
-    %323 = stablehlo.multiply %321, %154 : tensor<1x256x256xbf16>
-    %324 = stablehlo.convert %323 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %325 = stablehlo.clamp %cst_23, %324, %cst_24 : tensor<1x256x256xf32>
-    %326 = stablehlo.multiply %325, %325 : tensor<1x256x256xf32>
-    %327 = stablehlo.multiply %cst_25, %326 : tensor<1x256x256xf32>
-    %328 = stablehlo.add %327, %cst_26 : tensor<1x256x256xf32>
-    %329 = stablehlo.multiply %328, %326 : tensor<1x256x256xf32>
-    %330 = stablehlo.add %329, %cst_27 : tensor<1x256x256xf32>
-    %331 = stablehlo.multiply %330, %326 : tensor<1x256x256xf32>
-    %332 = stablehlo.add %331, %cst_28 : tensor<1x256x256xf32>
-    %333 = stablehlo.multiply %332, %326 : tensor<1x256x256xf32>
-    %334 = stablehlo.add %333, %cst_29 : tensor<1x256x256xf32>
-    %335 = stablehlo.multiply %334, %326 : tensor<1x256x256xf32>
-    %336 = stablehlo.add %335, %cst_30 : tensor<1x256x256xf32>
-    %337 = stablehlo.multiply %336, %326 : tensor<1x256x256xf32>
-    %338 = stablehlo.add %337, %cst_31 : tensor<1x256x256xf32>
-    %339 = stablehlo.multiply %cst_32, %326 : tensor<1x256x256xf32>
-    %340 = stablehlo.add %339, %cst_33 : tensor<1x256x256xf32>
-    %341 = stablehlo.multiply %340, %326 : tensor<1x256x256xf32>
-    %342 = stablehlo.add %341, %cst_34 : tensor<1x256x256xf32>
-    %343 = stablehlo.multiply %342, %326 : tensor<1x256x256xf32>
-    %344 = stablehlo.add %343, %cst_35 : tensor<1x256x256xf32>
-    %345 = stablehlo.multiply %344, %326 : tensor<1x256x256xf32>
-    %346 = stablehlo.add %345, %cst_36 : tensor<1x256x256xf32>
-    %347 = stablehlo.multiply %325, %338 : tensor<1x256x256xf32>
-    %348 = stablehlo.divide %347, %346 : tensor<1x256x256xf32>
-    %349 = stablehlo.clamp %cst_37, %348, %cst_38 : tensor<1x256x256xf32>
-    %350 = stablehlo.convert %349 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %351 = stablehlo.add %350, %cst_20 : tensor<1x256x256xbf16>
-    %352 = stablehlo.multiply %351, %322 : tensor<1x256x256xbf16>
-    %353 = stablehlo.reshape %352 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %354 = stablehlo.convert %353 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %355 = stablehlo.dot_general %354, %arg107, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %356 = stablehlo.broadcast_in_dim %355, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %357 = stablehlo.multiply %356, %9 : tensor<256x512xf32>
-    %358 = stablehlo.broadcast_in_dim %357, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %359 = stablehlo.broadcast_in_dim %arg108, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %360 = stablehlo.add %358, %359 : tensor<256x512xf32>
-    %361 = stablehlo.convert %360 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %362 = stablehlo.reshape %361 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %363 = stablehlo.add %362, %274 : tensor<1x256x512xbf16>
-    %364 = stablehlo.convert %363 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %365 = stablehlo.convert %364 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %366 = stablehlo.reduce(%365 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %367 = stablehlo.reshape %366 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %368 = stablehlo.broadcast_in_dim %367, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %369 = stablehlo.divide %368, %23 : tensor<1x256x1xf64>
-    %370 = stablehlo.broadcast_in_dim %365, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %371 = stablehlo.broadcast_in_dim %369, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %372 = stablehlo.subtract %370, %371 : tensor<1x256x512xf64>
-    %373 = stablehlo.multiply %372, %372 : tensor<1x256x512xf64>
-    %374 = stablehlo.reduce(%373 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %375 = stablehlo.reshape %374 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %376 = stablehlo.broadcast_in_dim %375, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %377 = stablehlo.divide %376, %23 : tensor<1x256x1xf64>
-    %378 = stablehlo.convert %377 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %379 = stablehlo.reduce(%364 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %380 = stablehlo.reshape %379 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %381 = stablehlo.broadcast_in_dim %380, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %382 = stablehlo.divide %381, %39 : tensor<1x256x1xf32>
-    %383 = stablehlo.broadcast_in_dim %378, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %384 = stablehlo.add %383, %44 : tensor<1x256x1xf32>
-    %385 = stablehlo.rsqrt %384 : tensor<1x256x1xf32>
-    %386 = stablehlo.broadcast_in_dim %364, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %387 = stablehlo.broadcast_in_dim %382, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %388 = stablehlo.subtract %386, %387 : tensor<1x256x512xf32>
-    %389 = stablehlo.broadcast_in_dim %388, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %390 = stablehlo.broadcast_in_dim %385, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %391 = stablehlo.multiply %389, %390 : tensor<1x256x512xf32>
-    %392 = stablehlo.convert %arg17 : (tensor<512xbf16>) -> tensor<512xf32>
-    %393 = stablehlo.broadcast_in_dim %391, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %394 = stablehlo.broadcast_in_dim %392, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %395 = stablehlo.multiply %393, %394 : tensor<1x256x512xf32>
-    %396 = stablehlo.convert %arg18 : (tensor<512xbf16>) -> tensor<512xf32>
-    %397 = stablehlo.broadcast_in_dim %395, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %398 = stablehlo.broadcast_in_dim %396, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %399 = stablehlo.add %397, %398 : tensor<1x256x512xf32>
-    %400 = stablehlo.convert %399 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %401 = stablehlo.convolution(%400, %arg19) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %402 = stablehlo.reshape %arg20 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %403 = stablehlo.broadcast_in_dim %401, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %404 = stablehlo.broadcast_in_dim %402, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %405 = stablehlo.add %403, %404 : tensor<1x1024x512xbf16>
-    %406 = stablehlo.multiply %405, %cst_3 : tensor<1x1024x512xbf16>
-    %407 = stablehlo.multiply %405, %68 : tensor<1x1024x512xbf16>
-    %408 = stablehlo.convert %407 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %409 = stablehlo.clamp %cst_4, %408, %cst_5 : tensor<1x1024x512xf32>
-    %410 = stablehlo.multiply %409, %409 : tensor<1x1024x512xf32>
-    %411 = stablehlo.multiply %cst_6, %410 : tensor<1x1024x512xf32>
-    %412 = stablehlo.add %411, %cst_7 : tensor<1x1024x512xf32>
-    %413 = stablehlo.multiply %412, %410 : tensor<1x1024x512xf32>
-    %414 = stablehlo.add %413, %cst_8 : tensor<1x1024x512xf32>
-    %415 = stablehlo.multiply %414, %410 : tensor<1x1024x512xf32>
-    %416 = stablehlo.add %415, %cst_9 : tensor<1x1024x512xf32>
-    %417 = stablehlo.multiply %416, %410 : tensor<1x1024x512xf32>
-    %418 = stablehlo.add %417, %cst_10 : tensor<1x1024x512xf32>
-    %419 = stablehlo.multiply %418, %410 : tensor<1x1024x512xf32>
-    %420 = stablehlo.add %419, %cst_11 : tensor<1x1024x512xf32>
-    %421 = stablehlo.multiply %420, %410 : tensor<1x1024x512xf32>
-    %422 = stablehlo.add %421, %cst_12 : tensor<1x1024x512xf32>
-    %423 = stablehlo.multiply %cst_13, %410 : tensor<1x1024x512xf32>
-    %424 = stablehlo.add %423, %cst_14 : tensor<1x1024x512xf32>
-    %425 = stablehlo.multiply %424, %410 : tensor<1x1024x512xf32>
-    %426 = stablehlo.add %425, %cst_15 : tensor<1x1024x512xf32>
-    %427 = stablehlo.multiply %426, %410 : tensor<1x1024x512xf32>
-    %428 = stablehlo.add %427, %cst_16 : tensor<1x1024x512xf32>
-    %429 = stablehlo.multiply %428, %410 : tensor<1x1024x512xf32>
-    %430 = stablehlo.add %429, %cst_17 : tensor<1x1024x512xf32>
-    %431 = stablehlo.multiply %409, %422 : tensor<1x1024x512xf32>
-    %432 = stablehlo.divide %431, %430 : tensor<1x1024x512xf32>
-    %433 = stablehlo.clamp %cst_18, %432, %cst_19 : tensor<1x1024x512xf32>
-    %434 = stablehlo.convert %433 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %435 = stablehlo.add %434, %cst_1 : tensor<1x1024x512xbf16>
-    %436 = stablehlo.multiply %435, %406 : tensor<1x1024x512xbf16>
-    %437 = stablehlo.convolution(%436, %arg21) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %438 = stablehlo.reshape %arg22 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %439 = stablehlo.broadcast_in_dim %437, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %440 = stablehlo.broadcast_in_dim %438, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %441 = stablehlo.add %439, %440 : tensor<1x256x512xbf16>
-    %442 = stablehlo.add %441, %363 : tensor<1x256x512xbf16>
-    %443 = stablehlo.convert %442 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %444 = stablehlo.convert %443 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %445 = stablehlo.reduce(%444 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %446 = stablehlo.reshape %445 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %447 = stablehlo.broadcast_in_dim %446, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %448 = stablehlo.divide %447, %23 : tensor<1x256x1xf64>
-    %449 = stablehlo.broadcast_in_dim %444, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %450 = stablehlo.broadcast_in_dim %448, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %451 = stablehlo.subtract %449, %450 : tensor<1x256x512xf64>
-    %452 = stablehlo.multiply %451, %451 : tensor<1x256x512xf64>
-    %453 = stablehlo.reduce(%452 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %454 = stablehlo.reshape %453 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %455 = stablehlo.broadcast_in_dim %454, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %456 = stablehlo.divide %455, %23 : tensor<1x256x1xf64>
-    %457 = stablehlo.convert %456 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %458 = stablehlo.reduce(%443 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %459 = stablehlo.reshape %458 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %460 = stablehlo.broadcast_in_dim %459, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %461 = stablehlo.divide %460, %39 : tensor<1x256x1xf32>
-    %462 = stablehlo.broadcast_in_dim %457, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %463 = stablehlo.add %462, %44 : tensor<1x256x1xf32>
-    %464 = stablehlo.rsqrt %463 : tensor<1x256x1xf32>
-    %465 = stablehlo.broadcast_in_dim %443, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %466 = stablehlo.broadcast_in_dim %461, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %467 = stablehlo.subtract %465, %466 : tensor<1x256x512xf32>
-    %468 = stablehlo.broadcast_in_dim %467, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %469 = stablehlo.broadcast_in_dim %464, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %470 = stablehlo.multiply %468, %469 : tensor<1x256x512xf32>
-    %471 = stablehlo.convert %arg23 : (tensor<512xbf16>) -> tensor<512xf32>
-    %472 = stablehlo.broadcast_in_dim %470, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %473 = stablehlo.broadcast_in_dim %471, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %474 = stablehlo.multiply %472, %473 : tensor<1x256x512xf32>
-    %475 = stablehlo.convert %arg24 : (tensor<512xbf16>) -> tensor<512xf32>
-    %476 = stablehlo.broadcast_in_dim %474, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %477 = stablehlo.broadcast_in_dim %475, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %478 = stablehlo.add %476, %477 : tensor<1x256x512xf32>
-    %479 = stablehlo.convert %478 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %480 = stablehlo.reshape %479 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %481 = stablehlo.convert %480 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %482 = stablehlo.dot_general %481, %arg109, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %483 = stablehlo.broadcast_in_dim %482, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %484 = stablehlo.multiply %483, %146 : tensor<256x256xf32>
-    %485 = stablehlo.broadcast_in_dim %484, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %486 = stablehlo.broadcast_in_dim %arg110, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %487 = stablehlo.add %485, %486 : tensor<256x256xf32>
-    %488 = stablehlo.convert %487 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %489 = stablehlo.reshape %488 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %490 = stablehlo.multiply %489, %cst_22 : tensor<1x256x256xbf16>
-    %491 = stablehlo.multiply %489, %154 : tensor<1x256x256xbf16>
-    %492 = stablehlo.convert %491 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %493 = stablehlo.clamp %cst_23, %492, %cst_24 : tensor<1x256x256xf32>
-    %494 = stablehlo.multiply %493, %493 : tensor<1x256x256xf32>
-    %495 = stablehlo.multiply %cst_25, %494 : tensor<1x256x256xf32>
-    %496 = stablehlo.add %495, %cst_26 : tensor<1x256x256xf32>
-    %497 = stablehlo.multiply %496, %494 : tensor<1x256x256xf32>
-    %498 = stablehlo.add %497, %cst_27 : tensor<1x256x256xf32>
-    %499 = stablehlo.multiply %498, %494 : tensor<1x256x256xf32>
-    %500 = stablehlo.add %499, %cst_28 : tensor<1x256x256xf32>
-    %501 = stablehlo.multiply %500, %494 : tensor<1x256x256xf32>
-    %502 = stablehlo.add %501, %cst_29 : tensor<1x256x256xf32>
-    %503 = stablehlo.multiply %502, %494 : tensor<1x256x256xf32>
-    %504 = stablehlo.add %503, %cst_30 : tensor<1x256x256xf32>
-    %505 = stablehlo.multiply %504, %494 : tensor<1x256x256xf32>
-    %506 = stablehlo.add %505, %cst_31 : tensor<1x256x256xf32>
-    %507 = stablehlo.multiply %cst_32, %494 : tensor<1x256x256xf32>
-    %508 = stablehlo.add %507, %cst_33 : tensor<1x256x256xf32>
-    %509 = stablehlo.multiply %508, %494 : tensor<1x256x256xf32>
-    %510 = stablehlo.add %509, %cst_34 : tensor<1x256x256xf32>
-    %511 = stablehlo.multiply %510, %494 : tensor<1x256x256xf32>
-    %512 = stablehlo.add %511, %cst_35 : tensor<1x256x256xf32>
-    %513 = stablehlo.multiply %512, %494 : tensor<1x256x256xf32>
-    %514 = stablehlo.add %513, %cst_36 : tensor<1x256x256xf32>
-    %515 = stablehlo.multiply %493, %506 : tensor<1x256x256xf32>
-    %516 = stablehlo.divide %515, %514 : tensor<1x256x256xf32>
-    %517 = stablehlo.clamp %cst_37, %516, %cst_38 : tensor<1x256x256xf32>
-    %518 = stablehlo.convert %517 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %519 = stablehlo.add %518, %cst_20 : tensor<1x256x256xbf16>
-    %520 = stablehlo.multiply %519, %490 : tensor<1x256x256xbf16>
-    %521 = stablehlo.reshape %520 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %522 = stablehlo.convert %521 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %523 = stablehlo.dot_general %522, %arg111, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %524 = stablehlo.broadcast_in_dim %523, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %525 = stablehlo.multiply %524, %9 : tensor<256x512xf32>
-    %526 = stablehlo.broadcast_in_dim %525, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %527 = stablehlo.broadcast_in_dim %arg112, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %528 = stablehlo.add %526, %527 : tensor<256x512xf32>
-    %529 = stablehlo.convert %528 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %530 = stablehlo.reshape %529 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %531 = stablehlo.add %530, %442 : tensor<1x256x512xbf16>
-    %532 = stablehlo.convert %531 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %533 = stablehlo.convert %532 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %534 = stablehlo.reduce(%533 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %535 = stablehlo.reshape %534 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %536 = stablehlo.broadcast_in_dim %535, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %537 = stablehlo.divide %536, %23 : tensor<1x256x1xf64>
-    %538 = stablehlo.broadcast_in_dim %533, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %539 = stablehlo.broadcast_in_dim %537, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %540 = stablehlo.subtract %538, %539 : tensor<1x256x512xf64>
-    %541 = stablehlo.multiply %540, %540 : tensor<1x256x512xf64>
-    %542 = stablehlo.reduce(%541 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %543 = stablehlo.reshape %542 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %544 = stablehlo.broadcast_in_dim %543, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %545 = stablehlo.divide %544, %23 : tensor<1x256x1xf64>
-    %546 = stablehlo.convert %545 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %547 = stablehlo.reduce(%532 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %548 = stablehlo.reshape %547 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %549 = stablehlo.broadcast_in_dim %548, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %550 = stablehlo.divide %549, %39 : tensor<1x256x1xf32>
-    %551 = stablehlo.broadcast_in_dim %546, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %552 = stablehlo.add %551, %44 : tensor<1x256x1xf32>
-    %553 = stablehlo.rsqrt %552 : tensor<1x256x1xf32>
-    %554 = stablehlo.broadcast_in_dim %532, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %555 = stablehlo.broadcast_in_dim %550, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %556 = stablehlo.subtract %554, %555 : tensor<1x256x512xf32>
-    %557 = stablehlo.broadcast_in_dim %556, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %558 = stablehlo.broadcast_in_dim %553, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %559 = stablehlo.multiply %557, %558 : tensor<1x256x512xf32>
-    %560 = stablehlo.convert %arg25 : (tensor<512xbf16>) -> tensor<512xf32>
-    %561 = stablehlo.broadcast_in_dim %559, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %562 = stablehlo.broadcast_in_dim %560, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %563 = stablehlo.multiply %561, %562 : tensor<1x256x512xf32>
-    %564 = stablehlo.convert %arg26 : (tensor<512xbf16>) -> tensor<512xf32>
-    %565 = stablehlo.broadcast_in_dim %563, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %566 = stablehlo.broadcast_in_dim %564, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %567 = stablehlo.add %565, %566 : tensor<1x256x512xf32>
-    %568 = stablehlo.convert %567 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %569 = stablehlo.convolution(%568, %arg27) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %570 = stablehlo.reshape %arg28 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %571 = stablehlo.broadcast_in_dim %569, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %572 = stablehlo.broadcast_in_dim %570, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %573 = stablehlo.add %571, %572 : tensor<1x1024x512xbf16>
-    %574 = stablehlo.multiply %573, %cst_3 : tensor<1x1024x512xbf16>
-    %575 = stablehlo.multiply %573, %68 : tensor<1x1024x512xbf16>
-    %576 = stablehlo.convert %575 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %577 = stablehlo.clamp %cst_4, %576, %cst_5 : tensor<1x1024x512xf32>
-    %578 = stablehlo.multiply %577, %577 : tensor<1x1024x512xf32>
-    %579 = stablehlo.multiply %cst_6, %578 : tensor<1x1024x512xf32>
-    %580 = stablehlo.add %579, %cst_7 : tensor<1x1024x512xf32>
-    %581 = stablehlo.multiply %580, %578 : tensor<1x1024x512xf32>
-    %582 = stablehlo.add %581, %cst_8 : tensor<1x1024x512xf32>
-    %583 = stablehlo.multiply %582, %578 : tensor<1x1024x512xf32>
-    %584 = stablehlo.add %583, %cst_9 : tensor<1x1024x512xf32>
-    %585 = stablehlo.multiply %584, %578 : tensor<1x1024x512xf32>
-    %586 = stablehlo.add %585, %cst_10 : tensor<1x1024x512xf32>
-    %587 = stablehlo.multiply %586, %578 : tensor<1x1024x512xf32>
-    %588 = stablehlo.add %587, %cst_11 : tensor<1x1024x512xf32>
-    %589 = stablehlo.multiply %588, %578 : tensor<1x1024x512xf32>
-    %590 = stablehlo.add %589, %cst_12 : tensor<1x1024x512xf32>
-    %591 = stablehlo.multiply %cst_13, %578 : tensor<1x1024x512xf32>
-    %592 = stablehlo.add %591, %cst_14 : tensor<1x1024x512xf32>
-    %593 = stablehlo.multiply %592, %578 : tensor<1x1024x512xf32>
-    %594 = stablehlo.add %593, %cst_15 : tensor<1x1024x512xf32>
-    %595 = stablehlo.multiply %594, %578 : tensor<1x1024x512xf32>
-    %596 = stablehlo.add %595, %cst_16 : tensor<1x1024x512xf32>
-    %597 = stablehlo.multiply %596, %578 : tensor<1x1024x512xf32>
-    %598 = stablehlo.add %597, %cst_17 : tensor<1x1024x512xf32>
-    %599 = stablehlo.multiply %577, %590 : tensor<1x1024x512xf32>
-    %600 = stablehlo.divide %599, %598 : tensor<1x1024x512xf32>
-    %601 = stablehlo.clamp %cst_18, %600, %cst_19 : tensor<1x1024x512xf32>
-    %602 = stablehlo.convert %601 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %603 = stablehlo.add %602, %cst_1 : tensor<1x1024x512xbf16>
-    %604 = stablehlo.multiply %603, %574 : tensor<1x1024x512xbf16>
-    %605 = stablehlo.convolution(%604, %arg29) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %606 = stablehlo.reshape %arg30 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %607 = stablehlo.broadcast_in_dim %605, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %608 = stablehlo.broadcast_in_dim %606, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %609 = stablehlo.add %607, %608 : tensor<1x256x512xbf16>
-    %610 = stablehlo.add %609, %531 : tensor<1x256x512xbf16>
-    %611 = stablehlo.convert %610 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %612 = stablehlo.convert %611 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %613 = stablehlo.reduce(%612 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %614 = stablehlo.reshape %613 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %615 = stablehlo.broadcast_in_dim %614, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %616 = stablehlo.divide %615, %23 : tensor<1x256x1xf64>
-    %617 = stablehlo.broadcast_in_dim %612, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %618 = stablehlo.broadcast_in_dim %616, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %619 = stablehlo.subtract %617, %618 : tensor<1x256x512xf64>
-    %620 = stablehlo.multiply %619, %619 : tensor<1x256x512xf64>
-    %621 = stablehlo.reduce(%620 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %622 = stablehlo.reshape %621 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %623 = stablehlo.broadcast_in_dim %622, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %624 = stablehlo.divide %623, %23 : tensor<1x256x1xf64>
-    %625 = stablehlo.convert %624 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %626 = stablehlo.reduce(%611 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %627 = stablehlo.reshape %626 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %628 = stablehlo.broadcast_in_dim %627, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %629 = stablehlo.divide %628, %39 : tensor<1x256x1xf32>
-    %630 = stablehlo.broadcast_in_dim %625, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %631 = stablehlo.add %630, %44 : tensor<1x256x1xf32>
-    %632 = stablehlo.rsqrt %631 : tensor<1x256x1xf32>
-    %633 = stablehlo.broadcast_in_dim %611, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %634 = stablehlo.broadcast_in_dim %629, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %635 = stablehlo.subtract %633, %634 : tensor<1x256x512xf32>
-    %636 = stablehlo.broadcast_in_dim %635, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %637 = stablehlo.broadcast_in_dim %632, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %638 = stablehlo.multiply %636, %637 : tensor<1x256x512xf32>
-    %639 = stablehlo.convert %arg31 : (tensor<512xbf16>) -> tensor<512xf32>
-    %640 = stablehlo.broadcast_in_dim %638, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %641 = stablehlo.broadcast_in_dim %639, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %642 = stablehlo.multiply %640, %641 : tensor<1x256x512xf32>
-    %643 = stablehlo.convert %arg32 : (tensor<512xbf16>) -> tensor<512xf32>
-    %644 = stablehlo.broadcast_in_dim %642, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %645 = stablehlo.broadcast_in_dim %643, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %646 = stablehlo.add %644, %645 : tensor<1x256x512xf32>
-    %647 = stablehlo.convert %646 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %648 = stablehlo.reshape %647 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %649 = stablehlo.convert %648 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %650 = stablehlo.dot_general %649, %arg113, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %651 = stablehlo.broadcast_in_dim %650, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %652 = stablehlo.multiply %651, %146 : tensor<256x256xf32>
-    %653 = stablehlo.broadcast_in_dim %652, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %654 = stablehlo.broadcast_in_dim %arg114, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %655 = stablehlo.add %653, %654 : tensor<256x256xf32>
-    %656 = stablehlo.convert %655 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %657 = stablehlo.reshape %656 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %658 = stablehlo.multiply %657, %cst_22 : tensor<1x256x256xbf16>
-    %659 = stablehlo.multiply %657, %154 : tensor<1x256x256xbf16>
-    %660 = stablehlo.convert %659 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %661 = stablehlo.clamp %cst_23, %660, %cst_24 : tensor<1x256x256xf32>
-    %662 = stablehlo.multiply %661, %661 : tensor<1x256x256xf32>
-    %663 = stablehlo.multiply %cst_25, %662 : tensor<1x256x256xf32>
-    %664 = stablehlo.add %663, %cst_26 : tensor<1x256x256xf32>
-    %665 = stablehlo.multiply %664, %662 : tensor<1x256x256xf32>
-    %666 = stablehlo.add %665, %cst_27 : tensor<1x256x256xf32>
-    %667 = stablehlo.multiply %666, %662 : tensor<1x256x256xf32>
-    %668 = stablehlo.add %667, %cst_28 : tensor<1x256x256xf32>
-    %669 = stablehlo.multiply %668, %662 : tensor<1x256x256xf32>
-    %670 = stablehlo.add %669, %cst_29 : tensor<1x256x256xf32>
-    %671 = stablehlo.multiply %670, %662 : tensor<1x256x256xf32>
-    %672 = stablehlo.add %671, %cst_30 : tensor<1x256x256xf32>
-    %673 = stablehlo.multiply %672, %662 : tensor<1x256x256xf32>
-    %674 = stablehlo.add %673, %cst_31 : tensor<1x256x256xf32>
-    %675 = stablehlo.multiply %cst_32, %662 : tensor<1x256x256xf32>
-    %676 = stablehlo.add %675, %cst_33 : tensor<1x256x256xf32>
-    %677 = stablehlo.multiply %676, %662 : tensor<1x256x256xf32>
-    %678 = stablehlo.add %677, %cst_34 : tensor<1x256x256xf32>
-    %679 = stablehlo.multiply %678, %662 : tensor<1x256x256xf32>
-    %680 = stablehlo.add %679, %cst_35 : tensor<1x256x256xf32>
-    %681 = stablehlo.multiply %680, %662 : tensor<1x256x256xf32>
-    %682 = stablehlo.add %681, %cst_36 : tensor<1x256x256xf32>
-    %683 = stablehlo.multiply %661, %674 : tensor<1x256x256xf32>
-    %684 = stablehlo.divide %683, %682 : tensor<1x256x256xf32>
-    %685 = stablehlo.clamp %cst_37, %684, %cst_38 : tensor<1x256x256xf32>
-    %686 = stablehlo.convert %685 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %687 = stablehlo.add %686, %cst_20 : tensor<1x256x256xbf16>
-    %688 = stablehlo.multiply %687, %658 : tensor<1x256x256xbf16>
-    %689 = stablehlo.reshape %688 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %690 = stablehlo.convert %689 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %691 = stablehlo.dot_general %690, %arg115, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %692 = stablehlo.broadcast_in_dim %691, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %693 = stablehlo.multiply %692, %9 : tensor<256x512xf32>
-    %694 = stablehlo.broadcast_in_dim %693, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %695 = stablehlo.broadcast_in_dim %arg116, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %696 = stablehlo.add %694, %695 : tensor<256x512xf32>
-    %697 = stablehlo.convert %696 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %698 = stablehlo.reshape %697 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %699 = stablehlo.add %698, %610 : tensor<1x256x512xbf16>
-    %700 = stablehlo.convert %699 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %701 = stablehlo.convert %700 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %702 = stablehlo.reduce(%701 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %703 = stablehlo.reshape %702 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %704 = stablehlo.broadcast_in_dim %703, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %705 = stablehlo.divide %704, %23 : tensor<1x256x1xf64>
-    %706 = stablehlo.broadcast_in_dim %701, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %707 = stablehlo.broadcast_in_dim %705, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %708 = stablehlo.subtract %706, %707 : tensor<1x256x512xf64>
-    %709 = stablehlo.multiply %708, %708 : tensor<1x256x512xf64>
-    %710 = stablehlo.reduce(%709 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %711 = stablehlo.reshape %710 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %712 = stablehlo.broadcast_in_dim %711, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %713 = stablehlo.divide %712, %23 : tensor<1x256x1xf64>
-    %714 = stablehlo.convert %713 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %715 = stablehlo.reduce(%700 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %716 = stablehlo.reshape %715 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %717 = stablehlo.broadcast_in_dim %716, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %718 = stablehlo.divide %717, %39 : tensor<1x256x1xf32>
-    %719 = stablehlo.broadcast_in_dim %714, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %720 = stablehlo.add %719, %44 : tensor<1x256x1xf32>
-    %721 = stablehlo.rsqrt %720 : tensor<1x256x1xf32>
-    %722 = stablehlo.broadcast_in_dim %700, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %723 = stablehlo.broadcast_in_dim %718, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %724 = stablehlo.subtract %722, %723 : tensor<1x256x512xf32>
-    %725 = stablehlo.broadcast_in_dim %724, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %726 = stablehlo.broadcast_in_dim %721, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %727 = stablehlo.multiply %725, %726 : tensor<1x256x512xf32>
-    %728 = stablehlo.convert %arg33 : (tensor<512xbf16>) -> tensor<512xf32>
-    %729 = stablehlo.broadcast_in_dim %727, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %730 = stablehlo.broadcast_in_dim %728, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %731 = stablehlo.multiply %729, %730 : tensor<1x256x512xf32>
-    %732 = stablehlo.convert %arg34 : (tensor<512xbf16>) -> tensor<512xf32>
-    %733 = stablehlo.broadcast_in_dim %731, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %734 = stablehlo.broadcast_in_dim %732, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %735 = stablehlo.add %733, %734 : tensor<1x256x512xf32>
-    %736 = stablehlo.convert %735 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %737 = stablehlo.convolution(%736, %arg35) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %738 = stablehlo.reshape %arg36 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %739 = stablehlo.broadcast_in_dim %737, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %740 = stablehlo.broadcast_in_dim %738, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %741 = stablehlo.add %739, %740 : tensor<1x1024x512xbf16>
-    %742 = stablehlo.multiply %741, %cst_3 : tensor<1x1024x512xbf16>
-    %743 = stablehlo.multiply %741, %68 : tensor<1x1024x512xbf16>
-    %744 = stablehlo.convert %743 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %745 = stablehlo.clamp %cst_4, %744, %cst_5 : tensor<1x1024x512xf32>
-    %746 = stablehlo.multiply %745, %745 : tensor<1x1024x512xf32>
-    %747 = stablehlo.multiply %cst_6, %746 : tensor<1x1024x512xf32>
-    %748 = stablehlo.add %747, %cst_7 : tensor<1x1024x512xf32>
-    %749 = stablehlo.multiply %748, %746 : tensor<1x1024x512xf32>
-    %750 = stablehlo.add %749, %cst_8 : tensor<1x1024x512xf32>
-    %751 = stablehlo.multiply %750, %746 : tensor<1x1024x512xf32>
-    %752 = stablehlo.add %751, %cst_9 : tensor<1x1024x512xf32>
-    %753 = stablehlo.multiply %752, %746 : tensor<1x1024x512xf32>
-    %754 = stablehlo.add %753, %cst_10 : tensor<1x1024x512xf32>
-    %755 = stablehlo.multiply %754, %746 : tensor<1x1024x512xf32>
-    %756 = stablehlo.add %755, %cst_11 : tensor<1x1024x512xf32>
-    %757 = stablehlo.multiply %756, %746 : tensor<1x1024x512xf32>
-    %758 = stablehlo.add %757, %cst_12 : tensor<1x1024x512xf32>
-    %759 = stablehlo.multiply %cst_13, %746 : tensor<1x1024x512xf32>
-    %760 = stablehlo.add %759, %cst_14 : tensor<1x1024x512xf32>
-    %761 = stablehlo.multiply %760, %746 : tensor<1x1024x512xf32>
-    %762 = stablehlo.add %761, %cst_15 : tensor<1x1024x512xf32>
-    %763 = stablehlo.multiply %762, %746 : tensor<1x1024x512xf32>
-    %764 = stablehlo.add %763, %cst_16 : tensor<1x1024x512xf32>
-    %765 = stablehlo.multiply %764, %746 : tensor<1x1024x512xf32>
-    %766 = stablehlo.add %765, %cst_17 : tensor<1x1024x512xf32>
-    %767 = stablehlo.multiply %745, %758 : tensor<1x1024x512xf32>
-    %768 = stablehlo.divide %767, %766 : tensor<1x1024x512xf32>
-    %769 = stablehlo.clamp %cst_18, %768, %cst_19 : tensor<1x1024x512xf32>
-    %770 = stablehlo.convert %769 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %771 = stablehlo.add %770, %cst_1 : tensor<1x1024x512xbf16>
-    %772 = stablehlo.multiply %771, %742 : tensor<1x1024x512xbf16>
-    %773 = stablehlo.convolution(%772, %arg37) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %774 = stablehlo.reshape %arg38 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %775 = stablehlo.broadcast_in_dim %773, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %776 = stablehlo.broadcast_in_dim %774, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %777 = stablehlo.add %775, %776 : tensor<1x256x512xbf16>
-    %778 = stablehlo.add %777, %699 : tensor<1x256x512xbf16>
-    %779 = stablehlo.convert %778 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %780 = stablehlo.convert %779 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %781 = stablehlo.reduce(%780 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %782 = stablehlo.reshape %781 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %783 = stablehlo.broadcast_in_dim %782, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %784 = stablehlo.divide %783, %23 : tensor<1x256x1xf64>
-    %785 = stablehlo.broadcast_in_dim %780, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %786 = stablehlo.broadcast_in_dim %784, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %787 = stablehlo.subtract %785, %786 : tensor<1x256x512xf64>
-    %788 = stablehlo.multiply %787, %787 : tensor<1x256x512xf64>
-    %789 = stablehlo.reduce(%788 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %790 = stablehlo.reshape %789 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %791 = stablehlo.broadcast_in_dim %790, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %792 = stablehlo.divide %791, %23 : tensor<1x256x1xf64>
-    %793 = stablehlo.convert %792 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %794 = stablehlo.reduce(%779 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %795 = stablehlo.reshape %794 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %796 = stablehlo.broadcast_in_dim %795, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %797 = stablehlo.divide %796, %39 : tensor<1x256x1xf32>
-    %798 = stablehlo.broadcast_in_dim %793, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %799 = stablehlo.add %798, %44 : tensor<1x256x1xf32>
-    %800 = stablehlo.rsqrt %799 : tensor<1x256x1xf32>
-    %801 = stablehlo.broadcast_in_dim %779, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %802 = stablehlo.broadcast_in_dim %797, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %803 = stablehlo.subtract %801, %802 : tensor<1x256x512xf32>
-    %804 = stablehlo.broadcast_in_dim %803, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %805 = stablehlo.broadcast_in_dim %800, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %806 = stablehlo.multiply %804, %805 : tensor<1x256x512xf32>
-    %807 = stablehlo.convert %arg39 : (tensor<512xbf16>) -> tensor<512xf32>
-    %808 = stablehlo.broadcast_in_dim %806, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %809 = stablehlo.broadcast_in_dim %807, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %810 = stablehlo.multiply %808, %809 : tensor<1x256x512xf32>
-    %811 = stablehlo.convert %arg40 : (tensor<512xbf16>) -> tensor<512xf32>
-    %812 = stablehlo.broadcast_in_dim %810, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %813 = stablehlo.broadcast_in_dim %811, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %814 = stablehlo.add %812, %813 : tensor<1x256x512xf32>
-    %815 = stablehlo.convert %814 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %816 = stablehlo.reshape %815 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %817 = stablehlo.convert %816 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %818 = stablehlo.dot_general %817, %arg117, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %819 = stablehlo.broadcast_in_dim %818, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %820 = stablehlo.multiply %819, %146 : tensor<256x256xf32>
-    %821 = stablehlo.broadcast_in_dim %820, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %822 = stablehlo.broadcast_in_dim %arg118, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %823 = stablehlo.add %821, %822 : tensor<256x256xf32>
-    %824 = stablehlo.convert %823 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %825 = stablehlo.reshape %824 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %826 = stablehlo.multiply %825, %cst_22 : tensor<1x256x256xbf16>
-    %827 = stablehlo.multiply %825, %154 : tensor<1x256x256xbf16>
-    %828 = stablehlo.convert %827 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %829 = stablehlo.clamp %cst_23, %828, %cst_24 : tensor<1x256x256xf32>
-    %830 = stablehlo.multiply %829, %829 : tensor<1x256x256xf32>
-    %831 = stablehlo.multiply %cst_25, %830 : tensor<1x256x256xf32>
-    %832 = stablehlo.add %831, %cst_26 : tensor<1x256x256xf32>
-    %833 = stablehlo.multiply %832, %830 : tensor<1x256x256xf32>
-    %834 = stablehlo.add %833, %cst_27 : tensor<1x256x256xf32>
-    %835 = stablehlo.multiply %834, %830 : tensor<1x256x256xf32>
-    %836 = stablehlo.add %835, %cst_28 : tensor<1x256x256xf32>
-    %837 = stablehlo.multiply %836, %830 : tensor<1x256x256xf32>
-    %838 = stablehlo.add %837, %cst_29 : tensor<1x256x256xf32>
-    %839 = stablehlo.multiply %838, %830 : tensor<1x256x256xf32>
-    %840 = stablehlo.add %839, %cst_30 : tensor<1x256x256xf32>
-    %841 = stablehlo.multiply %840, %830 : tensor<1x256x256xf32>
-    %842 = stablehlo.add %841, %cst_31 : tensor<1x256x256xf32>
-    %843 = stablehlo.multiply %cst_32, %830 : tensor<1x256x256xf32>
-    %844 = stablehlo.add %843, %cst_33 : tensor<1x256x256xf32>
-    %845 = stablehlo.multiply %844, %830 : tensor<1x256x256xf32>
-    %846 = stablehlo.add %845, %cst_34 : tensor<1x256x256xf32>
-    %847 = stablehlo.multiply %846, %830 : tensor<1x256x256xf32>
-    %848 = stablehlo.add %847, %cst_35 : tensor<1x256x256xf32>
-    %849 = stablehlo.multiply %848, %830 : tensor<1x256x256xf32>
-    %850 = stablehlo.add %849, %cst_36 : tensor<1x256x256xf32>
-    %851 = stablehlo.multiply %829, %842 : tensor<1x256x256xf32>
-    %852 = stablehlo.divide %851, %850 : tensor<1x256x256xf32>
-    %853 = stablehlo.clamp %cst_37, %852, %cst_38 : tensor<1x256x256xf32>
-    %854 = stablehlo.convert %853 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %855 = stablehlo.add %854, %cst_20 : tensor<1x256x256xbf16>
-    %856 = stablehlo.multiply %855, %826 : tensor<1x256x256xbf16>
-    %857 = stablehlo.reshape %856 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %858 = stablehlo.convert %857 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %859 = stablehlo.dot_general %858, %arg119, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %860 = stablehlo.broadcast_in_dim %859, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %861 = stablehlo.multiply %860, %9 : tensor<256x512xf32>
-    %862 = stablehlo.broadcast_in_dim %861, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %863 = stablehlo.broadcast_in_dim %arg120, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %864 = stablehlo.add %862, %863 : tensor<256x512xf32>
-    %865 = stablehlo.convert %864 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %866 = stablehlo.reshape %865 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %867 = stablehlo.add %866, %778 : tensor<1x256x512xbf16>
-    %868 = stablehlo.convert %867 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %869 = stablehlo.convert %868 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %870 = stablehlo.reduce(%869 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %871 = stablehlo.reshape %870 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %872 = stablehlo.broadcast_in_dim %871, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %873 = stablehlo.divide %872, %23 : tensor<1x256x1xf64>
-    %874 = stablehlo.broadcast_in_dim %869, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %875 = stablehlo.broadcast_in_dim %873, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %876 = stablehlo.subtract %874, %875 : tensor<1x256x512xf64>
-    %877 = stablehlo.multiply %876, %876 : tensor<1x256x512xf64>
-    %878 = stablehlo.reduce(%877 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %879 = stablehlo.reshape %878 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %880 = stablehlo.broadcast_in_dim %879, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %881 = stablehlo.divide %880, %23 : tensor<1x256x1xf64>
-    %882 = stablehlo.convert %881 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %883 = stablehlo.reduce(%868 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %884 = stablehlo.reshape %883 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %885 = stablehlo.broadcast_in_dim %884, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %886 = stablehlo.divide %885, %39 : tensor<1x256x1xf32>
-    %887 = stablehlo.broadcast_in_dim %882, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %888 = stablehlo.add %887, %44 : tensor<1x256x1xf32>
-    %889 = stablehlo.rsqrt %888 : tensor<1x256x1xf32>
-    %890 = stablehlo.broadcast_in_dim %868, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %891 = stablehlo.broadcast_in_dim %886, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %892 = stablehlo.subtract %890, %891 : tensor<1x256x512xf32>
-    %893 = stablehlo.broadcast_in_dim %892, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %894 = stablehlo.broadcast_in_dim %889, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %895 = stablehlo.multiply %893, %894 : tensor<1x256x512xf32>
-    %896 = stablehlo.convert %arg41 : (tensor<512xbf16>) -> tensor<512xf32>
-    %897 = stablehlo.broadcast_in_dim %895, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %898 = stablehlo.broadcast_in_dim %896, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %899 = stablehlo.multiply %897, %898 : tensor<1x256x512xf32>
-    %900 = stablehlo.convert %arg42 : (tensor<512xbf16>) -> tensor<512xf32>
-    %901 = stablehlo.broadcast_in_dim %899, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %902 = stablehlo.broadcast_in_dim %900, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %903 = stablehlo.add %901, %902 : tensor<1x256x512xf32>
-    %904 = stablehlo.convert %903 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %905 = stablehlo.convolution(%904, %arg43) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %906 = stablehlo.reshape %arg44 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %907 = stablehlo.broadcast_in_dim %905, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %908 = stablehlo.broadcast_in_dim %906, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %909 = stablehlo.add %907, %908 : tensor<1x1024x512xbf16>
-    %910 = stablehlo.multiply %909, %cst_3 : tensor<1x1024x512xbf16>
-    %911 = stablehlo.multiply %909, %68 : tensor<1x1024x512xbf16>
-    %912 = stablehlo.convert %911 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %913 = stablehlo.clamp %cst_4, %912, %cst_5 : tensor<1x1024x512xf32>
-    %914 = stablehlo.multiply %913, %913 : tensor<1x1024x512xf32>
-    %915 = stablehlo.multiply %cst_6, %914 : tensor<1x1024x512xf32>
-    %916 = stablehlo.add %915, %cst_7 : tensor<1x1024x512xf32>
-    %917 = stablehlo.multiply %916, %914 : tensor<1x1024x512xf32>
-    %918 = stablehlo.add %917, %cst_8 : tensor<1x1024x512xf32>
-    %919 = stablehlo.multiply %918, %914 : tensor<1x1024x512xf32>
-    %920 = stablehlo.add %919, %cst_9 : tensor<1x1024x512xf32>
-    %921 = stablehlo.multiply %920, %914 : tensor<1x1024x512xf32>
-    %922 = stablehlo.add %921, %cst_10 : tensor<1x1024x512xf32>
-    %923 = stablehlo.multiply %922, %914 : tensor<1x1024x512xf32>
-    %924 = stablehlo.add %923, %cst_11 : tensor<1x1024x512xf32>
-    %925 = stablehlo.multiply %924, %914 : tensor<1x1024x512xf32>
-    %926 = stablehlo.add %925, %cst_12 : tensor<1x1024x512xf32>
-    %927 = stablehlo.multiply %cst_13, %914 : tensor<1x1024x512xf32>
-    %928 = stablehlo.add %927, %cst_14 : tensor<1x1024x512xf32>
-    %929 = stablehlo.multiply %928, %914 : tensor<1x1024x512xf32>
-    %930 = stablehlo.add %929, %cst_15 : tensor<1x1024x512xf32>
-    %931 = stablehlo.multiply %930, %914 : tensor<1x1024x512xf32>
-    %932 = stablehlo.add %931, %cst_16 : tensor<1x1024x512xf32>
-    %933 = stablehlo.multiply %932, %914 : tensor<1x1024x512xf32>
-    %934 = stablehlo.add %933, %cst_17 : tensor<1x1024x512xf32>
-    %935 = stablehlo.multiply %913, %926 : tensor<1x1024x512xf32>
-    %936 = stablehlo.divide %935, %934 : tensor<1x1024x512xf32>
-    %937 = stablehlo.clamp %cst_18, %936, %cst_19 : tensor<1x1024x512xf32>
-    %938 = stablehlo.convert %937 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %939 = stablehlo.add %938, %cst_1 : tensor<1x1024x512xbf16>
-    %940 = stablehlo.multiply %939, %910 : tensor<1x1024x512xbf16>
-    %941 = stablehlo.convolution(%940, %arg45) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %942 = stablehlo.reshape %arg46 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %943 = stablehlo.broadcast_in_dim %941, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %944 = stablehlo.broadcast_in_dim %942, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %945 = stablehlo.add %943, %944 : tensor<1x256x512xbf16>
-    %946 = stablehlo.add %945, %867 : tensor<1x256x512xbf16>
-    %947 = stablehlo.convert %946 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %948 = stablehlo.convert %947 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %949 = stablehlo.reduce(%948 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %950 = stablehlo.reshape %949 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %951 = stablehlo.broadcast_in_dim %950, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %952 = stablehlo.divide %951, %23 : tensor<1x256x1xf64>
-    %953 = stablehlo.broadcast_in_dim %948, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %954 = stablehlo.broadcast_in_dim %952, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %955 = stablehlo.subtract %953, %954 : tensor<1x256x512xf64>
-    %956 = stablehlo.multiply %955, %955 : tensor<1x256x512xf64>
-    %957 = stablehlo.reduce(%956 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %958 = stablehlo.reshape %957 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %959 = stablehlo.broadcast_in_dim %958, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %960 = stablehlo.divide %959, %23 : tensor<1x256x1xf64>
-    %961 = stablehlo.convert %960 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %962 = stablehlo.reduce(%947 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %963 = stablehlo.reshape %962 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %964 = stablehlo.broadcast_in_dim %963, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %965 = stablehlo.divide %964, %39 : tensor<1x256x1xf32>
-    %966 = stablehlo.broadcast_in_dim %961, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %967 = stablehlo.add %966, %44 : tensor<1x256x1xf32>
-    %968 = stablehlo.rsqrt %967 : tensor<1x256x1xf32>
-    %969 = stablehlo.broadcast_in_dim %947, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %970 = stablehlo.broadcast_in_dim %965, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %971 = stablehlo.subtract %969, %970 : tensor<1x256x512xf32>
-    %972 = stablehlo.broadcast_in_dim %971, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %973 = stablehlo.broadcast_in_dim %968, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %974 = stablehlo.multiply %972, %973 : tensor<1x256x512xf32>
-    %975 = stablehlo.convert %arg47 : (tensor<512xbf16>) -> tensor<512xf32>
-    %976 = stablehlo.broadcast_in_dim %974, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %977 = stablehlo.broadcast_in_dim %975, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %978 = stablehlo.multiply %976, %977 : tensor<1x256x512xf32>
-    %979 = stablehlo.convert %arg48 : (tensor<512xbf16>) -> tensor<512xf32>
-    %980 = stablehlo.broadcast_in_dim %978, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %981 = stablehlo.broadcast_in_dim %979, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %982 = stablehlo.add %980, %981 : tensor<1x256x512xf32>
-    %983 = stablehlo.convert %982 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %984 = stablehlo.reshape %983 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %985 = stablehlo.convert %984 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %986 = stablehlo.dot_general %985, %arg121, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %987 = stablehlo.broadcast_in_dim %986, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %988 = stablehlo.multiply %987, %146 : tensor<256x256xf32>
-    %989 = stablehlo.broadcast_in_dim %988, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %990 = stablehlo.broadcast_in_dim %arg122, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %991 = stablehlo.add %989, %990 : tensor<256x256xf32>
-    %992 = stablehlo.convert %991 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %993 = stablehlo.reshape %992 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %994 = stablehlo.multiply %993, %cst_22 : tensor<1x256x256xbf16>
-    %995 = stablehlo.multiply %993, %154 : tensor<1x256x256xbf16>
-    %996 = stablehlo.convert %995 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %997 = stablehlo.clamp %cst_23, %996, %cst_24 : tensor<1x256x256xf32>
-    %998 = stablehlo.multiply %997, %997 : tensor<1x256x256xf32>
-    %999 = stablehlo.multiply %cst_25, %998 : tensor<1x256x256xf32>
-    %1000 = stablehlo.add %999, %cst_26 : tensor<1x256x256xf32>
-    %1001 = stablehlo.multiply %1000, %998 : tensor<1x256x256xf32>
-    %1002 = stablehlo.add %1001, %cst_27 : tensor<1x256x256xf32>
-    %1003 = stablehlo.multiply %1002, %998 : tensor<1x256x256xf32>
-    %1004 = stablehlo.add %1003, %cst_28 : tensor<1x256x256xf32>
-    %1005 = stablehlo.multiply %1004, %998 : tensor<1x256x256xf32>
-    %1006 = stablehlo.add %1005, %cst_29 : tensor<1x256x256xf32>
-    %1007 = stablehlo.multiply %1006, %998 : tensor<1x256x256xf32>
-    %1008 = stablehlo.add %1007, %cst_30 : tensor<1x256x256xf32>
-    %1009 = stablehlo.multiply %1008, %998 : tensor<1x256x256xf32>
-    %1010 = stablehlo.add %1009, %cst_31 : tensor<1x256x256xf32>
-    %1011 = stablehlo.multiply %cst_32, %998 : tensor<1x256x256xf32>
-    %1012 = stablehlo.add %1011, %cst_33 : tensor<1x256x256xf32>
-    %1013 = stablehlo.multiply %1012, %998 : tensor<1x256x256xf32>
-    %1014 = stablehlo.add %1013, %cst_34 : tensor<1x256x256xf32>
-    %1015 = stablehlo.multiply %1014, %998 : tensor<1x256x256xf32>
-    %1016 = stablehlo.add %1015, %cst_35 : tensor<1x256x256xf32>
-    %1017 = stablehlo.multiply %1016, %998 : tensor<1x256x256xf32>
-    %1018 = stablehlo.add %1017, %cst_36 : tensor<1x256x256xf32>
-    %1019 = stablehlo.multiply %997, %1010 : tensor<1x256x256xf32>
-    %1020 = stablehlo.divide %1019, %1018 : tensor<1x256x256xf32>
-    %1021 = stablehlo.clamp %cst_37, %1020, %cst_38 : tensor<1x256x256xf32>
-    %1022 = stablehlo.convert %1021 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1023 = stablehlo.add %1022, %cst_20 : tensor<1x256x256xbf16>
-    %1024 = stablehlo.multiply %1023, %994 : tensor<1x256x256xbf16>
-    %1025 = stablehlo.reshape %1024 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1026 = stablehlo.convert %1025 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1027 = stablehlo.dot_general %1026, %arg123, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1028 = stablehlo.broadcast_in_dim %1027, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1029 = stablehlo.multiply %1028, %9 : tensor<256x512xf32>
-    %1030 = stablehlo.broadcast_in_dim %1029, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1031 = stablehlo.broadcast_in_dim %arg124, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %1032 = stablehlo.add %1030, %1031 : tensor<256x512xf32>
-    %1033 = stablehlo.convert %1032 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %1034 = stablehlo.reshape %1033 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1035 = stablehlo.add %1034, %946 : tensor<1x256x512xbf16>
-    %1036 = stablehlo.convert %1035 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1037 = stablehlo.convert %1036 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1038 = stablehlo.reduce(%1037 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1039 = stablehlo.reshape %1038 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1040 = stablehlo.broadcast_in_dim %1039, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1041 = stablehlo.divide %1040, %23 : tensor<1x256x1xf64>
-    %1042 = stablehlo.broadcast_in_dim %1037, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1043 = stablehlo.broadcast_in_dim %1041, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1044 = stablehlo.subtract %1042, %1043 : tensor<1x256x512xf64>
-    %1045 = stablehlo.multiply %1044, %1044 : tensor<1x256x512xf64>
-    %1046 = stablehlo.reduce(%1045 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1047 = stablehlo.reshape %1046 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1048 = stablehlo.broadcast_in_dim %1047, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1049 = stablehlo.divide %1048, %23 : tensor<1x256x1xf64>
-    %1050 = stablehlo.convert %1049 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1051 = stablehlo.reduce(%1036 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1052 = stablehlo.reshape %1051 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1053 = stablehlo.broadcast_in_dim %1052, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1054 = stablehlo.divide %1053, %39 : tensor<1x256x1xf32>
-    %1055 = stablehlo.broadcast_in_dim %1050, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1056 = stablehlo.add %1055, %44 : tensor<1x256x1xf32>
-    %1057 = stablehlo.rsqrt %1056 : tensor<1x256x1xf32>
-    %1058 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1059 = stablehlo.broadcast_in_dim %1054, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1060 = stablehlo.subtract %1058, %1059 : tensor<1x256x512xf32>
-    %1061 = stablehlo.broadcast_in_dim %1060, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1062 = stablehlo.broadcast_in_dim %1057, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1063 = stablehlo.multiply %1061, %1062 : tensor<1x256x512xf32>
-    %1064 = stablehlo.convert %arg49 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1065 = stablehlo.broadcast_in_dim %1063, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1066 = stablehlo.broadcast_in_dim %1064, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1067 = stablehlo.multiply %1065, %1066 : tensor<1x256x512xf32>
-    %1068 = stablehlo.convert %arg50 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1069 = stablehlo.broadcast_in_dim %1067, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1070 = stablehlo.broadcast_in_dim %1068, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1071 = stablehlo.add %1069, %1070 : tensor<1x256x512xf32>
-    %1072 = stablehlo.convert %1071 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1073 = stablehlo.convolution(%1072, %arg51) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1074 = stablehlo.reshape %arg52 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %1075 = stablehlo.broadcast_in_dim %1073, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %1076 = stablehlo.broadcast_in_dim %1074, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1077 = stablehlo.add %1075, %1076 : tensor<1x1024x512xbf16>
-    %1078 = stablehlo.multiply %1077, %cst_3 : tensor<1x1024x512xbf16>
-    %1079 = stablehlo.multiply %1077, %68 : tensor<1x1024x512xbf16>
-    %1080 = stablehlo.convert %1079 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %1081 = stablehlo.clamp %cst_4, %1080, %cst_5 : tensor<1x1024x512xf32>
-    %1082 = stablehlo.multiply %1081, %1081 : tensor<1x1024x512xf32>
-    %1083 = stablehlo.multiply %cst_6, %1082 : tensor<1x1024x512xf32>
-    %1084 = stablehlo.add %1083, %cst_7 : tensor<1x1024x512xf32>
-    %1085 = stablehlo.multiply %1084, %1082 : tensor<1x1024x512xf32>
-    %1086 = stablehlo.add %1085, %cst_8 : tensor<1x1024x512xf32>
-    %1087 = stablehlo.multiply %1086, %1082 : tensor<1x1024x512xf32>
-    %1088 = stablehlo.add %1087, %cst_9 : tensor<1x1024x512xf32>
-    %1089 = stablehlo.multiply %1088, %1082 : tensor<1x1024x512xf32>
-    %1090 = stablehlo.add %1089, %cst_10 : tensor<1x1024x512xf32>
-    %1091 = stablehlo.multiply %1090, %1082 : tensor<1x1024x512xf32>
-    %1092 = stablehlo.add %1091, %cst_11 : tensor<1x1024x512xf32>
-    %1093 = stablehlo.multiply %1092, %1082 : tensor<1x1024x512xf32>
-    %1094 = stablehlo.add %1093, %cst_12 : tensor<1x1024x512xf32>
-    %1095 = stablehlo.multiply %cst_13, %1082 : tensor<1x1024x512xf32>
-    %1096 = stablehlo.add %1095, %cst_14 : tensor<1x1024x512xf32>
-    %1097 = stablehlo.multiply %1096, %1082 : tensor<1x1024x512xf32>
-    %1098 = stablehlo.add %1097, %cst_15 : tensor<1x1024x512xf32>
-    %1099 = stablehlo.multiply %1098, %1082 : tensor<1x1024x512xf32>
-    %1100 = stablehlo.add %1099, %cst_16 : tensor<1x1024x512xf32>
-    %1101 = stablehlo.multiply %1100, %1082 : tensor<1x1024x512xf32>
-    %1102 = stablehlo.add %1101, %cst_17 : tensor<1x1024x512xf32>
-    %1103 = stablehlo.multiply %1081, %1094 : tensor<1x1024x512xf32>
-    %1104 = stablehlo.divide %1103, %1102 : tensor<1x1024x512xf32>
-    %1105 = stablehlo.clamp %cst_18, %1104, %cst_19 : tensor<1x1024x512xf32>
-    %1106 = stablehlo.convert %1105 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %1107 = stablehlo.add %1106, %cst_1 : tensor<1x1024x512xbf16>
-    %1108 = stablehlo.multiply %1107, %1078 : tensor<1x1024x512xbf16>
-    %1109 = stablehlo.convolution(%1108, %arg53) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %1110 = stablehlo.reshape %arg54 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %1111 = stablehlo.broadcast_in_dim %1109, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1112 = stablehlo.broadcast_in_dim %1110, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %1113 = stablehlo.add %1111, %1112 : tensor<1x256x512xbf16>
-    %1114 = stablehlo.add %1113, %1035 : tensor<1x256x512xbf16>
-    %1115 = stablehlo.convert %1114 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1116 = stablehlo.convert %1115 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1117 = stablehlo.reduce(%1116 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1118 = stablehlo.reshape %1117 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1119 = stablehlo.broadcast_in_dim %1118, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1120 = stablehlo.divide %1119, %23 : tensor<1x256x1xf64>
-    %1121 = stablehlo.broadcast_in_dim %1116, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1122 = stablehlo.broadcast_in_dim %1120, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1123 = stablehlo.subtract %1121, %1122 : tensor<1x256x512xf64>
-    %1124 = stablehlo.multiply %1123, %1123 : tensor<1x256x512xf64>
-    %1125 = stablehlo.reduce(%1124 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1126 = stablehlo.reshape %1125 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1127 = stablehlo.broadcast_in_dim %1126, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1128 = stablehlo.divide %1127, %23 : tensor<1x256x1xf64>
-    %1129 = stablehlo.convert %1128 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1130 = stablehlo.reduce(%1115 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1131 = stablehlo.reshape %1130 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1132 = stablehlo.broadcast_in_dim %1131, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1133 = stablehlo.divide %1132, %39 : tensor<1x256x1xf32>
-    %1134 = stablehlo.broadcast_in_dim %1129, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1135 = stablehlo.add %1134, %44 : tensor<1x256x1xf32>
-    %1136 = stablehlo.rsqrt %1135 : tensor<1x256x1xf32>
-    %1137 = stablehlo.broadcast_in_dim %1115, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1138 = stablehlo.broadcast_in_dim %1133, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1139 = stablehlo.subtract %1137, %1138 : tensor<1x256x512xf32>
-    %1140 = stablehlo.broadcast_in_dim %1139, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1141 = stablehlo.broadcast_in_dim %1136, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1142 = stablehlo.multiply %1140, %1141 : tensor<1x256x512xf32>
-    %1143 = stablehlo.convert %arg55 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1144 = stablehlo.broadcast_in_dim %1142, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1145 = stablehlo.broadcast_in_dim %1143, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1146 = stablehlo.multiply %1144, %1145 : tensor<1x256x512xf32>
-    %1147 = stablehlo.convert %arg56 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1148 = stablehlo.broadcast_in_dim %1146, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1149 = stablehlo.broadcast_in_dim %1147, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1150 = stablehlo.add %1148, %1149 : tensor<1x256x512xf32>
-    %1151 = stablehlo.convert %1150 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1152 = stablehlo.reshape %1151 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %1153 = stablehlo.convert %1152 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %1154 = stablehlo.dot_general %1153, %arg125, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %1155 = stablehlo.broadcast_in_dim %1154, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1156 = stablehlo.multiply %1155, %146 : tensor<256x256xf32>
-    %1157 = stablehlo.broadcast_in_dim %1156, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1158 = stablehlo.broadcast_in_dim %arg126, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1159 = stablehlo.add %1157, %1158 : tensor<256x256xf32>
-    %1160 = stablehlo.convert %1159 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1161 = stablehlo.reshape %1160 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1162 = stablehlo.multiply %1161, %cst_22 : tensor<1x256x256xbf16>
-    %1163 = stablehlo.multiply %1161, %154 : tensor<1x256x256xbf16>
-    %1164 = stablehlo.convert %1163 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1165 = stablehlo.clamp %cst_23, %1164, %cst_24 : tensor<1x256x256xf32>
-    %1166 = stablehlo.multiply %1165, %1165 : tensor<1x256x256xf32>
-    %1167 = stablehlo.multiply %cst_25, %1166 : tensor<1x256x256xf32>
-    %1168 = stablehlo.add %1167, %cst_26 : tensor<1x256x256xf32>
-    %1169 = stablehlo.multiply %1168, %1166 : tensor<1x256x256xf32>
-    %1170 = stablehlo.add %1169, %cst_27 : tensor<1x256x256xf32>
-    %1171 = stablehlo.multiply %1170, %1166 : tensor<1x256x256xf32>
-    %1172 = stablehlo.add %1171, %cst_28 : tensor<1x256x256xf32>
-    %1173 = stablehlo.multiply %1172, %1166 : tensor<1x256x256xf32>
-    %1174 = stablehlo.add %1173, %cst_29 : tensor<1x256x256xf32>
-    %1175 = stablehlo.multiply %1174, %1166 : tensor<1x256x256xf32>
-    %1176 = stablehlo.add %1175, %cst_30 : tensor<1x256x256xf32>
-    %1177 = stablehlo.multiply %1176, %1166 : tensor<1x256x256xf32>
-    %1178 = stablehlo.add %1177, %cst_31 : tensor<1x256x256xf32>
-    %1179 = stablehlo.multiply %cst_32, %1166 : tensor<1x256x256xf32>
-    %1180 = stablehlo.add %1179, %cst_33 : tensor<1x256x256xf32>
-    %1181 = stablehlo.multiply %1180, %1166 : tensor<1x256x256xf32>
-    %1182 = stablehlo.add %1181, %cst_34 : tensor<1x256x256xf32>
-    %1183 = stablehlo.multiply %1182, %1166 : tensor<1x256x256xf32>
-    %1184 = stablehlo.add %1183, %cst_35 : tensor<1x256x256xf32>
-    %1185 = stablehlo.multiply %1184, %1166 : tensor<1x256x256xf32>
-    %1186 = stablehlo.add %1185, %cst_36 : tensor<1x256x256xf32>
-    %1187 = stablehlo.multiply %1165, %1178 : tensor<1x256x256xf32>
-    %1188 = stablehlo.divide %1187, %1186 : tensor<1x256x256xf32>
-    %1189 = stablehlo.clamp %cst_37, %1188, %cst_38 : tensor<1x256x256xf32>
-    %1190 = stablehlo.convert %1189 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1191 = stablehlo.add %1190, %cst_20 : tensor<1x256x256xbf16>
-    %1192 = stablehlo.multiply %1191, %1162 : tensor<1x256x256xbf16>
-    %1193 = stablehlo.reshape %1192 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1194 = stablehlo.convert %1193 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1195 = stablehlo.dot_general %1194, %arg127, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1196 = stablehlo.broadcast_in_dim %1195, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1197 = stablehlo.multiply %1196, %9 : tensor<256x512xf32>
-    %1198 = stablehlo.broadcast_in_dim %1197, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1199 = stablehlo.broadcast_in_dim %arg128, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %1200 = stablehlo.add %1198, %1199 : tensor<256x512xf32>
-    %1201 = stablehlo.convert %1200 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %1202 = stablehlo.reshape %1201 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1203 = stablehlo.add %1202, %1114 : tensor<1x256x512xbf16>
-    %1204 = stablehlo.convert %1203 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1205 = stablehlo.convert %1204 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1206 = stablehlo.reduce(%1205 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1207 = stablehlo.reshape %1206 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1208 = stablehlo.broadcast_in_dim %1207, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1209 = stablehlo.divide %1208, %23 : tensor<1x256x1xf64>
-    %1210 = stablehlo.broadcast_in_dim %1205, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1211 = stablehlo.broadcast_in_dim %1209, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1212 = stablehlo.subtract %1210, %1211 : tensor<1x256x512xf64>
-    %1213 = stablehlo.multiply %1212, %1212 : tensor<1x256x512xf64>
-    %1214 = stablehlo.reduce(%1213 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1215 = stablehlo.reshape %1214 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1216 = stablehlo.broadcast_in_dim %1215, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1217 = stablehlo.divide %1216, %23 : tensor<1x256x1xf64>
-    %1218 = stablehlo.convert %1217 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1219 = stablehlo.reduce(%1204 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1220 = stablehlo.reshape %1219 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1221 = stablehlo.broadcast_in_dim %1220, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1222 = stablehlo.divide %1221, %39 : tensor<1x256x1xf32>
-    %1223 = stablehlo.broadcast_in_dim %1218, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1224 = stablehlo.add %1223, %44 : tensor<1x256x1xf32>
-    %1225 = stablehlo.rsqrt %1224 : tensor<1x256x1xf32>
-    %1226 = stablehlo.broadcast_in_dim %1204, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1227 = stablehlo.broadcast_in_dim %1222, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1228 = stablehlo.subtract %1226, %1227 : tensor<1x256x512xf32>
-    %1229 = stablehlo.broadcast_in_dim %1228, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1230 = stablehlo.broadcast_in_dim %1225, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1231 = stablehlo.multiply %1229, %1230 : tensor<1x256x512xf32>
-    %1232 = stablehlo.convert %arg57 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1233 = stablehlo.broadcast_in_dim %1231, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1234 = stablehlo.broadcast_in_dim %1232, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1235 = stablehlo.multiply %1233, %1234 : tensor<1x256x512xf32>
-    %1236 = stablehlo.convert %arg58 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1237 = stablehlo.broadcast_in_dim %1235, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1238 = stablehlo.broadcast_in_dim %1236, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1239 = stablehlo.add %1237, %1238 : tensor<1x256x512xf32>
-    %1240 = stablehlo.convert %1239 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1241 = stablehlo.convolution(%1240, %arg59) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1242 = stablehlo.reshape %arg60 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %1243 = stablehlo.broadcast_in_dim %1241, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %1244 = stablehlo.broadcast_in_dim %1242, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1245 = stablehlo.add %1243, %1244 : tensor<1x1024x512xbf16>
-    %1246 = stablehlo.multiply %1245, %cst_3 : tensor<1x1024x512xbf16>
-    %1247 = stablehlo.multiply %1245, %68 : tensor<1x1024x512xbf16>
-    %1248 = stablehlo.convert %1247 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %1249 = stablehlo.clamp %cst_4, %1248, %cst_5 : tensor<1x1024x512xf32>
-    %1250 = stablehlo.multiply %1249, %1249 : tensor<1x1024x512xf32>
-    %1251 = stablehlo.multiply %cst_6, %1250 : tensor<1x1024x512xf32>
-    %1252 = stablehlo.add %1251, %cst_7 : tensor<1x1024x512xf32>
-    %1253 = stablehlo.multiply %1252, %1250 : tensor<1x1024x512xf32>
-    %1254 = stablehlo.add %1253, %cst_8 : tensor<1x1024x512xf32>
-    %1255 = stablehlo.multiply %1254, %1250 : tensor<1x1024x512xf32>
-    %1256 = stablehlo.add %1255, %cst_9 : tensor<1x1024x512xf32>
-    %1257 = stablehlo.multiply %1256, %1250 : tensor<1x1024x512xf32>
-    %1258 = stablehlo.add %1257, %cst_10 : tensor<1x1024x512xf32>
-    %1259 = stablehlo.multiply %1258, %1250 : tensor<1x1024x512xf32>
-    %1260 = stablehlo.add %1259, %cst_11 : tensor<1x1024x512xf32>
-    %1261 = stablehlo.multiply %1260, %1250 : tensor<1x1024x512xf32>
-    %1262 = stablehlo.add %1261, %cst_12 : tensor<1x1024x512xf32>
-    %1263 = stablehlo.multiply %cst_13, %1250 : tensor<1x1024x512xf32>
-    %1264 = stablehlo.add %1263, %cst_14 : tensor<1x1024x512xf32>
-    %1265 = stablehlo.multiply %1264, %1250 : tensor<1x1024x512xf32>
-    %1266 = stablehlo.add %1265, %cst_15 : tensor<1x1024x512xf32>
-    %1267 = stablehlo.multiply %1266, %1250 : tensor<1x1024x512xf32>
-    %1268 = stablehlo.add %1267, %cst_16 : tensor<1x1024x512xf32>
-    %1269 = stablehlo.multiply %1268, %1250 : tensor<1x1024x512xf32>
-    %1270 = stablehlo.add %1269, %cst_17 : tensor<1x1024x512xf32>
-    %1271 = stablehlo.multiply %1249, %1262 : tensor<1x1024x512xf32>
-    %1272 = stablehlo.divide %1271, %1270 : tensor<1x1024x512xf32>
-    %1273 = stablehlo.clamp %cst_18, %1272, %cst_19 : tensor<1x1024x512xf32>
-    %1274 = stablehlo.convert %1273 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %1275 = stablehlo.add %1274, %cst_1 : tensor<1x1024x512xbf16>
-    %1276 = stablehlo.multiply %1275, %1246 : tensor<1x1024x512xbf16>
-    %1277 = stablehlo.convolution(%1276, %arg61) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %1278 = stablehlo.reshape %arg62 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %1279 = stablehlo.broadcast_in_dim %1277, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1280 = stablehlo.broadcast_in_dim %1278, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %1281 = stablehlo.add %1279, %1280 : tensor<1x256x512xbf16>
-    %1282 = stablehlo.add %1281, %1203 : tensor<1x256x512xbf16>
-    %1283 = stablehlo.convert %1282 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1284 = stablehlo.convert %1283 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1285 = stablehlo.reduce(%1284 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1286 = stablehlo.reshape %1285 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1287 = stablehlo.broadcast_in_dim %1286, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1288 = stablehlo.divide %1287, %23 : tensor<1x256x1xf64>
-    %1289 = stablehlo.broadcast_in_dim %1284, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1290 = stablehlo.broadcast_in_dim %1288, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1291 = stablehlo.subtract %1289, %1290 : tensor<1x256x512xf64>
-    %1292 = stablehlo.multiply %1291, %1291 : tensor<1x256x512xf64>
-    %1293 = stablehlo.reduce(%1292 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1294 = stablehlo.reshape %1293 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1295 = stablehlo.broadcast_in_dim %1294, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1296 = stablehlo.divide %1295, %23 : tensor<1x256x1xf64>
-    %1297 = stablehlo.convert %1296 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1298 = stablehlo.reduce(%1283 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1299 = stablehlo.reshape %1298 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1300 = stablehlo.broadcast_in_dim %1299, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1301 = stablehlo.divide %1300, %39 : tensor<1x256x1xf32>
-    %1302 = stablehlo.broadcast_in_dim %1297, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1303 = stablehlo.add %1302, %44 : tensor<1x256x1xf32>
-    %1304 = stablehlo.rsqrt %1303 : tensor<1x256x1xf32>
-    %1305 = stablehlo.broadcast_in_dim %1283, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1306 = stablehlo.broadcast_in_dim %1301, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1307 = stablehlo.subtract %1305, %1306 : tensor<1x256x512xf32>
-    %1308 = stablehlo.broadcast_in_dim %1307, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1309 = stablehlo.broadcast_in_dim %1304, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1310 = stablehlo.multiply %1308, %1309 : tensor<1x256x512xf32>
-    %1311 = stablehlo.convert %arg63 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1312 = stablehlo.broadcast_in_dim %1310, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1313 = stablehlo.broadcast_in_dim %1311, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1314 = stablehlo.multiply %1312, %1313 : tensor<1x256x512xf32>
-    %1315 = stablehlo.convert %arg64 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1316 = stablehlo.broadcast_in_dim %1314, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1317 = stablehlo.broadcast_in_dim %1315, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1318 = stablehlo.add %1316, %1317 : tensor<1x256x512xf32>
-    %1319 = stablehlo.convert %1318 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1320 = stablehlo.reshape %1319 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %1321 = stablehlo.convert %1320 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %1322 = stablehlo.dot_general %1321, %arg129, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %1323 = stablehlo.broadcast_in_dim %1322, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1324 = stablehlo.multiply %1323, %146 : tensor<256x256xf32>
-    %1325 = stablehlo.broadcast_in_dim %1324, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1326 = stablehlo.broadcast_in_dim %arg130, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1327 = stablehlo.add %1325, %1326 : tensor<256x256xf32>
-    %1328 = stablehlo.convert %1327 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1329 = stablehlo.reshape %1328 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1330 = stablehlo.multiply %1329, %cst_22 : tensor<1x256x256xbf16>
-    %1331 = stablehlo.multiply %1329, %154 : tensor<1x256x256xbf16>
-    %1332 = stablehlo.convert %1331 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1333 = stablehlo.clamp %cst_23, %1332, %cst_24 : tensor<1x256x256xf32>
-    %1334 = stablehlo.multiply %1333, %1333 : tensor<1x256x256xf32>
-    %1335 = stablehlo.multiply %cst_25, %1334 : tensor<1x256x256xf32>
-    %1336 = stablehlo.add %1335, %cst_26 : tensor<1x256x256xf32>
-    %1337 = stablehlo.multiply %1336, %1334 : tensor<1x256x256xf32>
-    %1338 = stablehlo.add %1337, %cst_27 : tensor<1x256x256xf32>
-    %1339 = stablehlo.multiply %1338, %1334 : tensor<1x256x256xf32>
-    %1340 = stablehlo.add %1339, %cst_28 : tensor<1x256x256xf32>
-    %1341 = stablehlo.multiply %1340, %1334 : tensor<1x256x256xf32>
-    %1342 = stablehlo.add %1341, %cst_29 : tensor<1x256x256xf32>
-    %1343 = stablehlo.multiply %1342, %1334 : tensor<1x256x256xf32>
-    %1344 = stablehlo.add %1343, %cst_30 : tensor<1x256x256xf32>
-    %1345 = stablehlo.multiply %1344, %1334 : tensor<1x256x256xf32>
-    %1346 = stablehlo.add %1345, %cst_31 : tensor<1x256x256xf32>
-    %1347 = stablehlo.multiply %cst_32, %1334 : tensor<1x256x256xf32>
-    %1348 = stablehlo.add %1347, %cst_33 : tensor<1x256x256xf32>
-    %1349 = stablehlo.multiply %1348, %1334 : tensor<1x256x256xf32>
-    %1350 = stablehlo.add %1349, %cst_34 : tensor<1x256x256xf32>
-    %1351 = stablehlo.multiply %1350, %1334 : tensor<1x256x256xf32>
-    %1352 = stablehlo.add %1351, %cst_35 : tensor<1x256x256xf32>
-    %1353 = stablehlo.multiply %1352, %1334 : tensor<1x256x256xf32>
-    %1354 = stablehlo.add %1353, %cst_36 : tensor<1x256x256xf32>
-    %1355 = stablehlo.multiply %1333, %1346 : tensor<1x256x256xf32>
-    %1356 = stablehlo.divide %1355, %1354 : tensor<1x256x256xf32>
-    %1357 = stablehlo.clamp %cst_37, %1356, %cst_38 : tensor<1x256x256xf32>
-    %1358 = stablehlo.convert %1357 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1359 = stablehlo.add %1358, %cst_20 : tensor<1x256x256xbf16>
-    %1360 = stablehlo.multiply %1359, %1330 : tensor<1x256x256xbf16>
-    %1361 = stablehlo.reshape %1360 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1362 = stablehlo.convert %1361 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1363 = stablehlo.dot_general %1362, %arg131, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1364 = stablehlo.broadcast_in_dim %1363, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1365 = stablehlo.multiply %1364, %9 : tensor<256x512xf32>
-    %1366 = stablehlo.broadcast_in_dim %1365, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1367 = stablehlo.broadcast_in_dim %arg132, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %1368 = stablehlo.add %1366, %1367 : tensor<256x512xf32>
-    %1369 = stablehlo.convert %1368 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %1370 = stablehlo.reshape %1369 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1371 = stablehlo.add %1370, %1282 : tensor<1x256x512xbf16>
-    %1372 = stablehlo.convert %1371 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1373 = stablehlo.convert %1372 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1374 = stablehlo.reduce(%1373 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1375 = stablehlo.reshape %1374 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1376 = stablehlo.broadcast_in_dim %1375, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1377 = stablehlo.divide %1376, %23 : tensor<1x256x1xf64>
-    %1378 = stablehlo.broadcast_in_dim %1373, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1379 = stablehlo.broadcast_in_dim %1377, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1380 = stablehlo.subtract %1378, %1379 : tensor<1x256x512xf64>
-    %1381 = stablehlo.multiply %1380, %1380 : tensor<1x256x512xf64>
-    %1382 = stablehlo.reduce(%1381 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1383 = stablehlo.reshape %1382 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1384 = stablehlo.broadcast_in_dim %1383, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1385 = stablehlo.divide %1384, %23 : tensor<1x256x1xf64>
-    %1386 = stablehlo.convert %1385 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1387 = stablehlo.reduce(%1372 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1388 = stablehlo.reshape %1387 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1389 = stablehlo.broadcast_in_dim %1388, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1390 = stablehlo.divide %1389, %39 : tensor<1x256x1xf32>
-    %1391 = stablehlo.broadcast_in_dim %1386, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1392 = stablehlo.add %1391, %44 : tensor<1x256x1xf32>
-    %1393 = stablehlo.rsqrt %1392 : tensor<1x256x1xf32>
-    %1394 = stablehlo.broadcast_in_dim %1372, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1395 = stablehlo.broadcast_in_dim %1390, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1396 = stablehlo.subtract %1394, %1395 : tensor<1x256x512xf32>
-    %1397 = stablehlo.broadcast_in_dim %1396, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1398 = stablehlo.broadcast_in_dim %1393, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1399 = stablehlo.multiply %1397, %1398 : tensor<1x256x512xf32>
-    %1400 = stablehlo.convert %arg65 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1401 = stablehlo.broadcast_in_dim %1399, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1402 = stablehlo.broadcast_in_dim %1400, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1403 = stablehlo.multiply %1401, %1402 : tensor<1x256x512xf32>
-    %1404 = stablehlo.convert %arg66 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1405 = stablehlo.broadcast_in_dim %1403, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1406 = stablehlo.broadcast_in_dim %1404, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1407 = stablehlo.add %1405, %1406 : tensor<1x256x512xf32>
-    %1408 = stablehlo.convert %1407 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1409 = stablehlo.convolution(%1408, %arg67) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1410 = stablehlo.reshape %arg68 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %1411 = stablehlo.broadcast_in_dim %1409, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %1412 = stablehlo.broadcast_in_dim %1410, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1413 = stablehlo.add %1411, %1412 : tensor<1x1024x512xbf16>
-    %1414 = stablehlo.multiply %1413, %cst_3 : tensor<1x1024x512xbf16>
-    %1415 = stablehlo.multiply %1413, %68 : tensor<1x1024x512xbf16>
-    %1416 = stablehlo.convert %1415 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %1417 = stablehlo.clamp %cst_4, %1416, %cst_5 : tensor<1x1024x512xf32>
-    %1418 = stablehlo.multiply %1417, %1417 : tensor<1x1024x512xf32>
-    %1419 = stablehlo.multiply %cst_6, %1418 : tensor<1x1024x512xf32>
-    %1420 = stablehlo.add %1419, %cst_7 : tensor<1x1024x512xf32>
-    %1421 = stablehlo.multiply %1420, %1418 : tensor<1x1024x512xf32>
-    %1422 = stablehlo.add %1421, %cst_8 : tensor<1x1024x512xf32>
-    %1423 = stablehlo.multiply %1422, %1418 : tensor<1x1024x512xf32>
-    %1424 = stablehlo.add %1423, %cst_9 : tensor<1x1024x512xf32>
-    %1425 = stablehlo.multiply %1424, %1418 : tensor<1x1024x512xf32>
-    %1426 = stablehlo.add %1425, %cst_10 : tensor<1x1024x512xf32>
-    %1427 = stablehlo.multiply %1426, %1418 : tensor<1x1024x512xf32>
-    %1428 = stablehlo.add %1427, %cst_11 : tensor<1x1024x512xf32>
-    %1429 = stablehlo.multiply %1428, %1418 : tensor<1x1024x512xf32>
-    %1430 = stablehlo.add %1429, %cst_12 : tensor<1x1024x512xf32>
-    %1431 = stablehlo.multiply %cst_13, %1418 : tensor<1x1024x512xf32>
-    %1432 = stablehlo.add %1431, %cst_14 : tensor<1x1024x512xf32>
-    %1433 = stablehlo.multiply %1432, %1418 : tensor<1x1024x512xf32>
-    %1434 = stablehlo.add %1433, %cst_15 : tensor<1x1024x512xf32>
-    %1435 = stablehlo.multiply %1434, %1418 : tensor<1x1024x512xf32>
-    %1436 = stablehlo.add %1435, %cst_16 : tensor<1x1024x512xf32>
-    %1437 = stablehlo.multiply %1436, %1418 : tensor<1x1024x512xf32>
-    %1438 = stablehlo.add %1437, %cst_17 : tensor<1x1024x512xf32>
-    %1439 = stablehlo.multiply %1417, %1430 : tensor<1x1024x512xf32>
-    %1440 = stablehlo.divide %1439, %1438 : tensor<1x1024x512xf32>
-    %1441 = stablehlo.clamp %cst_18, %1440, %cst_19 : tensor<1x1024x512xf32>
-    %1442 = stablehlo.convert %1441 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %1443 = stablehlo.add %1442, %cst_1 : tensor<1x1024x512xbf16>
-    %1444 = stablehlo.multiply %1443, %1414 : tensor<1x1024x512xbf16>
-    %1445 = stablehlo.convolution(%1444, %arg69) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %1446 = stablehlo.reshape %arg70 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %1447 = stablehlo.broadcast_in_dim %1445, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1448 = stablehlo.broadcast_in_dim %1446, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %1449 = stablehlo.add %1447, %1448 : tensor<1x256x512xbf16>
-    %1450 = stablehlo.add %1449, %1371 : tensor<1x256x512xbf16>
-    %1451 = stablehlo.convert %1450 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1452 = stablehlo.convert %1451 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1453 = stablehlo.reduce(%1452 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1454 = stablehlo.reshape %1453 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1455 = stablehlo.broadcast_in_dim %1454, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1456 = stablehlo.divide %1455, %23 : tensor<1x256x1xf64>
-    %1457 = stablehlo.broadcast_in_dim %1452, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1458 = stablehlo.broadcast_in_dim %1456, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1459 = stablehlo.subtract %1457, %1458 : tensor<1x256x512xf64>
-    %1460 = stablehlo.multiply %1459, %1459 : tensor<1x256x512xf64>
-    %1461 = stablehlo.reduce(%1460 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1462 = stablehlo.reshape %1461 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1463 = stablehlo.broadcast_in_dim %1462, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1464 = stablehlo.divide %1463, %23 : tensor<1x256x1xf64>
-    %1465 = stablehlo.convert %1464 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1466 = stablehlo.reduce(%1451 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1467 = stablehlo.reshape %1466 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1468 = stablehlo.broadcast_in_dim %1467, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1469 = stablehlo.divide %1468, %39 : tensor<1x256x1xf32>
-    %1470 = stablehlo.broadcast_in_dim %1465, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1471 = stablehlo.add %1470, %44 : tensor<1x256x1xf32>
-    %1472 = stablehlo.rsqrt %1471 : tensor<1x256x1xf32>
-    %1473 = stablehlo.broadcast_in_dim %1451, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1474 = stablehlo.broadcast_in_dim %1469, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1475 = stablehlo.subtract %1473, %1474 : tensor<1x256x512xf32>
-    %1476 = stablehlo.broadcast_in_dim %1475, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1477 = stablehlo.broadcast_in_dim %1472, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1478 = stablehlo.multiply %1476, %1477 : tensor<1x256x512xf32>
-    %1479 = stablehlo.convert %arg71 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1480 = stablehlo.broadcast_in_dim %1478, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1481 = stablehlo.broadcast_in_dim %1479, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1482 = stablehlo.multiply %1480, %1481 : tensor<1x256x512xf32>
-    %1483 = stablehlo.convert %arg72 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1484 = stablehlo.broadcast_in_dim %1482, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1485 = stablehlo.broadcast_in_dim %1483, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1486 = stablehlo.add %1484, %1485 : tensor<1x256x512xf32>
-    %1487 = stablehlo.convert %1486 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1488 = stablehlo.reshape %1487 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %1489 = stablehlo.convert %1488 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %1490 = stablehlo.dot_general %1489, %arg133, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %1491 = stablehlo.broadcast_in_dim %1490, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1492 = stablehlo.multiply %1491, %146 : tensor<256x256xf32>
-    %1493 = stablehlo.broadcast_in_dim %1492, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1494 = stablehlo.broadcast_in_dim %arg134, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1495 = stablehlo.add %1493, %1494 : tensor<256x256xf32>
-    %1496 = stablehlo.convert %1495 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1497 = stablehlo.reshape %1496 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1498 = stablehlo.multiply %1497, %cst_22 : tensor<1x256x256xbf16>
-    %1499 = stablehlo.multiply %1497, %154 : tensor<1x256x256xbf16>
-    %1500 = stablehlo.convert %1499 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1501 = stablehlo.clamp %cst_23, %1500, %cst_24 : tensor<1x256x256xf32>
-    %1502 = stablehlo.multiply %1501, %1501 : tensor<1x256x256xf32>
-    %1503 = stablehlo.multiply %cst_25, %1502 : tensor<1x256x256xf32>
-    %1504 = stablehlo.add %1503, %cst_26 : tensor<1x256x256xf32>
-    %1505 = stablehlo.multiply %1504, %1502 : tensor<1x256x256xf32>
-    %1506 = stablehlo.add %1505, %cst_27 : tensor<1x256x256xf32>
-    %1507 = stablehlo.multiply %1506, %1502 : tensor<1x256x256xf32>
-    %1508 = stablehlo.add %1507, %cst_28 : tensor<1x256x256xf32>
-    %1509 = stablehlo.multiply %1508, %1502 : tensor<1x256x256xf32>
-    %1510 = stablehlo.add %1509, %cst_29 : tensor<1x256x256xf32>
-    %1511 = stablehlo.multiply %1510, %1502 : tensor<1x256x256xf32>
-    %1512 = stablehlo.add %1511, %cst_30 : tensor<1x256x256xf32>
-    %1513 = stablehlo.multiply %1512, %1502 : tensor<1x256x256xf32>
-    %1514 = stablehlo.add %1513, %cst_31 : tensor<1x256x256xf32>
-    %1515 = stablehlo.multiply %cst_32, %1502 : tensor<1x256x256xf32>
-    %1516 = stablehlo.add %1515, %cst_33 : tensor<1x256x256xf32>
-    %1517 = stablehlo.multiply %1516, %1502 : tensor<1x256x256xf32>
-    %1518 = stablehlo.add %1517, %cst_34 : tensor<1x256x256xf32>
-    %1519 = stablehlo.multiply %1518, %1502 : tensor<1x256x256xf32>
-    %1520 = stablehlo.add %1519, %cst_35 : tensor<1x256x256xf32>
-    %1521 = stablehlo.multiply %1520, %1502 : tensor<1x256x256xf32>
-    %1522 = stablehlo.add %1521, %cst_36 : tensor<1x256x256xf32>
-    %1523 = stablehlo.multiply %1501, %1514 : tensor<1x256x256xf32>
-    %1524 = stablehlo.divide %1523, %1522 : tensor<1x256x256xf32>
-    %1525 = stablehlo.clamp %cst_37, %1524, %cst_38 : tensor<1x256x256xf32>
-    %1526 = stablehlo.convert %1525 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1527 = stablehlo.add %1526, %cst_20 : tensor<1x256x256xbf16>
-    %1528 = stablehlo.multiply %1527, %1498 : tensor<1x256x256xbf16>
-    %1529 = stablehlo.reshape %1528 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1530 = stablehlo.convert %1529 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1531 = stablehlo.dot_general %1530, %arg135, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1532 = stablehlo.broadcast_in_dim %1531, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1533 = stablehlo.multiply %1532, %9 : tensor<256x512xf32>
-    %1534 = stablehlo.broadcast_in_dim %1533, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1535 = stablehlo.broadcast_in_dim %arg136, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %1536 = stablehlo.add %1534, %1535 : tensor<256x512xf32>
-    %1537 = stablehlo.convert %1536 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %1538 = stablehlo.reshape %1537 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1539 = stablehlo.add %1538, %1450 : tensor<1x256x512xbf16>
-    %1540 = stablehlo.convert %1539 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1541 = stablehlo.convert %1540 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1542 = stablehlo.reduce(%1541 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1543 = stablehlo.reshape %1542 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1544 = stablehlo.broadcast_in_dim %1543, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1545 = stablehlo.divide %1544, %23 : tensor<1x256x1xf64>
-    %1546 = stablehlo.broadcast_in_dim %1541, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1547 = stablehlo.broadcast_in_dim %1545, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1548 = stablehlo.subtract %1546, %1547 : tensor<1x256x512xf64>
-    %1549 = stablehlo.multiply %1548, %1548 : tensor<1x256x512xf64>
-    %1550 = stablehlo.reduce(%1549 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1551 = stablehlo.reshape %1550 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1552 = stablehlo.broadcast_in_dim %1551, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1553 = stablehlo.divide %1552, %23 : tensor<1x256x1xf64>
-    %1554 = stablehlo.convert %1553 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1555 = stablehlo.reduce(%1540 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1556 = stablehlo.reshape %1555 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1557 = stablehlo.broadcast_in_dim %1556, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1558 = stablehlo.divide %1557, %39 : tensor<1x256x1xf32>
-    %1559 = stablehlo.broadcast_in_dim %1554, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1560 = stablehlo.add %1559, %44 : tensor<1x256x1xf32>
-    %1561 = stablehlo.rsqrt %1560 : tensor<1x256x1xf32>
-    %1562 = stablehlo.broadcast_in_dim %1540, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1563 = stablehlo.broadcast_in_dim %1558, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1564 = stablehlo.subtract %1562, %1563 : tensor<1x256x512xf32>
-    %1565 = stablehlo.broadcast_in_dim %1564, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1566 = stablehlo.broadcast_in_dim %1561, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1567 = stablehlo.multiply %1565, %1566 : tensor<1x256x512xf32>
-    %1568 = stablehlo.convert %arg73 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1569 = stablehlo.broadcast_in_dim %1567, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1570 = stablehlo.broadcast_in_dim %1568, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1571 = stablehlo.multiply %1569, %1570 : tensor<1x256x512xf32>
-    %1572 = stablehlo.convert %arg74 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1573 = stablehlo.broadcast_in_dim %1571, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1574 = stablehlo.broadcast_in_dim %1572, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1575 = stablehlo.add %1573, %1574 : tensor<1x256x512xf32>
-    %1576 = stablehlo.convert %1575 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1577 = stablehlo.convolution(%1576, %arg75) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1578 = stablehlo.reshape %arg76 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %1579 = stablehlo.broadcast_in_dim %1577, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %1580 = stablehlo.broadcast_in_dim %1578, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1581 = stablehlo.add %1579, %1580 : tensor<1x1024x512xbf16>
-    %1582 = stablehlo.multiply %1581, %cst_3 : tensor<1x1024x512xbf16>
-    %1583 = stablehlo.multiply %1581, %68 : tensor<1x1024x512xbf16>
-    %1584 = stablehlo.convert %1583 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %1585 = stablehlo.clamp %cst_4, %1584, %cst_5 : tensor<1x1024x512xf32>
-    %1586 = stablehlo.multiply %1585, %1585 : tensor<1x1024x512xf32>
-    %1587 = stablehlo.multiply %cst_6, %1586 : tensor<1x1024x512xf32>
-    %1588 = stablehlo.add %1587, %cst_7 : tensor<1x1024x512xf32>
-    %1589 = stablehlo.multiply %1588, %1586 : tensor<1x1024x512xf32>
-    %1590 = stablehlo.add %1589, %cst_8 : tensor<1x1024x512xf32>
-    %1591 = stablehlo.multiply %1590, %1586 : tensor<1x1024x512xf32>
-    %1592 = stablehlo.add %1591, %cst_9 : tensor<1x1024x512xf32>
-    %1593 = stablehlo.multiply %1592, %1586 : tensor<1x1024x512xf32>
-    %1594 = stablehlo.add %1593, %cst_10 : tensor<1x1024x512xf32>
-    %1595 = stablehlo.multiply %1594, %1586 : tensor<1x1024x512xf32>
-    %1596 = stablehlo.add %1595, %cst_11 : tensor<1x1024x512xf32>
-    %1597 = stablehlo.multiply %1596, %1586 : tensor<1x1024x512xf32>
-    %1598 = stablehlo.add %1597, %cst_12 : tensor<1x1024x512xf32>
-    %1599 = stablehlo.multiply %cst_13, %1586 : tensor<1x1024x512xf32>
-    %1600 = stablehlo.add %1599, %cst_14 : tensor<1x1024x512xf32>
-    %1601 = stablehlo.multiply %1600, %1586 : tensor<1x1024x512xf32>
-    %1602 = stablehlo.add %1601, %cst_15 : tensor<1x1024x512xf32>
-    %1603 = stablehlo.multiply %1602, %1586 : tensor<1x1024x512xf32>
-    %1604 = stablehlo.add %1603, %cst_16 : tensor<1x1024x512xf32>
-    %1605 = stablehlo.multiply %1604, %1586 : tensor<1x1024x512xf32>
-    %1606 = stablehlo.add %1605, %cst_17 : tensor<1x1024x512xf32>
-    %1607 = stablehlo.multiply %1585, %1598 : tensor<1x1024x512xf32>
-    %1608 = stablehlo.divide %1607, %1606 : tensor<1x1024x512xf32>
-    %1609 = stablehlo.clamp %cst_18, %1608, %cst_19 : tensor<1x1024x512xf32>
-    %1610 = stablehlo.convert %1609 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %1611 = stablehlo.add %1610, %cst_1 : tensor<1x1024x512xbf16>
-    %1612 = stablehlo.multiply %1611, %1582 : tensor<1x1024x512xbf16>
-    %1613 = stablehlo.convolution(%1612, %arg77) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %1614 = stablehlo.reshape %arg78 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %1615 = stablehlo.broadcast_in_dim %1613, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1616 = stablehlo.broadcast_in_dim %1614, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %1617 = stablehlo.add %1615, %1616 : tensor<1x256x512xbf16>
-    %1618 = stablehlo.add %1617, %1539 : tensor<1x256x512xbf16>
-    %1619 = stablehlo.convert %1618 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1620 = stablehlo.convert %1619 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1621 = stablehlo.reduce(%1620 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1622 = stablehlo.reshape %1621 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1623 = stablehlo.broadcast_in_dim %1622, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1624 = stablehlo.divide %1623, %23 : tensor<1x256x1xf64>
-    %1625 = stablehlo.broadcast_in_dim %1620, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1626 = stablehlo.broadcast_in_dim %1624, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1627 = stablehlo.subtract %1625, %1626 : tensor<1x256x512xf64>
-    %1628 = stablehlo.multiply %1627, %1627 : tensor<1x256x512xf64>
-    %1629 = stablehlo.reduce(%1628 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1630 = stablehlo.reshape %1629 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1631 = stablehlo.broadcast_in_dim %1630, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1632 = stablehlo.divide %1631, %23 : tensor<1x256x1xf64>
-    %1633 = stablehlo.convert %1632 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1634 = stablehlo.reduce(%1619 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1635 = stablehlo.reshape %1634 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1636 = stablehlo.broadcast_in_dim %1635, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1637 = stablehlo.divide %1636, %39 : tensor<1x256x1xf32>
-    %1638 = stablehlo.broadcast_in_dim %1633, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1639 = stablehlo.add %1638, %44 : tensor<1x256x1xf32>
-    %1640 = stablehlo.rsqrt %1639 : tensor<1x256x1xf32>
-    %1641 = stablehlo.broadcast_in_dim %1619, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1642 = stablehlo.broadcast_in_dim %1637, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1643 = stablehlo.subtract %1641, %1642 : tensor<1x256x512xf32>
-    %1644 = stablehlo.broadcast_in_dim %1643, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1645 = stablehlo.broadcast_in_dim %1640, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1646 = stablehlo.multiply %1644, %1645 : tensor<1x256x512xf32>
-    %1647 = stablehlo.convert %arg79 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1648 = stablehlo.broadcast_in_dim %1646, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1649 = stablehlo.broadcast_in_dim %1647, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1650 = stablehlo.multiply %1648, %1649 : tensor<1x256x512xf32>
-    %1651 = stablehlo.convert %arg80 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1652 = stablehlo.broadcast_in_dim %1650, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1653 = stablehlo.broadcast_in_dim %1651, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1654 = stablehlo.add %1652, %1653 : tensor<1x256x512xf32>
-    %1655 = stablehlo.convert %1654 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1656 = stablehlo.reshape %1655 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %1657 = stablehlo.convert %1656 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %1658 = stablehlo.dot_general %1657, %arg137, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %1659 = stablehlo.broadcast_in_dim %1658, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1660 = stablehlo.multiply %1659, %146 : tensor<256x256xf32>
-    %1661 = stablehlo.broadcast_in_dim %1660, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1662 = stablehlo.broadcast_in_dim %arg138, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1663 = stablehlo.add %1661, %1662 : tensor<256x256xf32>
-    %1664 = stablehlo.convert %1663 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1665 = stablehlo.reshape %1664 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1666 = stablehlo.multiply %1665, %cst_22 : tensor<1x256x256xbf16>
-    %1667 = stablehlo.multiply %1665, %154 : tensor<1x256x256xbf16>
-    %1668 = stablehlo.convert %1667 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1669 = stablehlo.clamp %cst_23, %1668, %cst_24 : tensor<1x256x256xf32>
-    %1670 = stablehlo.multiply %1669, %1669 : tensor<1x256x256xf32>
-    %1671 = stablehlo.multiply %cst_25, %1670 : tensor<1x256x256xf32>
-    %1672 = stablehlo.add %1671, %cst_26 : tensor<1x256x256xf32>
-    %1673 = stablehlo.multiply %1672, %1670 : tensor<1x256x256xf32>
-    %1674 = stablehlo.add %1673, %cst_27 : tensor<1x256x256xf32>
-    %1675 = stablehlo.multiply %1674, %1670 : tensor<1x256x256xf32>
-    %1676 = stablehlo.add %1675, %cst_28 : tensor<1x256x256xf32>
-    %1677 = stablehlo.multiply %1676, %1670 : tensor<1x256x256xf32>
-    %1678 = stablehlo.add %1677, %cst_29 : tensor<1x256x256xf32>
-    %1679 = stablehlo.multiply %1678, %1670 : tensor<1x256x256xf32>
-    %1680 = stablehlo.add %1679, %cst_30 : tensor<1x256x256xf32>
-    %1681 = stablehlo.multiply %1680, %1670 : tensor<1x256x256xf32>
-    %1682 = stablehlo.add %1681, %cst_31 : tensor<1x256x256xf32>
-    %1683 = stablehlo.multiply %cst_32, %1670 : tensor<1x256x256xf32>
-    %1684 = stablehlo.add %1683, %cst_33 : tensor<1x256x256xf32>
-    %1685 = stablehlo.multiply %1684, %1670 : tensor<1x256x256xf32>
-    %1686 = stablehlo.add %1685, %cst_34 : tensor<1x256x256xf32>
-    %1687 = stablehlo.multiply %1686, %1670 : tensor<1x256x256xf32>
-    %1688 = stablehlo.add %1687, %cst_35 : tensor<1x256x256xf32>
-    %1689 = stablehlo.multiply %1688, %1670 : tensor<1x256x256xf32>
-    %1690 = stablehlo.add %1689, %cst_36 : tensor<1x256x256xf32>
-    %1691 = stablehlo.multiply %1669, %1682 : tensor<1x256x256xf32>
-    %1692 = stablehlo.divide %1691, %1690 : tensor<1x256x256xf32>
-    %1693 = stablehlo.clamp %cst_37, %1692, %cst_38 : tensor<1x256x256xf32>
-    %1694 = stablehlo.convert %1693 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1695 = stablehlo.add %1694, %cst_20 : tensor<1x256x256xbf16>
-    %1696 = stablehlo.multiply %1695, %1666 : tensor<1x256x256xbf16>
-    %1697 = stablehlo.reshape %1696 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1698 = stablehlo.convert %1697 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1699 = stablehlo.dot_general %1698, %arg139, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1700 = stablehlo.broadcast_in_dim %1699, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1701 = stablehlo.multiply %1700, %9 : tensor<256x512xf32>
-    %1702 = stablehlo.broadcast_in_dim %1701, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1703 = stablehlo.broadcast_in_dim %arg140, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %1704 = stablehlo.add %1702, %1703 : tensor<256x512xf32>
-    %1705 = stablehlo.convert %1704 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %1706 = stablehlo.reshape %1705 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1707 = stablehlo.add %1706, %1618 : tensor<1x256x512xbf16>
-    %1708 = stablehlo.convert %1707 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1709 = stablehlo.convert %1708 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1710 = stablehlo.reduce(%1709 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1711 = stablehlo.reshape %1710 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1712 = stablehlo.broadcast_in_dim %1711, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1713 = stablehlo.divide %1712, %23 : tensor<1x256x1xf64>
-    %1714 = stablehlo.broadcast_in_dim %1709, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1715 = stablehlo.broadcast_in_dim %1713, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1716 = stablehlo.subtract %1714, %1715 : tensor<1x256x512xf64>
-    %1717 = stablehlo.multiply %1716, %1716 : tensor<1x256x512xf64>
-    %1718 = stablehlo.reduce(%1717 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1719 = stablehlo.reshape %1718 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1720 = stablehlo.broadcast_in_dim %1719, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1721 = stablehlo.divide %1720, %23 : tensor<1x256x1xf64>
-    %1722 = stablehlo.convert %1721 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1723 = stablehlo.reduce(%1708 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1724 = stablehlo.reshape %1723 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1725 = stablehlo.broadcast_in_dim %1724, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1726 = stablehlo.divide %1725, %39 : tensor<1x256x1xf32>
-    %1727 = stablehlo.broadcast_in_dim %1722, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1728 = stablehlo.add %1727, %44 : tensor<1x256x1xf32>
-    %1729 = stablehlo.rsqrt %1728 : tensor<1x256x1xf32>
-    %1730 = stablehlo.broadcast_in_dim %1708, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1731 = stablehlo.broadcast_in_dim %1726, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1732 = stablehlo.subtract %1730, %1731 : tensor<1x256x512xf32>
-    %1733 = stablehlo.broadcast_in_dim %1732, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1734 = stablehlo.broadcast_in_dim %1729, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1735 = stablehlo.multiply %1733, %1734 : tensor<1x256x512xf32>
-    %1736 = stablehlo.convert %arg81 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1737 = stablehlo.broadcast_in_dim %1735, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1738 = stablehlo.broadcast_in_dim %1736, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1739 = stablehlo.multiply %1737, %1738 : tensor<1x256x512xf32>
-    %1740 = stablehlo.convert %arg82 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1741 = stablehlo.broadcast_in_dim %1739, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1742 = stablehlo.broadcast_in_dim %1740, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1743 = stablehlo.add %1741, %1742 : tensor<1x256x512xf32>
-    %1744 = stablehlo.convert %1743 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1745 = stablehlo.convolution(%1744, %arg83) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1746 = stablehlo.reshape %arg84 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %1747 = stablehlo.broadcast_in_dim %1745, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %1748 = stablehlo.broadcast_in_dim %1746, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1749 = stablehlo.add %1747, %1748 : tensor<1x1024x512xbf16>
-    %1750 = stablehlo.multiply %1749, %cst_3 : tensor<1x1024x512xbf16>
-    %1751 = stablehlo.multiply %1749, %68 : tensor<1x1024x512xbf16>
-    %1752 = stablehlo.convert %1751 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %1753 = stablehlo.clamp %cst_4, %1752, %cst_5 : tensor<1x1024x512xf32>
-    %1754 = stablehlo.multiply %1753, %1753 : tensor<1x1024x512xf32>
-    %1755 = stablehlo.multiply %cst_6, %1754 : tensor<1x1024x512xf32>
-    %1756 = stablehlo.add %1755, %cst_7 : tensor<1x1024x512xf32>
-    %1757 = stablehlo.multiply %1756, %1754 : tensor<1x1024x512xf32>
-    %1758 = stablehlo.add %1757, %cst_8 : tensor<1x1024x512xf32>
-    %1759 = stablehlo.multiply %1758, %1754 : tensor<1x1024x512xf32>
-    %1760 = stablehlo.add %1759, %cst_9 : tensor<1x1024x512xf32>
-    %1761 = stablehlo.multiply %1760, %1754 : tensor<1x1024x512xf32>
-    %1762 = stablehlo.add %1761, %cst_10 : tensor<1x1024x512xf32>
-    %1763 = stablehlo.multiply %1762, %1754 : tensor<1x1024x512xf32>
-    %1764 = stablehlo.add %1763, %cst_11 : tensor<1x1024x512xf32>
-    %1765 = stablehlo.multiply %1764, %1754 : tensor<1x1024x512xf32>
-    %1766 = stablehlo.add %1765, %cst_12 : tensor<1x1024x512xf32>
-    %1767 = stablehlo.multiply %cst_13, %1754 : tensor<1x1024x512xf32>
-    %1768 = stablehlo.add %1767, %cst_14 : tensor<1x1024x512xf32>
-    %1769 = stablehlo.multiply %1768, %1754 : tensor<1x1024x512xf32>
-    %1770 = stablehlo.add %1769, %cst_15 : tensor<1x1024x512xf32>
-    %1771 = stablehlo.multiply %1770, %1754 : tensor<1x1024x512xf32>
-    %1772 = stablehlo.add %1771, %cst_16 : tensor<1x1024x512xf32>
-    %1773 = stablehlo.multiply %1772, %1754 : tensor<1x1024x512xf32>
-    %1774 = stablehlo.add %1773, %cst_17 : tensor<1x1024x512xf32>
-    %1775 = stablehlo.multiply %1753, %1766 : tensor<1x1024x512xf32>
-    %1776 = stablehlo.divide %1775, %1774 : tensor<1x1024x512xf32>
-    %1777 = stablehlo.clamp %cst_18, %1776, %cst_19 : tensor<1x1024x512xf32>
-    %1778 = stablehlo.convert %1777 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %1779 = stablehlo.add %1778, %cst_1 : tensor<1x1024x512xbf16>
-    %1780 = stablehlo.multiply %1779, %1750 : tensor<1x1024x512xbf16>
-    %1781 = stablehlo.convolution(%1780, %arg85) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %1782 = stablehlo.reshape %arg86 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %1783 = stablehlo.broadcast_in_dim %1781, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1784 = stablehlo.broadcast_in_dim %1782, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %1785 = stablehlo.add %1783, %1784 : tensor<1x256x512xbf16>
-    %1786 = stablehlo.add %1785, %1707 : tensor<1x256x512xbf16>
-    %1787 = stablehlo.convert %1786 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1788 = stablehlo.convert %1787 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1789 = stablehlo.reduce(%1788 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1790 = stablehlo.reshape %1789 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1791 = stablehlo.broadcast_in_dim %1790, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1792 = stablehlo.divide %1791, %23 : tensor<1x256x1xf64>
-    %1793 = stablehlo.broadcast_in_dim %1788, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1794 = stablehlo.broadcast_in_dim %1792, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1795 = stablehlo.subtract %1793, %1794 : tensor<1x256x512xf64>
-    %1796 = stablehlo.multiply %1795, %1795 : tensor<1x256x512xf64>
-    %1797 = stablehlo.reduce(%1796 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1798 = stablehlo.reshape %1797 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1799 = stablehlo.broadcast_in_dim %1798, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1800 = stablehlo.divide %1799, %23 : tensor<1x256x1xf64>
-    %1801 = stablehlo.convert %1800 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1802 = stablehlo.reduce(%1787 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1803 = stablehlo.reshape %1802 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1804 = stablehlo.broadcast_in_dim %1803, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1805 = stablehlo.divide %1804, %39 : tensor<1x256x1xf32>
-    %1806 = stablehlo.broadcast_in_dim %1801, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1807 = stablehlo.add %1806, %44 : tensor<1x256x1xf32>
-    %1808 = stablehlo.rsqrt %1807 : tensor<1x256x1xf32>
-    %1809 = stablehlo.broadcast_in_dim %1787, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1810 = stablehlo.broadcast_in_dim %1805, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1811 = stablehlo.subtract %1809, %1810 : tensor<1x256x512xf32>
-    %1812 = stablehlo.broadcast_in_dim %1811, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1813 = stablehlo.broadcast_in_dim %1808, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1814 = stablehlo.multiply %1812, %1813 : tensor<1x256x512xf32>
-    %1815 = stablehlo.convert %arg87 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1816 = stablehlo.broadcast_in_dim %1814, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1817 = stablehlo.broadcast_in_dim %1815, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1818 = stablehlo.multiply %1816, %1817 : tensor<1x256x512xf32>
-    %1819 = stablehlo.convert %arg88 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1820 = stablehlo.broadcast_in_dim %1818, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1821 = stablehlo.broadcast_in_dim %1819, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1822 = stablehlo.add %1820, %1821 : tensor<1x256x512xf32>
-    %1823 = stablehlo.convert %1822 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1824 = stablehlo.reshape %1823 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %1825 = stablehlo.convert %1824 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %1826 = stablehlo.dot_general %1825, %arg141, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %1827 = stablehlo.broadcast_in_dim %1826, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1828 = stablehlo.multiply %1827, %146 : tensor<256x256xf32>
-    %1829 = stablehlo.broadcast_in_dim %1828, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1830 = stablehlo.broadcast_in_dim %arg142, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1831 = stablehlo.add %1829, %1830 : tensor<256x256xf32>
-    %1832 = stablehlo.convert %1831 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1833 = stablehlo.reshape %1832 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1834 = stablehlo.multiply %1833, %cst_22 : tensor<1x256x256xbf16>
-    %1835 = stablehlo.multiply %1833, %154 : tensor<1x256x256xbf16>
-    %1836 = stablehlo.convert %1835 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1837 = stablehlo.clamp %cst_23, %1836, %cst_24 : tensor<1x256x256xf32>
-    %1838 = stablehlo.multiply %1837, %1837 : tensor<1x256x256xf32>
-    %1839 = stablehlo.multiply %cst_25, %1838 : tensor<1x256x256xf32>
-    %1840 = stablehlo.add %1839, %cst_26 : tensor<1x256x256xf32>
-    %1841 = stablehlo.multiply %1840, %1838 : tensor<1x256x256xf32>
-    %1842 = stablehlo.add %1841, %cst_27 : tensor<1x256x256xf32>
-    %1843 = stablehlo.multiply %1842, %1838 : tensor<1x256x256xf32>
-    %1844 = stablehlo.add %1843, %cst_28 : tensor<1x256x256xf32>
-    %1845 = stablehlo.multiply %1844, %1838 : tensor<1x256x256xf32>
-    %1846 = stablehlo.add %1845, %cst_29 : tensor<1x256x256xf32>
-    %1847 = stablehlo.multiply %1846, %1838 : tensor<1x256x256xf32>
-    %1848 = stablehlo.add %1847, %cst_30 : tensor<1x256x256xf32>
-    %1849 = stablehlo.multiply %1848, %1838 : tensor<1x256x256xf32>
-    %1850 = stablehlo.add %1849, %cst_31 : tensor<1x256x256xf32>
-    %1851 = stablehlo.multiply %cst_32, %1838 : tensor<1x256x256xf32>
-    %1852 = stablehlo.add %1851, %cst_33 : tensor<1x256x256xf32>
-    %1853 = stablehlo.multiply %1852, %1838 : tensor<1x256x256xf32>
-    %1854 = stablehlo.add %1853, %cst_34 : tensor<1x256x256xf32>
-    %1855 = stablehlo.multiply %1854, %1838 : tensor<1x256x256xf32>
-    %1856 = stablehlo.add %1855, %cst_35 : tensor<1x256x256xf32>
-    %1857 = stablehlo.multiply %1856, %1838 : tensor<1x256x256xf32>
-    %1858 = stablehlo.add %1857, %cst_36 : tensor<1x256x256xf32>
-    %1859 = stablehlo.multiply %1837, %1850 : tensor<1x256x256xf32>
-    %1860 = stablehlo.divide %1859, %1858 : tensor<1x256x256xf32>
-    %1861 = stablehlo.clamp %cst_37, %1860, %cst_38 : tensor<1x256x256xf32>
-    %1862 = stablehlo.convert %1861 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1863 = stablehlo.add %1862, %cst_20 : tensor<1x256x256xbf16>
-    %1864 = stablehlo.multiply %1863, %1834 : tensor<1x256x256xbf16>
-    %1865 = stablehlo.reshape %1864 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1866 = stablehlo.convert %1865 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1867 = stablehlo.dot_general %1866, %arg143, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1868 = stablehlo.broadcast_in_dim %1867, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1869 = stablehlo.multiply %1868, %9 : tensor<256x512xf32>
-    %1870 = stablehlo.broadcast_in_dim %1869, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %1871 = stablehlo.broadcast_in_dim %arg144, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %1872 = stablehlo.add %1870, %1871 : tensor<256x512xf32>
-    %1873 = stablehlo.convert %1872 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %1874 = stablehlo.reshape %1873 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1875 = stablehlo.add %1874, %1786 : tensor<1x256x512xbf16>
-    %1876 = stablehlo.convert %1875 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1877 = stablehlo.convert %1876 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1878 = stablehlo.reduce(%1877 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1879 = stablehlo.reshape %1878 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1880 = stablehlo.broadcast_in_dim %1879, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1881 = stablehlo.divide %1880, %23 : tensor<1x256x1xf64>
-    %1882 = stablehlo.broadcast_in_dim %1877, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1883 = stablehlo.broadcast_in_dim %1881, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1884 = stablehlo.subtract %1882, %1883 : tensor<1x256x512xf64>
-    %1885 = stablehlo.multiply %1884, %1884 : tensor<1x256x512xf64>
-    %1886 = stablehlo.reduce(%1885 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1887 = stablehlo.reshape %1886 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1888 = stablehlo.broadcast_in_dim %1887, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1889 = stablehlo.divide %1888, %23 : tensor<1x256x1xf64>
-    %1890 = stablehlo.convert %1889 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1891 = stablehlo.reduce(%1876 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1892 = stablehlo.reshape %1891 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1893 = stablehlo.broadcast_in_dim %1892, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1894 = stablehlo.divide %1893, %39 : tensor<1x256x1xf32>
-    %1895 = stablehlo.broadcast_in_dim %1890, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1896 = stablehlo.add %1895, %44 : tensor<1x256x1xf32>
-    %1897 = stablehlo.rsqrt %1896 : tensor<1x256x1xf32>
-    %1898 = stablehlo.broadcast_in_dim %1876, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1899 = stablehlo.broadcast_in_dim %1894, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1900 = stablehlo.subtract %1898, %1899 : tensor<1x256x512xf32>
-    %1901 = stablehlo.broadcast_in_dim %1900, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1902 = stablehlo.broadcast_in_dim %1897, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1903 = stablehlo.multiply %1901, %1902 : tensor<1x256x512xf32>
-    %1904 = stablehlo.convert %arg89 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1905 = stablehlo.broadcast_in_dim %1903, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1906 = stablehlo.broadcast_in_dim %1904, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1907 = stablehlo.multiply %1905, %1906 : tensor<1x256x512xf32>
-    %1908 = stablehlo.convert %arg90 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1909 = stablehlo.broadcast_in_dim %1907, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1910 = stablehlo.broadcast_in_dim %1908, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1911 = stablehlo.add %1909, %1910 : tensor<1x256x512xf32>
-    %1912 = stablehlo.convert %1911 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1913 = stablehlo.convolution(%1912, %arg91) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x512xbf16>, tensor<1024x256x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1914 = stablehlo.reshape %arg92 : (tensor<1024xbf16>) -> tensor<1024x1xbf16>
-    %1915 = stablehlo.broadcast_in_dim %1913, dims = [0, 1, 2] : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xbf16>
-    %1916 = stablehlo.broadcast_in_dim %1914, dims = [1, 2] : (tensor<1024x1xbf16>) -> tensor<1x1024x512xbf16>
-    %1917 = stablehlo.add %1915, %1916 : tensor<1x1024x512xbf16>
-    %1918 = stablehlo.multiply %1917, %cst_3 : tensor<1x1024x512xbf16>
-    %1919 = stablehlo.multiply %1917, %68 : tensor<1x1024x512xbf16>
-    %1920 = stablehlo.convert %1919 : (tensor<1x1024x512xbf16>) -> tensor<1x1024x512xf32>
-    %1921 = stablehlo.clamp %cst_4, %1920, %cst_5 : tensor<1x1024x512xf32>
-    %1922 = stablehlo.multiply %1921, %1921 : tensor<1x1024x512xf32>
-    %1923 = stablehlo.multiply %cst_6, %1922 : tensor<1x1024x512xf32>
-    %1924 = stablehlo.add %1923, %cst_7 : tensor<1x1024x512xf32>
-    %1925 = stablehlo.multiply %1924, %1922 : tensor<1x1024x512xf32>
-    %1926 = stablehlo.add %1925, %cst_8 : tensor<1x1024x512xf32>
-    %1927 = stablehlo.multiply %1926, %1922 : tensor<1x1024x512xf32>
-    %1928 = stablehlo.add %1927, %cst_9 : tensor<1x1024x512xf32>
-    %1929 = stablehlo.multiply %1928, %1922 : tensor<1x1024x512xf32>
-    %1930 = stablehlo.add %1929, %cst_10 : tensor<1x1024x512xf32>
-    %1931 = stablehlo.multiply %1930, %1922 : tensor<1x1024x512xf32>
-    %1932 = stablehlo.add %1931, %cst_11 : tensor<1x1024x512xf32>
-    %1933 = stablehlo.multiply %1932, %1922 : tensor<1x1024x512xf32>
-    %1934 = stablehlo.add %1933, %cst_12 : tensor<1x1024x512xf32>
-    %1935 = stablehlo.multiply %cst_13, %1922 : tensor<1x1024x512xf32>
-    %1936 = stablehlo.add %1935, %cst_14 : tensor<1x1024x512xf32>
-    %1937 = stablehlo.multiply %1936, %1922 : tensor<1x1024x512xf32>
-    %1938 = stablehlo.add %1937, %cst_15 : tensor<1x1024x512xf32>
-    %1939 = stablehlo.multiply %1938, %1922 : tensor<1x1024x512xf32>
-    %1940 = stablehlo.add %1939, %cst_16 : tensor<1x1024x512xf32>
-    %1941 = stablehlo.multiply %1940, %1922 : tensor<1x1024x512xf32>
-    %1942 = stablehlo.add %1941, %cst_17 : tensor<1x1024x512xf32>
-    %1943 = stablehlo.multiply %1921, %1934 : tensor<1x1024x512xf32>
-    %1944 = stablehlo.divide %1943, %1942 : tensor<1x1024x512xf32>
-    %1945 = stablehlo.clamp %cst_18, %1944, %cst_19 : tensor<1x1024x512xf32>
-    %1946 = stablehlo.convert %1945 : (tensor<1x1024x512xf32>) -> tensor<1x1024x512xbf16>
-    %1947 = stablehlo.add %1946, %cst_1 : tensor<1x1024x512xbf16>
-    %1948 = stablehlo.multiply %1947, %1918 : tensor<1x1024x512xbf16>
-    %1949 = stablehlo.convolution(%1948, %arg93) dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], window = {stride = [1], pad = [[0, 0]], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x512xbf16>, tensor<256x1024x1xbf16>) -> tensor<1x256x512xbf16>
-    %1950 = stablehlo.reshape %arg94 : (tensor<256xbf16>) -> tensor<256x1xbf16>
-    %1951 = stablehlo.broadcast_in_dim %1949, dims = [0, 1, 2] : (tensor<1x256x512xbf16>) -> tensor<1x256x512xbf16>
-    %1952 = stablehlo.broadcast_in_dim %1950, dims = [1, 2] : (tensor<256x1xbf16>) -> tensor<1x256x512xbf16>
-    %1953 = stablehlo.add %1951, %1952 : tensor<1x256x512xbf16>
-    %1954 = stablehlo.add %1953, %1875 : tensor<1x256x512xbf16>
-    %1955 = stablehlo.convert %1954 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %1956 = stablehlo.convert %1955 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %1957 = stablehlo.reduce(%1956 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1958 = stablehlo.reshape %1957 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1959 = stablehlo.broadcast_in_dim %1958, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1960 = stablehlo.divide %1959, %23 : tensor<1x256x1xf64>
-    %1961 = stablehlo.broadcast_in_dim %1956, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %1962 = stablehlo.broadcast_in_dim %1960, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %1963 = stablehlo.subtract %1961, %1962 : tensor<1x256x512xf64>
-    %1964 = stablehlo.multiply %1963, %1963 : tensor<1x256x512xf64>
-    %1965 = stablehlo.reduce(%1964 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1966 = stablehlo.reshape %1965 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1967 = stablehlo.broadcast_in_dim %1966, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1968 = stablehlo.divide %1967, %23 : tensor<1x256x1xf64>
-    %1969 = stablehlo.convert %1968 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1970 = stablehlo.reduce(%1955 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1971 = stablehlo.reshape %1970 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1972 = stablehlo.broadcast_in_dim %1971, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1973 = stablehlo.divide %1972, %39 : tensor<1x256x1xf32>
-    %1974 = stablehlo.broadcast_in_dim %1969, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1975 = stablehlo.add %1974, %44 : tensor<1x256x1xf32>
-    %1976 = stablehlo.rsqrt %1975 : tensor<1x256x1xf32>
-    %1977 = stablehlo.broadcast_in_dim %1955, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1978 = stablehlo.broadcast_in_dim %1973, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1979 = stablehlo.subtract %1977, %1978 : tensor<1x256x512xf32>
-    %1980 = stablehlo.broadcast_in_dim %1979, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1981 = stablehlo.broadcast_in_dim %1976, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %1982 = stablehlo.multiply %1980, %1981 : tensor<1x256x512xf32>
-    %1983 = stablehlo.convert %arg95 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1984 = stablehlo.broadcast_in_dim %1982, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1985 = stablehlo.broadcast_in_dim %1983, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1986 = stablehlo.multiply %1984, %1985 : tensor<1x256x512xf32>
-    %1987 = stablehlo.convert %arg96 : (tensor<512xbf16>) -> tensor<512xf32>
-    %1988 = stablehlo.broadcast_in_dim %1986, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %1989 = stablehlo.broadcast_in_dim %1987, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %1990 = stablehlo.add %1988, %1989 : tensor<1x256x512xf32>
-    %1991 = stablehlo.convert %1990 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %1992 = stablehlo.reshape %1991 : (tensor<1x256x512xbf16>) -> tensor<256x512xbf16>
-    %1993 = stablehlo.convert %1992 : (tensor<256x512xbf16>) -> tensor<256x512xf32>
-    %1994 = stablehlo.dot_general %1993, %arg145, contracting_dims = [1] x [0] : (tensor<256x512xf32>, tensor<512x256xf32>) -> tensor<256x256xf32>
-    %1995 = stablehlo.broadcast_in_dim %1994, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1996 = stablehlo.multiply %1995, %146 : tensor<256x256xf32>
-    %1997 = stablehlo.broadcast_in_dim %1996, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1998 = stablehlo.broadcast_in_dim %arg146, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1999 = stablehlo.add %1997, %1998 : tensor<256x256xf32>
-    %2000 = stablehlo.convert %1999 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2001 = stablehlo.reshape %2000 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2002 = stablehlo.multiply %2001, %cst_22 : tensor<1x256x256xbf16>
-    %2003 = stablehlo.multiply %2001, %154 : tensor<1x256x256xbf16>
-    %2004 = stablehlo.convert %2003 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %2005 = stablehlo.clamp %cst_23, %2004, %cst_24 : tensor<1x256x256xf32>
-    %2006 = stablehlo.multiply %2005, %2005 : tensor<1x256x256xf32>
-    %2007 = stablehlo.multiply %cst_25, %2006 : tensor<1x256x256xf32>
-    %2008 = stablehlo.add %2007, %cst_26 : tensor<1x256x256xf32>
-    %2009 = stablehlo.multiply %2008, %2006 : tensor<1x256x256xf32>
-    %2010 = stablehlo.add %2009, %cst_27 : tensor<1x256x256xf32>
-    %2011 = stablehlo.multiply %2010, %2006 : tensor<1x256x256xf32>
-    %2012 = stablehlo.add %2011, %cst_28 : tensor<1x256x256xf32>
-    %2013 = stablehlo.multiply %2012, %2006 : tensor<1x256x256xf32>
-    %2014 = stablehlo.add %2013, %cst_29 : tensor<1x256x256xf32>
-    %2015 = stablehlo.multiply %2014, %2006 : tensor<1x256x256xf32>
-    %2016 = stablehlo.add %2015, %cst_30 : tensor<1x256x256xf32>
-    %2017 = stablehlo.multiply %2016, %2006 : tensor<1x256x256xf32>
-    %2018 = stablehlo.add %2017, %cst_31 : tensor<1x256x256xf32>
-    %2019 = stablehlo.multiply %cst_32, %2006 : tensor<1x256x256xf32>
-    %2020 = stablehlo.add %2019, %cst_33 : tensor<1x256x256xf32>
-    %2021 = stablehlo.multiply %2020, %2006 : tensor<1x256x256xf32>
-    %2022 = stablehlo.add %2021, %cst_34 : tensor<1x256x256xf32>
-    %2023 = stablehlo.multiply %2022, %2006 : tensor<1x256x256xf32>
-    %2024 = stablehlo.add %2023, %cst_35 : tensor<1x256x256xf32>
-    %2025 = stablehlo.multiply %2024, %2006 : tensor<1x256x256xf32>
-    %2026 = stablehlo.add %2025, %cst_36 : tensor<1x256x256xf32>
-    %2027 = stablehlo.multiply %2005, %2018 : tensor<1x256x256xf32>
-    %2028 = stablehlo.divide %2027, %2026 : tensor<1x256x256xf32>
-    %2029 = stablehlo.clamp %cst_37, %2028, %cst_38 : tensor<1x256x256xf32>
-    %2030 = stablehlo.convert %2029 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %2031 = stablehlo.add %2030, %cst_20 : tensor<1x256x256xbf16>
-    %2032 = stablehlo.multiply %2031, %2002 : tensor<1x256x256xbf16>
-    %2033 = stablehlo.reshape %2032 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2034 = stablehlo.convert %2033 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %2035 = stablehlo.dot_general %2034, %arg147, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x512xf32>) -> tensor<256x512xf32>
-    %2036 = stablehlo.broadcast_in_dim %2035, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %2037 = stablehlo.multiply %2036, %9 : tensor<256x512xf32>
-    %2038 = stablehlo.broadcast_in_dim %2037, dims = [0, 1] : (tensor<256x512xf32>) -> tensor<256x512xf32>
-    %2039 = stablehlo.broadcast_in_dim %arg148, dims = [1] : (tensor<512xf32>) -> tensor<256x512xf32>
-    %2040 = stablehlo.add %2038, %2039 : tensor<256x512xf32>
-    %2041 = stablehlo.convert %2040 : (tensor<256x512xf32>) -> tensor<256x512xbf16>
-    %2042 = stablehlo.reshape %2041 : (tensor<256x512xbf16>) -> tensor<1x256x512xbf16>
-    %2043 = stablehlo.add %2042, %1954 : tensor<1x256x512xbf16>
-    %2044 = stablehlo.convert %2043 : (tensor<1x256x512xbf16>) -> tensor<1x256x512xf32>
-    %2045 = stablehlo.convert %2044 : (tensor<1x256x512xf32>) -> tensor<1x256x512xf64>
-    %2046 = stablehlo.reduce(%2045 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2047 = stablehlo.reshape %2046 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2048 = stablehlo.broadcast_in_dim %2047, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2049 = stablehlo.divide %2048, %23 : tensor<1x256x1xf64>
-    %2050 = stablehlo.broadcast_in_dim %2045, dims = [0, 1, 2] : (tensor<1x256x512xf64>) -> tensor<1x256x512xf64>
-    %2051 = stablehlo.broadcast_in_dim %2049, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x512xf64>
-    %2052 = stablehlo.subtract %2050, %2051 : tensor<1x256x512xf64>
-    %2053 = stablehlo.multiply %2052, %2052 : tensor<1x256x512xf64>
-    %2054 = stablehlo.reduce(%2053 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2055 = stablehlo.reshape %2054 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2056 = stablehlo.broadcast_in_dim %2055, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2057 = stablehlo.divide %2056, %23 : tensor<1x256x1xf64>
-    %2058 = stablehlo.convert %2057 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2059 = stablehlo.reduce(%2044 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x512xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2060 = stablehlo.reshape %2059 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2061 = stablehlo.broadcast_in_dim %2060, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2062 = stablehlo.divide %2061, %39 : tensor<1x256x1xf32>
-    %2063 = stablehlo.broadcast_in_dim %2058, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2064 = stablehlo.add %2063, %44 : tensor<1x256x1xf32>
-    %2065 = stablehlo.rsqrt %2064 : tensor<1x256x1xf32>
-    %2066 = stablehlo.broadcast_in_dim %2044, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %2067 = stablehlo.broadcast_in_dim %2062, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %2068 = stablehlo.subtract %2066, %2067 : tensor<1x256x512xf32>
-    %2069 = stablehlo.broadcast_in_dim %2068, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %2070 = stablehlo.broadcast_in_dim %2065, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x512xf32>
-    %2071 = stablehlo.multiply %2069, %2070 : tensor<1x256x512xf32>
-    %2072 = stablehlo.convert %arg97 : (tensor<512xbf16>) -> tensor<512xf32>
-    %2073 = stablehlo.broadcast_in_dim %2071, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %2074 = stablehlo.broadcast_in_dim %2072, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %2075 = stablehlo.multiply %2073, %2074 : tensor<1x256x512xf32>
-    %2076 = stablehlo.convert %arg98 : (tensor<512xbf16>) -> tensor<512xf32>
-    %2077 = stablehlo.broadcast_in_dim %2075, dims = [0, 1, 2] : (tensor<1x256x512xf32>) -> tensor<1x256x512xf32>
-    %2078 = stablehlo.broadcast_in_dim %2076, dims = [2] : (tensor<512xf32>) -> tensor<1x256x512xf32>
-    %2079 = stablehlo.add %2077, %2078 : tensor<1x256x512xf32>
-    %2080 = stablehlo.convert %2079 : (tensor<1x256x512xf32>) -> tensor<1x256x512xbf16>
-    %2081 = stablehlo.transpose %2080, dims = [0, 2, 1] : (tensor<1x256x512xbf16>) -> tensor<1x512x256xbf16>
-    %2082 = stablehlo.reduce(%2081 init: %cst_39) applies stablehlo.add across dimensions = [2] : (tensor<1x512x256xbf16>, tensor<bf16>) -> tensor<1x512xbf16>
-    %2083 = stablehlo.convert %cst_43 : (tensor<1xi64>) -> tensor<1xbf16>
-    %2084 = stablehlo.reshape %2083 : (tensor<1xbf16>) -> tensor<bf16>
-    %2085 = stablehlo.broadcast_in_dim %2082, dims = [0, 1] : (tensor<1x512xbf16>) -> tensor<1x512xbf16>
-    %2086 = stablehlo.broadcast_in_dim %2084, dims = [] : (tensor<bf16>) -> tensor<1x512xbf16>
-    %2087 = stablehlo.divide %2085, %2086 : tensor<1x512xbf16>
-    %2088 = stablehlo.convert %2087 : (tensor<1x512xbf16>) -> tensor<1x512xf32>
-    %2089 = stablehlo.dot_general %2088, %arg149, contracting_dims = [1] x [0] : (tensor<1x512xf32>, tensor<512x1000xf32>) -> tensor<1x1000xf32>
-    %2090 = stablehlo.broadcast_in_dim %2089, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %2091 = stablehlo.broadcast_in_dim %7, dims = [] : (tensor<f32>) -> tensor<1x1000xf32>
-    %2092 = stablehlo.multiply %2090, %2091 : tensor<1x1000xf32>
-    %2093 = stablehlo.broadcast_in_dim %2092, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %2094 = stablehlo.broadcast_in_dim %arg150, dims = [1] : (tensor<1000xf32>) -> tensor<1x1000xf32>
-    %2095 = stablehlo.add %2093, %2094 : tensor<1x1000xf32>
-    %2096 = stablehlo.convert %2095 : (tensor<1x1000xf32>) -> tensor<1x1000xbf16>
-    return %2096 : tensor<1x1000xbf16>
-  }
-}
diff --git a/mlir_tests/Mnist.mlir b/mlir_tests/Mnist.mlir
deleted file mode 100644
index 90a90d1b..00000000
--- a/mlir_tests/Mnist.mlir
+++ /dev/null
@@ -1,65 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x1x28x28xbf16>, %arg1: tensor<32x1x3x3xbf16>, %arg2: tensor<32xbf16>, %arg3: tensor<64x32x3x3xbf16>, %arg4: tensor<64xbf16>, %arg5: tensor<9216x128xf32>, %arg6: tensor<128xf32>, %arg7: tensor<128x10xf32>, %arg8: tensor<10xf32>) -> tensor<1x10xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<1x32x26x26xbf16>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x24x24xbf16>
-    %cst_1 = stablehlo.constant dense<0xFF80> : tensor<bf16>
-    %cst_2 = stablehlo.constant dense<0.000000e+00> : tensor<1x128xbf16>
-    %cst_3 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_4 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_5 = arith.constant dense<1> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1x28x28xbf16>, tensor<32x1x3x3xbf16>) -> tensor<1x32x26x26xbf16>
-    %1 = stablehlo.reshape %arg2 : (tensor<32xbf16>) -> tensor<32x1x1xbf16>
-    %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2, 3] : (tensor<1x32x26x26xbf16>) -> tensor<1x32x26x26xbf16>
-    %3 = stablehlo.broadcast_in_dim %1, dims = [1, 2, 3] : (tensor<32x1x1xbf16>) -> tensor<1x32x26x26xbf16>
-    %4 = stablehlo.add %2, %3 : tensor<1x32x26x26xbf16>
-    %5 = stablehlo.maximum %4, %cst : tensor<1x32x26x26xbf16>
-    %6 = stablehlo.convolution(%5, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x26x26xbf16>, tensor<64x32x3x3xbf16>) -> tensor<1x64x24x24xbf16>
-    %7 = stablehlo.reshape %arg4 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %8 = stablehlo.broadcast_in_dim %6, dims = [0, 1, 2, 3] : (tensor<1x64x24x24xbf16>) -> tensor<1x64x24x24xbf16>
-    %9 = stablehlo.broadcast_in_dim %7, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x24x24xbf16>
-    %10 = stablehlo.add %8, %9 : tensor<1x64x24x24xbf16>
-    %11 = stablehlo.maximum %10, %cst_0 : tensor<1x64x24x24xbf16>
-    %12 = "stablehlo.reduce_window"(%11, %cst_1) <{padding = dense<0> : tensor<4x2xi64>, window_dilations = array<i64: 1, 1, 1, 1>, window_dimensions = array<i64: 1, 1, 2, 2>, window_strides = array<i64: 1, 1, 2, 2>}> ({
-    ^bb0(%arg9: tensor<bf16>, %arg10: tensor<bf16>):
-      %49 = stablehlo.maximum %arg9, %arg10 : tensor<bf16>
-      stablehlo.return %49 : tensor<bf16>
-    }) : (tensor<1x64x24x24xbf16>, tensor<bf16>) -> tensor<1x64x12x12xbf16>
-    %13 = stablehlo.reshape %12 : (tensor<1x64x12x12xbf16>) -> tensor<1x9216xbf16>
-    %14 = stablehlo.convert %13 : (tensor<1x9216xbf16>) -> tensor<1x9216xf32>
-    %15 = stablehlo.dot_general %14, %arg5, contracting_dims = [1] x [0] : (tensor<1x9216xf32>, tensor<9216x128xf32>) -> tensor<1x128xf32>
-    %16 = stablehlo.convert %cst_5 : (tensor<1xi64>) -> tensor<1xf32>
-    %17 = stablehlo.reshape %16 : (tensor<1xf32>) -> tensor<f32>
-    %18 = stablehlo.broadcast_in_dim %15, dims = [0, 1] : (tensor<1x128xf32>) -> tensor<1x128xf32>
-    %19 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<f32>) -> tensor<1x128xf32>
-    %20 = stablehlo.multiply %18, %19 : tensor<1x128xf32>
-    %21 = stablehlo.broadcast_in_dim %20, dims = [0, 1] : (tensor<1x128xf32>) -> tensor<1x128xf32>
-    %22 = stablehlo.broadcast_in_dim %arg6, dims = [1] : (tensor<128xf32>) -> tensor<1x128xf32>
-    %23 = stablehlo.add %21, %22 : tensor<1x128xf32>
-    %24 = stablehlo.convert %23 : (tensor<1x128xf32>) -> tensor<1x128xbf16>
-    %25 = stablehlo.maximum %24, %cst_2 : tensor<1x128xbf16>
-    %26 = stablehlo.convert %25 : (tensor<1x128xbf16>) -> tensor<1x128xf32>
-    %27 = stablehlo.dot_general %26, %arg7, contracting_dims = [1] x [0] : (tensor<1x128xf32>, tensor<128x10xf32>) -> tensor<1x10xf32>
-    %28 = stablehlo.broadcast_in_dim %27, dims = [0, 1] : (tensor<1x10xf32>) -> tensor<1x10xf32>
-    %29 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<f32>) -> tensor<1x10xf32>
-    %30 = stablehlo.multiply %28, %29 : tensor<1x10xf32>
-    %31 = stablehlo.broadcast_in_dim %30, dims = [0, 1] : (tensor<1x10xf32>) -> tensor<1x10xf32>
-    %32 = stablehlo.broadcast_in_dim %arg8, dims = [1] : (tensor<10xf32>) -> tensor<1x10xf32>
-    %33 = stablehlo.add %31, %32 : tensor<1x10xf32>
-    %34 = stablehlo.convert %33 : (tensor<1x10xf32>) -> tensor<1x10xbf16>
-    %35 = stablehlo.convert %34 : (tensor<1x10xbf16>) -> tensor<1x10xf32>
-    %36 = stablehlo.reduce(%35 init: %cst_3) applies stablehlo.maximum across dimensions = [1] : (tensor<1x10xf32>, tensor<f32>) -> tensor<1xf32>
-    %37 = stablehlo.reshape %36 : (tensor<1xf32>) -> tensor<1x1xf32>
-    %38 = stablehlo.broadcast_in_dim %35, dims = [0, 1] : (tensor<1x10xf32>) -> tensor<1x10xf32>
-    %39 = stablehlo.broadcast_in_dim %37, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x10xf32>
-    %40 = stablehlo.subtract %38, %39 : tensor<1x10xf32>
-    %41 = stablehlo.exponential %40 : tensor<1x10xf32>
-    %42 = stablehlo.reduce(%41 init: %cst_4) applies stablehlo.add across dimensions = [1] : (tensor<1x10xf32>, tensor<f32>) -> tensor<1xf32>
-    %43 = stablehlo.reshape %42 : (tensor<1xf32>) -> tensor<1x1xf32>
-    %44 = stablehlo.log %43 : tensor<1x1xf32>
-    %45 = stablehlo.broadcast_in_dim %40, dims = [0, 1] : (tensor<1x10xf32>) -> tensor<1x10xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x10xf32>
-    %47 = stablehlo.subtract %45, %46 : tensor<1x10xf32>
-    %48 = stablehlo.convert %47 : (tensor<1x10xf32>) -> tensor<1x10xbf16>
-    return %48 : tensor<1x10xbf16>
-  }
-}
diff --git a/mlir_tests/MobileNetSSD.mlir b/mlir_tests/MobileNetSSD.mlir
deleted file mode 100644
index 498e0544..00000000
--- a/mlir_tests/MobileNetSSD.mlir
+++ /dev/null
@@ -1,1643 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x320x320xf32>, %arg1: tensor<16x3x3x3xf32>, %arg2: tensor<16x1x3x3xf32>, %arg3: tensor<16x16x1x1xf32>, %arg4: tensor<64x16x1x1xf32>, %arg5: tensor<64x1x3x3xf32>, %arg6: tensor<24x64x1x1xf32>, %arg7: tensor<72x24x1x1xf32>, %arg8: tensor<72x1x3x3xf32>, %arg9: tensor<24x72x1x1xf32>, %arg10: tensor<72x24x1x1xf32>, %arg11: tensor<72x1x5x5xf32>, %arg12: tensor<24x72x1x1xf32>, %arg13: tensor<24xf32>, %arg14: tensor<72x24x1x1xf32>, %arg15: tensor<72xf32>, %arg16: tensor<40x72x1x1xf32>, %arg17: tensor<120x40x1x1xf32>, %arg18: tensor<120x1x5x5xf32>, %arg19: tensor<32x120x1x1xf32>, %arg20: tensor<32xf32>, %arg21: tensor<120x32x1x1xf32>, %arg22: tensor<120xf32>, %arg23: tensor<40x120x1x1xf32>, %arg24: tensor<120x40x1x1xf32>, %arg25: tensor<120x1x5x5xf32>, %arg26: tensor<32x120x1x1xf32>, %arg27: tensor<32xf32>, %arg28: tensor<120x32x1x1xf32>, %arg29: tensor<120xf32>, %arg30: tensor<40x120x1x1xf32>, %arg31: tensor<240x40x1x1xf32>, %arg32: tensor<240x1x3x3xf32>, %arg33: tensor<80x240x1x1xf32>, %arg34: tensor<200x80x1x1xf32>, %arg35: tensor<200x1x3x3xf32>, %arg36: tensor<80x200x1x1xf32>, %arg37: tensor<184x80x1x1xf32>, %arg38: tensor<184x1x3x3xf32>, %arg39: tensor<80x184x1x1xf32>, %arg40: tensor<184x80x1x1xf32>, %arg41: tensor<184x1x3x3xf32>, %arg42: tensor<80x184x1x1xf32>, %arg43: tensor<480x80x1x1xf32>, %arg44: tensor<480x1x3x3xf32>, %arg45: tensor<120x480x1x1xf32>, %arg46: tensor<120xf32>, %arg47: tensor<480x120x1x1xf32>, %arg48: tensor<480xf32>, %arg49: tensor<112x480x1x1xf32>, %arg50: tensor<672x112x1x1xf32>, %arg51: tensor<672x1x3x3xf32>, %arg52: tensor<168x672x1x1xf32>, %arg53: tensor<168xf32>, %arg54: tensor<672x168x1x1xf32>, %arg55: tensor<672xf32>, %arg56: tensor<112x672x1x1xf32>, %arg57: tensor<672x112x1x1xf32>, %arg58: tensor<672x1x5x5xf32>, %arg59: tensor<168x672x1x1xf32>, %arg60: tensor<168xf32>, %arg61: tensor<672x168x1x1xf32>, %arg62: tensor<672xf32>, %arg63: tensor<80x672x1x1xf32>, %arg64: tensor<480x80x1x1xf32>, %arg65: tensor<480x1x5x5xf32>, %arg66: tensor<120x480x1x1xf32>, %arg67: tensor<120xf32>, %arg68: tensor<480x120x1x1xf32>, %arg69: tensor<480xf32>, %arg70: tensor<80x480x1x1xf32>, %arg71: tensor<480x80x1x1xf32>, %arg72: tensor<480x1x5x5xf32>, %arg73: tensor<120x480x1x1xf32>, %arg74: tensor<120xf32>, %arg75: tensor<480x120x1x1xf32>, %arg76: tensor<480xf32>, %arg77: tensor<80x480x1x1xf32>, %arg78: tensor<480x80x1x1xf32>, %arg79: tensor<256x480x1x1xf32>, %arg80: tensor<256x1x3x3xf32>, %arg81: tensor<512x256x1x1xf32>, %arg82: tensor<128x512x1x1xf32>, %arg83: tensor<128x1x3x3xf32>, %arg84: tensor<256x128x1x1xf32>, %arg85: tensor<128x256x1x1xf32>, %arg86: tensor<128x1x3x3xf32>, %arg87: tensor<256x128x1x1xf32>, %arg88: tensor<64x256x1x1xf32>, %arg89: tensor<64x1x3x3xf32>, %arg90: tensor<128x64x1x1xf32>, %arg91: tensor<672x1x3x3xf32>, %arg92: tensor<24x672x1x1xf32>, %arg93: tensor<24xf32>, %arg94: tensor<480x1x3x3xf32>, %arg95: tensor<24x480x1x1xf32>, %arg96: tensor<24xf32>, %arg97: tensor<512x1x3x3xf32>, %arg98: tensor<24x512x1x1xf32>, %arg99: tensor<24xf32>, %arg100: tensor<256x1x3x3xf32>, %arg101: tensor<24x256x1x1xf32>, %arg102: tensor<24xf32>, %arg103: tensor<256x1x3x3xf32>, %arg104: tensor<24x256x1x1xf32>, %arg105: tensor<24xf32>, %arg106: tensor<128x1x3x3xf32>, %arg107: tensor<24x128x1x1xf32>, %arg108: tensor<24xf32>, %arg109: tensor<672x1x3x3xf32>, %arg110: tensor<546x672x1x1xf32>, %arg111: tensor<546xf32>, %arg112: tensor<480x1x3x3xf32>, %arg113: tensor<546x480x1x1xf32>, %arg114: tensor<546xf32>, %arg115: tensor<512x1x3x3xf32>, %arg116: tensor<546x512x1x1xf32>, %arg117: tensor<546xf32>, %arg118: tensor<256x1x3x3xf32>, %arg119: tensor<546x256x1x1xf32>, %arg120: tensor<546xf32>, %arg121: tensor<256x1x3x3xf32>, %arg122: tensor<546x256x1x1xf32>, %arg123: tensor<546xf32>, %arg124: tensor<128x1x3x3xf32>, %arg125: tensor<546x128x1x1xf32>, %arg126: tensor<546xf32>, %arg127: tensor<3x1x1xf32>, %arg128: tensor<3x1x1xf32>, %arg129: tensor<3x320x320xf32>, %arg130: tensor<3x320x320xf32>, %arg131: tensor<16x1x1xf32>, %arg132: tensor<16x1x1xf32>, %arg133: tensor<16x1x1xf32>, %arg134: tensor<16x1x1xf32>, %arg135: tensor<16x1x1xf32>, %arg136: tensor<16x1x1xf32>, %arg137: tensor<16x1x1xf32>, %arg138: tensor<16x1x1xf32>, %arg139: tensor<16x1x1xf32>, %arg140: tensor<16x1x1xf32>, %arg141: tensor<16x1x1xf32>, %arg142: tensor<16x1x1xf32>, %arg143: tensor<64x1x1xf32>, %arg144: tensor<64x1x1xf32>, %arg145: tensor<64x1x1xf32>, %arg146: tensor<64x1x1xf32>, %arg147: tensor<64x1x1xf32>, %arg148: tensor<64x1x1xf32>, %arg149: tensor<64x1x1xf32>, %arg150: tensor<64x1x1xf32>, %arg151: tensor<24x1x1xf32>, %arg152: tensor<24x1x1xf32>, %arg153: tensor<24x1x1xf32>, %arg154: tensor<24x1x1xf32>, %arg155: tensor<72x1x1xf32>, %arg156: tensor<72x1x1xf32>, %arg157: tensor<72x1x1xf32>, %arg158: tensor<72x1x1xf32>, %arg159: tensor<72x1x1xf32>, %arg160: tensor<72x1x1xf32>, %arg161: tensor<72x1x1xf32>, %arg162: tensor<72x1x1xf32>, %arg163: tensor<24x1x1xf32>, %arg164: tensor<24x1x1xf32>, %arg165: tensor<24x1x1xf32>, %arg166: tensor<24x1x1xf32>, %arg167: tensor<72x1x1xf32>, %arg168: tensor<72x1x1xf32>, %arg169: tensor<72x1x1xf32>, %arg170: tensor<72x1x1xf32>, %arg171: tensor<72x1x1xf32>, %arg172: tensor<72x1x1xf32>, %arg173: tensor<72x1x1xf32>, %arg174: tensor<72x1x1xf32>, %arg175: tensor<40x1x1xf32>, %arg176: tensor<40x1x1xf32>, %arg177: tensor<40x1x1xf32>, %arg178: tensor<40x1x1xf32>, %arg179: tensor<120x1x1xf32>, %arg180: tensor<120x1x1xf32>, %arg181: tensor<120x1x1xf32>, %arg182: tensor<120x1x1xf32>, %arg183: tensor<120x1x1xf32>, %arg184: tensor<120x1x1xf32>, %arg185: tensor<120x1x1xf32>, %arg186: tensor<120x1x1xf32>, %arg187: tensor<40x1x1xf32>, %arg188: tensor<40x1x1xf32>, %arg189: tensor<40x1x1xf32>, %arg190: tensor<40x1x1xf32>, %arg191: tensor<120x1x1xf32>, %arg192: tensor<120x1x1xf32>, %arg193: tensor<120x1x1xf32>, %arg194: tensor<120x1x1xf32>, %arg195: tensor<120x1x1xf32>, %arg196: tensor<120x1x1xf32>, %arg197: tensor<120x1x1xf32>, %arg198: tensor<120x1x1xf32>, %arg199: tensor<40x1x1xf32>, %arg200: tensor<40x1x1xf32>, %arg201: tensor<40x1x1xf32>, %arg202: tensor<40x1x1xf32>, %arg203: tensor<240x1x1xf32>, %arg204: tensor<240x1x1xf32>, %arg205: tensor<240x1x1xf32>, %arg206: tensor<240x1x1xf32>, %arg207: tensor<240x1x1xf32>, %arg208: tensor<240x1x1xf32>, %arg209: tensor<240x1x1xf32>, %arg210: tensor<240x1x1xf32>, %arg211: tensor<80x1x1xf32>, %arg212: tensor<80x1x1xf32>, %arg213: tensor<80x1x1xf32>, %arg214: tensor<80x1x1xf32>, %arg215: tensor<200x1x1xf32>, %arg216: tensor<200x1x1xf32>, %arg217: tensor<200x1x1xf32>, %arg218: tensor<200x1x1xf32>, %arg219: tensor<200x1x1xf32>, %arg220: tensor<200x1x1xf32>, %arg221: tensor<200x1x1xf32>, %arg222: tensor<200x1x1xf32>, %arg223: tensor<80x1x1xf32>, %arg224: tensor<80x1x1xf32>, %arg225: tensor<80x1x1xf32>, %arg226: tensor<80x1x1xf32>, %arg227: tensor<184x1x1xf32>, %arg228: tensor<184x1x1xf32>, %arg229: tensor<184x1x1xf32>, %arg230: tensor<184x1x1xf32>, %arg231: tensor<184x1x1xf32>, %arg232: tensor<184x1x1xf32>, %arg233: tensor<184x1x1xf32>, %arg234: tensor<184x1x1xf32>, %arg235: tensor<80x1x1xf32>, %arg236: tensor<80x1x1xf32>, %arg237: tensor<80x1x1xf32>, %arg238: tensor<80x1x1xf32>, %arg239: tensor<184x1x1xf32>, %arg240: tensor<184x1x1xf32>, %arg241: tensor<184x1x1xf32>, %arg242: tensor<184x1x1xf32>, %arg243: tensor<184x1x1xf32>, %arg244: tensor<184x1x1xf32>, %arg245: tensor<184x1x1xf32>, %arg246: tensor<184x1x1xf32>, %arg247: tensor<80x1x1xf32>, %arg248: tensor<80x1x1xf32>, %arg249: tensor<80x1x1xf32>, %arg250: tensor<80x1x1xf32>, %arg251: tensor<480x1x1xf32>, %arg252: tensor<480x1x1xf32>, %arg253: tensor<480x1x1xf32>, %arg254: tensor<480x1x1xf32>, %arg255: tensor<480x1x1xf32>, %arg256: tensor<480x1x1xf32>, %arg257: tensor<480x1x1xf32>, %arg258: tensor<480x1x1xf32>, %arg259: tensor<112x1x1xf32>, %arg260: tensor<112x1x1xf32>, %arg261: tensor<112x1x1xf32>, %arg262: tensor<112x1x1xf32>, %arg263: tensor<672x1x1xf32>, %arg264: tensor<672x1x1xf32>, %arg265: tensor<672x1x1xf32>, %arg266: tensor<672x1x1xf32>, %arg267: tensor<672x1x1xf32>, %arg268: tensor<672x1x1xf32>, %arg269: tensor<672x1x1xf32>, %arg270: tensor<672x1x1xf32>, %arg271: tensor<112x1x1xf32>, %arg272: tensor<112x1x1xf32>, %arg273: tensor<112x1x1xf32>, %arg274: tensor<112x1x1xf32>, %arg275: tensor<672x1x1xf32>, %arg276: tensor<672x1x1xf32>, %arg277: tensor<672x1x1xf32>, %arg278: tensor<672x1x1xf32>, %arg279: tensor<672x1x1xf32>, %arg280: tensor<672x1x1xf32>, %arg281: tensor<672x1x1xf32>, %arg282: tensor<672x1x1xf32>, %arg283: tensor<80x1x1xf32>, %arg284: tensor<80x1x1xf32>, %arg285: tensor<80x1x1xf32>, %arg286: tensor<80x1x1xf32>, %arg287: tensor<480x1x1xf32>, %arg288: tensor<480x1x1xf32>, %arg289: tensor<480x1x1xf32>, %arg290: tensor<480x1x1xf32>, %arg291: tensor<480x1x1xf32>, %arg292: tensor<480x1x1xf32>, %arg293: tensor<480x1x1xf32>, %arg294: tensor<480x1x1xf32>, %arg295: tensor<80x1x1xf32>, %arg296: tensor<80x1x1xf32>, %arg297: tensor<80x1x1xf32>, %arg298: tensor<80x1x1xf32>, %arg299: tensor<480x1x1xf32>, %arg300: tensor<480x1x1xf32>, %arg301: tensor<480x1x1xf32>, %arg302: tensor<480x1x1xf32>, %arg303: tensor<480x1x1xf32>, %arg304: tensor<480x1x1xf32>, %arg305: tensor<480x1x1xf32>, %arg306: tensor<480x1x1xf32>, %arg307: tensor<80x1x1xf32>, %arg308: tensor<80x1x1xf32>, %arg309: tensor<80x1x1xf32>, %arg310: tensor<80x1x1xf32>, %arg311: tensor<480x1x1xf32>, %arg312: tensor<480x1x1xf32>, %arg313: tensor<480x1x1xf32>, %arg314: tensor<480x1x1xf32>, %arg315: tensor<256x1x1xf32>, %arg316: tensor<256x1x1xf32>, %arg317: tensor<256x1x1xf32>, %arg318: tensor<256x1x1xf32>, %arg319: tensor<256x1x1xf32>, %arg320: tensor<256x1x1xf32>, %arg321: tensor<256x1x1xf32>, %arg322: tensor<256x1x1xf32>, %arg323: tensor<512x1x1xf32>, %arg324: tensor<512x1x1xf32>, %arg325: tensor<512x1x1xf32>, %arg326: tensor<512x1x1xf32>, %arg327: tensor<128x1x1xf32>, %arg328: tensor<128x1x1xf32>, %arg329: tensor<128x1x1xf32>, %arg330: tensor<128x1x1xf32>, %arg331: tensor<128x1x1xf32>, %arg332: tensor<128x1x1xf32>, %arg333: tensor<128x1x1xf32>, %arg334: tensor<128x1x1xf32>, %arg335: tensor<256x1x1xf32>, %arg336: tensor<256x1x1xf32>, %arg337: tensor<256x1x1xf32>, %arg338: tensor<256x1x1xf32>, %arg339: tensor<128x1x1xf32>, %arg340: tensor<128x1x1xf32>, %arg341: tensor<128x1x1xf32>, %arg342: tensor<128x1x1xf32>, %arg343: tensor<128x1x1xf32>, %arg344: tensor<128x1x1xf32>, %arg345: tensor<128x1x1xf32>, %arg346: tensor<128x1x1xf32>, %arg347: tensor<256x1x1xf32>, %arg348: tensor<256x1x1xf32>, %arg349: tensor<256x1x1xf32>, %arg350: tensor<256x1x1xf32>, %arg351: tensor<64x1x1xf32>, %arg352: tensor<64x1x1xf32>, %arg353: tensor<64x1x1xf32>, %arg354: tensor<64x1x1xf32>, %arg355: tensor<64x1x1xf32>, %arg356: tensor<64x1x1xf32>, %arg357: tensor<64x1x1xf32>, %arg358: tensor<64x1x1xf32>, %arg359: tensor<128x1x1xf32>, %arg360: tensor<128x1x1xf32>, %arg361: tensor<128x1x1xf32>, %arg362: tensor<128x1x1xf32>, %arg363: tensor<672x1x1xf32>, %arg364: tensor<672x1x1xf32>, %arg365: tensor<672x1x1xf32>, %arg366: tensor<672x1x1xf32>, %arg367: tensor<480x1x1xf32>, %arg368: tensor<480x1x1xf32>, %arg369: tensor<480x1x1xf32>, %arg370: tensor<480x1x1xf32>, %arg371: tensor<512x1x1xf32>, %arg372: tensor<512x1x1xf32>, %arg373: tensor<512x1x1xf32>, %arg374: tensor<512x1x1xf32>, %arg375: tensor<256x1x1xf32>, %arg376: tensor<256x1x1xf32>, %arg377: tensor<256x1x1xf32>, %arg378: tensor<256x1x1xf32>, %arg379: tensor<256x1x1xf32>, %arg380: tensor<256x1x1xf32>, %arg381: tensor<256x1x1xf32>, %arg382: tensor<256x1x1xf32>, %arg383: tensor<128x1x1xf32>, %arg384: tensor<128x1x1xf32>, %arg385: tensor<128x1x1xf32>, %arg386: tensor<128x1x1xf32>, %arg387: tensor<672x1x1xf32>, %arg388: tensor<672x1x1xf32>, %arg389: tensor<672x1x1xf32>, %arg390: tensor<672x1x1xf32>, %arg391: tensor<480x1x1xf32>, %arg392: tensor<480x1x1xf32>, %arg393: tensor<480x1x1xf32>, %arg394: tensor<480x1x1xf32>, %arg395: tensor<512x1x1xf32>, %arg396: tensor<512x1x1xf32>, %arg397: tensor<512x1x1xf32>, %arg398: tensor<512x1x1xf32>, %arg399: tensor<256x1x1xf32>, %arg400: tensor<256x1x1xf32>, %arg401: tensor<256x1x1xf32>, %arg402: tensor<256x1x1xf32>, %arg403: tensor<256x1x1xf32>, %arg404: tensor<256x1x1xf32>, %arg405: tensor<256x1x1xf32>, %arg406: tensor<256x1x1xf32>, %arg407: tensor<128x1x1xf32>, %arg408: tensor<128x1x1xf32>, %arg409: tensor<128x1x1xf32>, %arg410: tensor<128x1x1xf32>, %arg411: tensor<3234x4xf32>) -> (tensor<1x3234x4xf32>, tensor<1x3234x91xf32>, tensor<3234x4xf32>, tensor<1x3x320x320xf32>) {
-    %cst = stablehlo.constant dense<6.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %c = stablehlo.constant dense<1> : tensor<i64>
-    %c_1 = stablehlo.constant dense<6> : tensor<i64>
-    %c_2 = stablehlo.constant dense<0> : tensor<i64>
-    %c_3 = stablehlo.constant dense<0> : tensor<1x1xi64>
-    %cst_4 = stablehlo.constant dense<0.000000e+00> : tensor<1x16x160x160xf32>
-    %cst_5 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x160x160xf32>
-    %cst_6 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x80x80xf32>
-    %cst_7 = stablehlo.constant dense<0.000000e+00> : tensor<1x72x80x80xf32>
-    %cst_8 = stablehlo.constant dense<0.000000e+00> : tensor<1x72x40x40xf32>
-    %cst_9 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_10 = stablehlo.constant dense<0.000000e+00> : tensor<1x24x1x1xf32>
-    %cst_11 = stablehlo.constant dense<0.000000e+00> : tensor<1x120x40x40xf32>
-    %cst_12 = stablehlo.constant dense<0.000000e+00> : tensor<1x32x1x1xf32>
-    %cst_13 = stablehlo.constant dense<0.000000e+00> : tensor<1x240x40x40xf32>
-    %cst_14 = stablehlo.constant dense<0.000000e+00> : tensor<1x240x20x20xf32>
-    %cst_15 = stablehlo.constant dense<0.000000e+00> : tensor<1x200x20x20xf32>
-    %cst_16 = stablehlo.constant dense<0.000000e+00> : tensor<1x184x20x20xf32>
-    %cst_17 = stablehlo.constant dense<0.000000e+00> : tensor<1x480x20x20xf32>
-    %cst_18 = stablehlo.constant dense<0.000000e+00> : tensor<1x120x1x1xf32>
-    %cst_19 = stablehlo.constant dense<0.000000e+00> : tensor<1x672x20x20xf32>
-    %cst_20 = stablehlo.constant dense<0.000000e+00> : tensor<1x168x1x1xf32>
-    %cst_21 = stablehlo.constant dense<0.000000e+00> : tensor<1x672x10x10xf32>
-    %cst_22 = stablehlo.constant dense<0.000000e+00> : tensor<1x480x10x10xf32>
-    %cst_23 = arith.constant dense<3> : tensor<1xi64>
-    %cst_24 = arith.constant dense<6> : tensor<1xi64>
-    %cst_25 = arith.constant dense<1600> : tensor<1xi64>
-    %cst_26 = arith.constant dense<400> : tensor<1xi64>
-    %cst_27 = arith.constant dense<100> : tensor<1xi64>
-    %0 = stablehlo.reshape %arg0 : (tensor<1x3x320x320xf32>) -> tensor<3x320x320xf32>
-    %1 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2] : (tensor<3x320x320xf32>) -> tensor<3x320x320xf32>
-    %2 = stablehlo.broadcast_in_dim %arg127, dims = [0, 1, 2] : (tensor<3x1x1xf32>) -> tensor<3x320x320xf32>
-    %3 = stablehlo.subtract %1, %2 : tensor<3x320x320xf32>
-    %4 = stablehlo.broadcast_in_dim %3, dims = [0, 1, 2] : (tensor<3x320x320xf32>) -> tensor<3x320x320xf32>
-    %5 = stablehlo.broadcast_in_dim %arg128, dims = [0, 1, 2] : (tensor<3x1x1xf32>) -> tensor<3x320x320xf32>
-    %6 = stablehlo.divide %4, %5 : tensor<3x320x320xf32>
-    %7 = stablehlo.reshape %6 : (tensor<3x320x320xf32>) -> tensor<1x3x320x320xf32>
-    %8 = stablehlo.transpose %7, dims = [0, 1, 3, 2] : (tensor<1x3x320x320xf32>) -> tensor<1x3x320x320xf32>
-    %9 = stablehlo.reshape %8 : (tensor<1x3x320x320xf32>) -> tensor<3x320x320xf32>
-    %10 = stablehlo.broadcast_in_dim %arg129, dims = [0, 1, 2] : (tensor<3x320x320xf32>) -> tensor<3x320x320xf32>
-    %11 = stablehlo.dot_general %9, %10, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x320x320xf32>, tensor<3x320x320xf32>) -> tensor<3x320x320xf32>
-    %12 = stablehlo.reshape %11 : (tensor<3x320x320xf32>) -> tensor<1x3x320x320xf32>
-    %13 = stablehlo.transpose %12, dims = [0, 1, 3, 2] : (tensor<1x3x320x320xf32>) -> tensor<1x3x320x320xf32>
-    %14 = stablehlo.reshape %13 : (tensor<1x3x320x320xf32>) -> tensor<3x320x320xf32>
-    %15 = stablehlo.broadcast_in_dim %arg130, dims = [0, 1, 2] : (tensor<3x320x320xf32>) -> tensor<3x320x320xf32>
-    %16 = stablehlo.dot_general %14, %15, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x320x320xf32>, tensor<3x320x320xf32>) -> tensor<3x320x320xf32>
-    %17 = stablehlo.reshape %16 : (tensor<3x320x320xf32>) -> tensor<1x3x320x320xf32>
-    %18 = stablehlo.reshape %17 : (tensor<1x3x320x320xf32>) -> tensor<3x320x320xf32>
-    %19 = stablehlo.convert %c_2 : (tensor<i64>) -> tensor<f32>
-    %20 = stablehlo.broadcast_in_dim %19, dims = [] : (tensor<f32>) -> tensor<1x3x320x320xf32>
-    %21 = stablehlo.reshape %18 : (tensor<3x320x320xf32>) -> tensor<1x3x320x320xf32>
-    %22 = "stablehlo.scatter"(%20, %c_3, %21) <{indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter<update_window_dims = [1, 2, 3], inserted_window_dims = [0], scatter_dims_to_operand_dims = [0], index_vector_dim = 1>, unique_indices = false}> ({
-    ^bb0(%arg412: tensor<f32>, %arg413: tensor<f32>):
-      stablehlo.return %arg413 : tensor<f32>
-    }) : (tensor<1x3x320x320xf32>, tensor<1x1xi64>, tensor<1x3x320x320xf32>) -> tensor<1x3x320x320xf32>
-    %23 = stablehlo.convolution(%22, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x320x320xf32>, tensor<16x3x3x3xf32>) -> tensor<1x16x160x160xf32>
-    %24 = stablehlo.broadcast_in_dim %23, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %25 = stablehlo.broadcast_in_dim %arg131, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %26 = stablehlo.subtract %24, %25 : tensor<1x16x160x160xf32>
-    %27 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %28 = stablehlo.broadcast_in_dim %arg132, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %29 = stablehlo.multiply %27, %28 : tensor<1x16x160x160xf32>
-    %30 = stablehlo.broadcast_in_dim %29, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %31 = stablehlo.broadcast_in_dim %arg133, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %32 = stablehlo.multiply %30, %31 : tensor<1x16x160x160xf32>
-    %33 = stablehlo.broadcast_in_dim %32, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %34 = stablehlo.broadcast_in_dim %arg134, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %35 = stablehlo.add %33, %34 : tensor<1x16x160x160xf32>
-    %36 = stablehlo.convert %cst_23 : (tensor<1xi64>) -> tensor<1xf32>
-    %37 = stablehlo.reshape %36 : (tensor<1xf32>) -> tensor<f32>
-    %38 = stablehlo.broadcast_in_dim %35, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %39 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x16x160x160xf32>
-    %40 = stablehlo.add %38, %39 : tensor<1x16x160x160xf32>
-    %41 = stablehlo.maximum %40, %cst_4 : tensor<1x16x160x160xf32>
-    %42 = stablehlo.convert %c_1 : (tensor<i64>) -> tensor<f32>
-    %43 = stablehlo.broadcast_in_dim %41, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %44 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x16x160x160xf32>
-    %45 = stablehlo.minimum %43, %44 : tensor<1x16x160x160xf32>
-    %46 = stablehlo.convert %cst_24 : (tensor<1xi64>) -> tensor<1xf32>
-    %47 = stablehlo.reshape %46 : (tensor<1xf32>) -> tensor<f32>
-    %48 = stablehlo.broadcast_in_dim %45, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %49 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x16x160x160xf32>
-    %50 = stablehlo.divide %48, %49 : tensor<1x16x160x160xf32>
-    %51 = stablehlo.multiply %50, %35 : tensor<1x16x160x160xf32>
-    %52 = stablehlo.convolution(%51, %arg2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 16 : i64} : (tensor<1x16x160x160xf32>, tensor<16x1x3x3xf32>) -> tensor<1x16x160x160xf32>
-    %53 = stablehlo.broadcast_in_dim %52, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %54 = stablehlo.broadcast_in_dim %arg135, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %55 = stablehlo.subtract %53, %54 : tensor<1x16x160x160xf32>
-    %56 = stablehlo.broadcast_in_dim %55, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %57 = stablehlo.broadcast_in_dim %arg136, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %58 = stablehlo.multiply %56, %57 : tensor<1x16x160x160xf32>
-    %59 = stablehlo.broadcast_in_dim %58, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %60 = stablehlo.broadcast_in_dim %arg137, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %61 = stablehlo.multiply %59, %60 : tensor<1x16x160x160xf32>
-    %62 = stablehlo.broadcast_in_dim %61, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %63 = stablehlo.broadcast_in_dim %arg138, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %64 = stablehlo.add %62, %63 : tensor<1x16x160x160xf32>
-    %65 = stablehlo.maximum %64, %cst_4 : tensor<1x16x160x160xf32>
-    %66 = stablehlo.convolution(%65, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x16x160x160xf32>, tensor<16x16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %67 = stablehlo.broadcast_in_dim %66, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %68 = stablehlo.broadcast_in_dim %arg139, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %69 = stablehlo.subtract %67, %68 : tensor<1x16x160x160xf32>
-    %70 = stablehlo.broadcast_in_dim %69, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %71 = stablehlo.broadcast_in_dim %arg140, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %72 = stablehlo.multiply %70, %71 : tensor<1x16x160x160xf32>
-    %73 = stablehlo.broadcast_in_dim %72, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %74 = stablehlo.broadcast_in_dim %arg141, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %75 = stablehlo.multiply %73, %74 : tensor<1x16x160x160xf32>
-    %76 = stablehlo.broadcast_in_dim %75, dims = [0, 1, 2, 3] : (tensor<1x16x160x160xf32>) -> tensor<1x16x160x160xf32>
-    %77 = stablehlo.broadcast_in_dim %arg142, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x160x160xf32>
-    %78 = stablehlo.add %76, %77 : tensor<1x16x160x160xf32>
-    %79 = stablehlo.add %78, %51 : tensor<1x16x160x160xf32>
-    %80 = stablehlo.convolution(%79, %arg4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x16x160x160xf32>, tensor<64x16x1x1xf32>) -> tensor<1x64x160x160xf32>
-    %81 = stablehlo.broadcast_in_dim %80, dims = [0, 1, 2, 3] : (tensor<1x64x160x160xf32>) -> tensor<1x64x160x160xf32>
-    %82 = stablehlo.broadcast_in_dim %arg143, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x160x160xf32>
-    %83 = stablehlo.subtract %81, %82 : tensor<1x64x160x160xf32>
-    %84 = stablehlo.broadcast_in_dim %83, dims = [0, 1, 2, 3] : (tensor<1x64x160x160xf32>) -> tensor<1x64x160x160xf32>
-    %85 = stablehlo.broadcast_in_dim %arg144, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x160x160xf32>
-    %86 = stablehlo.multiply %84, %85 : tensor<1x64x160x160xf32>
-    %87 = stablehlo.broadcast_in_dim %86, dims = [0, 1, 2, 3] : (tensor<1x64x160x160xf32>) -> tensor<1x64x160x160xf32>
-    %88 = stablehlo.broadcast_in_dim %arg145, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x160x160xf32>
-    %89 = stablehlo.multiply %87, %88 : tensor<1x64x160x160xf32>
-    %90 = stablehlo.broadcast_in_dim %89, dims = [0, 1, 2, 3] : (tensor<1x64x160x160xf32>) -> tensor<1x64x160x160xf32>
-    %91 = stablehlo.broadcast_in_dim %arg146, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x160x160xf32>
-    %92 = stablehlo.add %90, %91 : tensor<1x64x160x160xf32>
-    %93 = stablehlo.maximum %92, %cst_5 : tensor<1x64x160x160xf32>
-    %94 = stablehlo.convolution(%93, %arg5) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 64 : i64} : (tensor<1x64x160x160xf32>, tensor<64x1x3x3xf32>) -> tensor<1x64x80x80xf32>
-    %95 = stablehlo.broadcast_in_dim %94, dims = [0, 1, 2, 3] : (tensor<1x64x80x80xf32>) -> tensor<1x64x80x80xf32>
-    %96 = stablehlo.broadcast_in_dim %arg147, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x80x80xf32>
-    %97 = stablehlo.subtract %95, %96 : tensor<1x64x80x80xf32>
-    %98 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x64x80x80xf32>) -> tensor<1x64x80x80xf32>
-    %99 = stablehlo.broadcast_in_dim %arg148, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x80x80xf32>
-    %100 = stablehlo.multiply %98, %99 : tensor<1x64x80x80xf32>
-    %101 = stablehlo.broadcast_in_dim %100, dims = [0, 1, 2, 3] : (tensor<1x64x80x80xf32>) -> tensor<1x64x80x80xf32>
-    %102 = stablehlo.broadcast_in_dim %arg149, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x80x80xf32>
-    %103 = stablehlo.multiply %101, %102 : tensor<1x64x80x80xf32>
-    %104 = stablehlo.broadcast_in_dim %103, dims = [0, 1, 2, 3] : (tensor<1x64x80x80xf32>) -> tensor<1x64x80x80xf32>
-    %105 = stablehlo.broadcast_in_dim %arg150, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x80x80xf32>
-    %106 = stablehlo.add %104, %105 : tensor<1x64x80x80xf32>
-    %107 = stablehlo.maximum %106, %cst_6 : tensor<1x64x80x80xf32>
-    %108 = stablehlo.convolution(%107, %arg6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x80x80xf32>, tensor<24x64x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %109 = stablehlo.broadcast_in_dim %108, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %110 = stablehlo.broadcast_in_dim %arg151, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %111 = stablehlo.subtract %109, %110 : tensor<1x24x80x80xf32>
-    %112 = stablehlo.broadcast_in_dim %111, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %113 = stablehlo.broadcast_in_dim %arg152, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %114 = stablehlo.multiply %112, %113 : tensor<1x24x80x80xf32>
-    %115 = stablehlo.broadcast_in_dim %114, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %116 = stablehlo.broadcast_in_dim %arg153, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %117 = stablehlo.multiply %115, %116 : tensor<1x24x80x80xf32>
-    %118 = stablehlo.broadcast_in_dim %117, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %119 = stablehlo.broadcast_in_dim %arg154, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %120 = stablehlo.add %118, %119 : tensor<1x24x80x80xf32>
-    %121 = stablehlo.convolution(%120, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x24x80x80xf32>, tensor<72x24x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %122 = stablehlo.broadcast_in_dim %121, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %123 = stablehlo.broadcast_in_dim %arg155, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %124 = stablehlo.subtract %122, %123 : tensor<1x72x80x80xf32>
-    %125 = stablehlo.broadcast_in_dim %124, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %126 = stablehlo.broadcast_in_dim %arg156, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %127 = stablehlo.multiply %125, %126 : tensor<1x72x80x80xf32>
-    %128 = stablehlo.broadcast_in_dim %127, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %129 = stablehlo.broadcast_in_dim %arg157, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %130 = stablehlo.multiply %128, %129 : tensor<1x72x80x80xf32>
-    %131 = stablehlo.broadcast_in_dim %130, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %132 = stablehlo.broadcast_in_dim %arg158, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %133 = stablehlo.add %131, %132 : tensor<1x72x80x80xf32>
-    %134 = stablehlo.maximum %133, %cst_7 : tensor<1x72x80x80xf32>
-    %135 = stablehlo.convolution(%134, %arg8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 72 : i64} : (tensor<1x72x80x80xf32>, tensor<72x1x3x3xf32>) -> tensor<1x72x80x80xf32>
-    %136 = stablehlo.broadcast_in_dim %135, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %137 = stablehlo.broadcast_in_dim %arg159, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %138 = stablehlo.subtract %136, %137 : tensor<1x72x80x80xf32>
-    %139 = stablehlo.broadcast_in_dim %138, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %140 = stablehlo.broadcast_in_dim %arg160, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %141 = stablehlo.multiply %139, %140 : tensor<1x72x80x80xf32>
-    %142 = stablehlo.broadcast_in_dim %141, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %143 = stablehlo.broadcast_in_dim %arg161, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %144 = stablehlo.multiply %142, %143 : tensor<1x72x80x80xf32>
-    %145 = stablehlo.broadcast_in_dim %144, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %146 = stablehlo.broadcast_in_dim %arg162, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %147 = stablehlo.add %145, %146 : tensor<1x72x80x80xf32>
-    %148 = stablehlo.maximum %147, %cst_7 : tensor<1x72x80x80xf32>
-    %149 = stablehlo.convolution(%148, %arg9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x72x80x80xf32>, tensor<24x72x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %150 = stablehlo.broadcast_in_dim %149, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %151 = stablehlo.broadcast_in_dim %arg163, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %152 = stablehlo.subtract %150, %151 : tensor<1x24x80x80xf32>
-    %153 = stablehlo.broadcast_in_dim %152, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %154 = stablehlo.broadcast_in_dim %arg164, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %155 = stablehlo.multiply %153, %154 : tensor<1x24x80x80xf32>
-    %156 = stablehlo.broadcast_in_dim %155, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %157 = stablehlo.broadcast_in_dim %arg165, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %158 = stablehlo.multiply %156, %157 : tensor<1x24x80x80xf32>
-    %159 = stablehlo.broadcast_in_dim %158, dims = [0, 1, 2, 3] : (tensor<1x24x80x80xf32>) -> tensor<1x24x80x80xf32>
-    %160 = stablehlo.broadcast_in_dim %arg166, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x80x80xf32>
-    %161 = stablehlo.add %159, %160 : tensor<1x24x80x80xf32>
-    %162 = stablehlo.add %161, %120 : tensor<1x24x80x80xf32>
-    %163 = stablehlo.convolution(%162, %arg10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x24x80x80xf32>, tensor<72x24x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %164 = stablehlo.broadcast_in_dim %163, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %165 = stablehlo.broadcast_in_dim %arg167, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %166 = stablehlo.subtract %164, %165 : tensor<1x72x80x80xf32>
-    %167 = stablehlo.broadcast_in_dim %166, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %168 = stablehlo.broadcast_in_dim %arg168, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %169 = stablehlo.multiply %167, %168 : tensor<1x72x80x80xf32>
-    %170 = stablehlo.broadcast_in_dim %169, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %171 = stablehlo.broadcast_in_dim %arg169, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %172 = stablehlo.multiply %170, %171 : tensor<1x72x80x80xf32>
-    %173 = stablehlo.broadcast_in_dim %172, dims = [0, 1, 2, 3] : (tensor<1x72x80x80xf32>) -> tensor<1x72x80x80xf32>
-    %174 = stablehlo.broadcast_in_dim %arg170, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x80x80xf32>
-    %175 = stablehlo.add %173, %174 : tensor<1x72x80x80xf32>
-    %176 = stablehlo.maximum %175, %cst_7 : tensor<1x72x80x80xf32>
-    %177 = stablehlo.convolution(%176, %arg11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[2, 2], [2, 2]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 72 : i64} : (tensor<1x72x80x80xf32>, tensor<72x1x5x5xf32>) -> tensor<1x72x40x40xf32>
-    %178 = stablehlo.broadcast_in_dim %177, dims = [0, 1, 2, 3] : (tensor<1x72x40x40xf32>) -> tensor<1x72x40x40xf32>
-    %179 = stablehlo.broadcast_in_dim %arg171, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x40x40xf32>
-    %180 = stablehlo.subtract %178, %179 : tensor<1x72x40x40xf32>
-    %181 = stablehlo.broadcast_in_dim %180, dims = [0, 1, 2, 3] : (tensor<1x72x40x40xf32>) -> tensor<1x72x40x40xf32>
-    %182 = stablehlo.broadcast_in_dim %arg172, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x40x40xf32>
-    %183 = stablehlo.multiply %181, %182 : tensor<1x72x40x40xf32>
-    %184 = stablehlo.broadcast_in_dim %183, dims = [0, 1, 2, 3] : (tensor<1x72x40x40xf32>) -> tensor<1x72x40x40xf32>
-    %185 = stablehlo.broadcast_in_dim %arg173, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x40x40xf32>
-    %186 = stablehlo.multiply %184, %185 : tensor<1x72x40x40xf32>
-    %187 = stablehlo.broadcast_in_dim %186, dims = [0, 1, 2, 3] : (tensor<1x72x40x40xf32>) -> tensor<1x72x40x40xf32>
-    %188 = stablehlo.broadcast_in_dim %arg174, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x40x40xf32>
-    %189 = stablehlo.add %187, %188 : tensor<1x72x40x40xf32>
-    %190 = stablehlo.maximum %189, %cst_8 : tensor<1x72x40x40xf32>
-    %191 = stablehlo.reduce(%190 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x72x40x40xf32>, tensor<f32>) -> tensor<1x72xf32>
-    %192 = stablehlo.reshape %191 : (tensor<1x72xf32>) -> tensor<1x72x1x1xf32>
-    %193 = stablehlo.convert %cst_25 : (tensor<1xi64>) -> tensor<1xf32>
-    %194 = stablehlo.reshape %193 : (tensor<1xf32>) -> tensor<f32>
-    %195 = stablehlo.broadcast_in_dim %192, dims = [0, 1, 2, 3] : (tensor<1x72x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %196 = stablehlo.broadcast_in_dim %194, dims = [] : (tensor<f32>) -> tensor<1x72x1x1xf32>
-    %197 = stablehlo.divide %195, %196 : tensor<1x72x1x1xf32>
-    %198 = stablehlo.convolution(%197, %arg12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x72x1x1xf32>, tensor<24x72x1x1xf32>) -> tensor<1x24x1x1xf32>
-    %199 = stablehlo.reshape %arg13 : (tensor<24xf32>) -> tensor<24x1x1xf32>
-    %200 = stablehlo.broadcast_in_dim %198, dims = [0, 1, 2, 3] : (tensor<1x24x1x1xf32>) -> tensor<1x24x1x1xf32>
-    %201 = stablehlo.broadcast_in_dim %199, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x1x1xf32>
-    %202 = stablehlo.add %200, %201 : tensor<1x24x1x1xf32>
-    %203 = stablehlo.maximum %202, %cst_10 : tensor<1x24x1x1xf32>
-    %204 = stablehlo.convolution(%203, %arg14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x24x1x1xf32>, tensor<72x24x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %205 = stablehlo.reshape %arg15 : (tensor<72xf32>) -> tensor<72x1x1xf32>
-    %206 = stablehlo.broadcast_in_dim %204, dims = [0, 1, 2, 3] : (tensor<1x72x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %207 = stablehlo.broadcast_in_dim %205, dims = [1, 2, 3] : (tensor<72x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %208 = stablehlo.add %206, %207 : tensor<1x72x1x1xf32>
-    %209 = stablehlo.broadcast_in_dim %208, dims = [0, 1, 2, 3] : (tensor<1x72x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %210 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x72x1x1xf32>
-    %211 = stablehlo.add %209, %210 : tensor<1x72x1x1xf32>
-    %212 = stablehlo.broadcast_in_dim %211, dims = [0, 1, 2, 3] : (tensor<1x72x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %213 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x72x1x1xf32>
-    %214 = stablehlo.divide %212, %213 : tensor<1x72x1x1xf32>
-    %215 = stablehlo.convert %c : (tensor<i64>) -> tensor<f32>
-    %216 = stablehlo.broadcast_in_dim %215, dims = [] : (tensor<f32>) -> tensor<1x72x1x1xf32>
-    %217 = stablehlo.broadcast_in_dim %214, dims = [0, 1, 2, 3] : (tensor<1x72x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %218 = stablehlo.minimum %216, %217 : tensor<1x72x1x1xf32>
-    %219 = stablehlo.broadcast_in_dim %19, dims = [] : (tensor<f32>) -> tensor<1x72x1x1xf32>
-    %220 = stablehlo.broadcast_in_dim %218, dims = [0, 1, 2, 3] : (tensor<1x72x1x1xf32>) -> tensor<1x72x1x1xf32>
-    %221 = stablehlo.maximum %219, %220 : tensor<1x72x1x1xf32>
-    %222 = stablehlo.broadcast_in_dim %221, dims = [0, 1, 2, 3] : (tensor<1x72x1x1xf32>) -> tensor<1x72x40x40xf32>
-    %223 = stablehlo.broadcast_in_dim %190, dims = [0, 1, 2, 3] : (tensor<1x72x40x40xf32>) -> tensor<1x72x40x40xf32>
-    %224 = stablehlo.multiply %222, %223 : tensor<1x72x40x40xf32>
-    %225 = stablehlo.convolution(%224, %arg16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x72x40x40xf32>, tensor<40x72x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %226 = stablehlo.broadcast_in_dim %225, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %227 = stablehlo.broadcast_in_dim %arg175, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %228 = stablehlo.subtract %226, %227 : tensor<1x40x40x40xf32>
-    %229 = stablehlo.broadcast_in_dim %228, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %230 = stablehlo.broadcast_in_dim %arg176, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %231 = stablehlo.multiply %229, %230 : tensor<1x40x40x40xf32>
-    %232 = stablehlo.broadcast_in_dim %231, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %233 = stablehlo.broadcast_in_dim %arg177, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %234 = stablehlo.multiply %232, %233 : tensor<1x40x40x40xf32>
-    %235 = stablehlo.broadcast_in_dim %234, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %236 = stablehlo.broadcast_in_dim %arg178, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %237 = stablehlo.add %235, %236 : tensor<1x40x40x40xf32>
-    %238 = stablehlo.convolution(%237, %arg17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x40x40x40xf32>, tensor<120x40x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %239 = stablehlo.broadcast_in_dim %238, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %240 = stablehlo.broadcast_in_dim %arg179, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %241 = stablehlo.subtract %239, %240 : tensor<1x120x40x40xf32>
-    %242 = stablehlo.broadcast_in_dim %241, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %243 = stablehlo.broadcast_in_dim %arg180, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %244 = stablehlo.multiply %242, %243 : tensor<1x120x40x40xf32>
-    %245 = stablehlo.broadcast_in_dim %244, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %246 = stablehlo.broadcast_in_dim %arg181, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %247 = stablehlo.multiply %245, %246 : tensor<1x120x40x40xf32>
-    %248 = stablehlo.broadcast_in_dim %247, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %249 = stablehlo.broadcast_in_dim %arg182, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %250 = stablehlo.add %248, %249 : tensor<1x120x40x40xf32>
-    %251 = stablehlo.maximum %250, %cst_11 : tensor<1x120x40x40xf32>
-    %252 = stablehlo.convolution(%251, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 120 : i64} : (tensor<1x120x40x40xf32>, tensor<120x1x5x5xf32>) -> tensor<1x120x40x40xf32>
-    %253 = stablehlo.broadcast_in_dim %252, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %254 = stablehlo.broadcast_in_dim %arg183, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %255 = stablehlo.subtract %253, %254 : tensor<1x120x40x40xf32>
-    %256 = stablehlo.broadcast_in_dim %255, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %257 = stablehlo.broadcast_in_dim %arg184, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %258 = stablehlo.multiply %256, %257 : tensor<1x120x40x40xf32>
-    %259 = stablehlo.broadcast_in_dim %258, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %260 = stablehlo.broadcast_in_dim %arg185, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %261 = stablehlo.multiply %259, %260 : tensor<1x120x40x40xf32>
-    %262 = stablehlo.broadcast_in_dim %261, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %263 = stablehlo.broadcast_in_dim %arg186, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %264 = stablehlo.add %262, %263 : tensor<1x120x40x40xf32>
-    %265 = stablehlo.maximum %264, %cst_11 : tensor<1x120x40x40xf32>
-    %266 = stablehlo.reduce(%265 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x120x40x40xf32>, tensor<f32>) -> tensor<1x120xf32>
-    %267 = stablehlo.reshape %266 : (tensor<1x120xf32>) -> tensor<1x120x1x1xf32>
-    %268 = stablehlo.broadcast_in_dim %267, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %269 = stablehlo.broadcast_in_dim %194, dims = [] : (tensor<f32>) -> tensor<1x120x1x1xf32>
-    %270 = stablehlo.divide %268, %269 : tensor<1x120x1x1xf32>
-    %271 = stablehlo.convolution(%270, %arg19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x120x1x1xf32>, tensor<32x120x1x1xf32>) -> tensor<1x32x1x1xf32>
-    %272 = stablehlo.reshape %arg20 : (tensor<32xf32>) -> tensor<32x1x1xf32>
-    %273 = stablehlo.broadcast_in_dim %271, dims = [0, 1, 2, 3] : (tensor<1x32x1x1xf32>) -> tensor<1x32x1x1xf32>
-    %274 = stablehlo.broadcast_in_dim %272, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x1x1xf32>
-    %275 = stablehlo.add %273, %274 : tensor<1x32x1x1xf32>
-    %276 = stablehlo.maximum %275, %cst_12 : tensor<1x32x1x1xf32>
-    %277 = stablehlo.convolution(%276, %arg21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x1x1xf32>, tensor<120x32x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %278 = stablehlo.reshape %arg22 : (tensor<120xf32>) -> tensor<120x1x1xf32>
-    %279 = stablehlo.broadcast_in_dim %277, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %280 = stablehlo.broadcast_in_dim %278, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %281 = stablehlo.add %279, %280 : tensor<1x120x1x1xf32>
-    %282 = stablehlo.broadcast_in_dim %281, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %283 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x120x1x1xf32>
-    %284 = stablehlo.add %282, %283 : tensor<1x120x1x1xf32>
-    %285 = stablehlo.broadcast_in_dim %284, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %286 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x120x1x1xf32>
-    %287 = stablehlo.divide %285, %286 : tensor<1x120x1x1xf32>
-    %288 = stablehlo.broadcast_in_dim %215, dims = [] : (tensor<f32>) -> tensor<1x120x1x1xf32>
-    %289 = stablehlo.broadcast_in_dim %287, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %290 = stablehlo.minimum %288, %289 : tensor<1x120x1x1xf32>
-    %291 = stablehlo.broadcast_in_dim %19, dims = [] : (tensor<f32>) -> tensor<1x120x1x1xf32>
-    %292 = stablehlo.broadcast_in_dim %290, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %293 = stablehlo.maximum %291, %292 : tensor<1x120x1x1xf32>
-    %294 = stablehlo.broadcast_in_dim %293, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %295 = stablehlo.broadcast_in_dim %265, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %296 = stablehlo.multiply %294, %295 : tensor<1x120x40x40xf32>
-    %297 = stablehlo.convolution(%296, %arg23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x120x40x40xf32>, tensor<40x120x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %298 = stablehlo.broadcast_in_dim %297, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %299 = stablehlo.broadcast_in_dim %arg187, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %300 = stablehlo.subtract %298, %299 : tensor<1x40x40x40xf32>
-    %301 = stablehlo.broadcast_in_dim %300, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %302 = stablehlo.broadcast_in_dim %arg188, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %303 = stablehlo.multiply %301, %302 : tensor<1x40x40x40xf32>
-    %304 = stablehlo.broadcast_in_dim %303, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %305 = stablehlo.broadcast_in_dim %arg189, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %306 = stablehlo.multiply %304, %305 : tensor<1x40x40x40xf32>
-    %307 = stablehlo.broadcast_in_dim %306, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %308 = stablehlo.broadcast_in_dim %arg190, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %309 = stablehlo.add %307, %308 : tensor<1x40x40x40xf32>
-    %310 = stablehlo.add %309, %237 : tensor<1x40x40x40xf32>
-    %311 = stablehlo.convolution(%310, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x40x40x40xf32>, tensor<120x40x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %312 = stablehlo.broadcast_in_dim %311, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %313 = stablehlo.broadcast_in_dim %arg191, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %314 = stablehlo.subtract %312, %313 : tensor<1x120x40x40xf32>
-    %315 = stablehlo.broadcast_in_dim %314, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %316 = stablehlo.broadcast_in_dim %arg192, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %317 = stablehlo.multiply %315, %316 : tensor<1x120x40x40xf32>
-    %318 = stablehlo.broadcast_in_dim %317, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %319 = stablehlo.broadcast_in_dim %arg193, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %320 = stablehlo.multiply %318, %319 : tensor<1x120x40x40xf32>
-    %321 = stablehlo.broadcast_in_dim %320, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %322 = stablehlo.broadcast_in_dim %arg194, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %323 = stablehlo.add %321, %322 : tensor<1x120x40x40xf32>
-    %324 = stablehlo.maximum %323, %cst_11 : tensor<1x120x40x40xf32>
-    %325 = stablehlo.convolution(%324, %arg25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 120 : i64} : (tensor<1x120x40x40xf32>, tensor<120x1x5x5xf32>) -> tensor<1x120x40x40xf32>
-    %326 = stablehlo.broadcast_in_dim %325, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %327 = stablehlo.broadcast_in_dim %arg195, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %328 = stablehlo.subtract %326, %327 : tensor<1x120x40x40xf32>
-    %329 = stablehlo.broadcast_in_dim %328, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %330 = stablehlo.broadcast_in_dim %arg196, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %331 = stablehlo.multiply %329, %330 : tensor<1x120x40x40xf32>
-    %332 = stablehlo.broadcast_in_dim %331, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %333 = stablehlo.broadcast_in_dim %arg197, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %334 = stablehlo.multiply %332, %333 : tensor<1x120x40x40xf32>
-    %335 = stablehlo.broadcast_in_dim %334, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %336 = stablehlo.broadcast_in_dim %arg198, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %337 = stablehlo.add %335, %336 : tensor<1x120x40x40xf32>
-    %338 = stablehlo.maximum %337, %cst_11 : tensor<1x120x40x40xf32>
-    %339 = stablehlo.reduce(%338 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x120x40x40xf32>, tensor<f32>) -> tensor<1x120xf32>
-    %340 = stablehlo.reshape %339 : (tensor<1x120xf32>) -> tensor<1x120x1x1xf32>
-    %341 = stablehlo.broadcast_in_dim %340, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %342 = stablehlo.divide %341, %269 : tensor<1x120x1x1xf32>
-    %343 = stablehlo.convolution(%342, %arg26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x120x1x1xf32>, tensor<32x120x1x1xf32>) -> tensor<1x32x1x1xf32>
-    %344 = stablehlo.reshape %arg27 : (tensor<32xf32>) -> tensor<32x1x1xf32>
-    %345 = stablehlo.broadcast_in_dim %343, dims = [0, 1, 2, 3] : (tensor<1x32x1x1xf32>) -> tensor<1x32x1x1xf32>
-    %346 = stablehlo.broadcast_in_dim %344, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x1x1xf32>
-    %347 = stablehlo.add %345, %346 : tensor<1x32x1x1xf32>
-    %348 = stablehlo.maximum %347, %cst_12 : tensor<1x32x1x1xf32>
-    %349 = stablehlo.convolution(%348, %arg28) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x1x1xf32>, tensor<120x32x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %350 = stablehlo.reshape %arg29 : (tensor<120xf32>) -> tensor<120x1x1xf32>
-    %351 = stablehlo.broadcast_in_dim %349, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %352 = stablehlo.broadcast_in_dim %350, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %353 = stablehlo.add %351, %352 : tensor<1x120x1x1xf32>
-    %354 = stablehlo.broadcast_in_dim %353, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %355 = stablehlo.add %354, %283 : tensor<1x120x1x1xf32>
-    %356 = stablehlo.broadcast_in_dim %355, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %357 = stablehlo.divide %356, %286 : tensor<1x120x1x1xf32>
-    %358 = stablehlo.broadcast_in_dim %357, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %359 = stablehlo.minimum %288, %358 : tensor<1x120x1x1xf32>
-    %360 = stablehlo.broadcast_in_dim %359, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %361 = stablehlo.maximum %291, %360 : tensor<1x120x1x1xf32>
-    %362 = stablehlo.broadcast_in_dim %361, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x40x40xf32>
-    %363 = stablehlo.broadcast_in_dim %338, dims = [0, 1, 2, 3] : (tensor<1x120x40x40xf32>) -> tensor<1x120x40x40xf32>
-    %364 = stablehlo.multiply %362, %363 : tensor<1x120x40x40xf32>
-    %365 = stablehlo.convolution(%364, %arg30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x120x40x40xf32>, tensor<40x120x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %366 = stablehlo.broadcast_in_dim %365, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %367 = stablehlo.broadcast_in_dim %arg199, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %368 = stablehlo.subtract %366, %367 : tensor<1x40x40x40xf32>
-    %369 = stablehlo.broadcast_in_dim %368, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %370 = stablehlo.broadcast_in_dim %arg200, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %371 = stablehlo.multiply %369, %370 : tensor<1x40x40x40xf32>
-    %372 = stablehlo.broadcast_in_dim %371, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %373 = stablehlo.broadcast_in_dim %arg201, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %374 = stablehlo.multiply %372, %373 : tensor<1x40x40x40xf32>
-    %375 = stablehlo.broadcast_in_dim %374, dims = [0, 1, 2, 3] : (tensor<1x40x40x40xf32>) -> tensor<1x40x40x40xf32>
-    %376 = stablehlo.broadcast_in_dim %arg202, dims = [1, 2, 3] : (tensor<40x1x1xf32>) -> tensor<1x40x40x40xf32>
-    %377 = stablehlo.add %375, %376 : tensor<1x40x40x40xf32>
-    %378 = stablehlo.add %377, %310 : tensor<1x40x40x40xf32>
-    %379 = stablehlo.convolution(%378, %arg31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x40x40x40xf32>, tensor<240x40x1x1xf32>) -> tensor<1x240x40x40xf32>
-    %380 = stablehlo.broadcast_in_dim %379, dims = [0, 1, 2, 3] : (tensor<1x240x40x40xf32>) -> tensor<1x240x40x40xf32>
-    %381 = stablehlo.broadcast_in_dim %arg203, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x40x40xf32>
-    %382 = stablehlo.subtract %380, %381 : tensor<1x240x40x40xf32>
-    %383 = stablehlo.broadcast_in_dim %382, dims = [0, 1, 2, 3] : (tensor<1x240x40x40xf32>) -> tensor<1x240x40x40xf32>
-    %384 = stablehlo.broadcast_in_dim %arg204, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x40x40xf32>
-    %385 = stablehlo.multiply %383, %384 : tensor<1x240x40x40xf32>
-    %386 = stablehlo.broadcast_in_dim %385, dims = [0, 1, 2, 3] : (tensor<1x240x40x40xf32>) -> tensor<1x240x40x40xf32>
-    %387 = stablehlo.broadcast_in_dim %arg205, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x40x40xf32>
-    %388 = stablehlo.multiply %386, %387 : tensor<1x240x40x40xf32>
-    %389 = stablehlo.broadcast_in_dim %388, dims = [0, 1, 2, 3] : (tensor<1x240x40x40xf32>) -> tensor<1x240x40x40xf32>
-    %390 = stablehlo.broadcast_in_dim %arg206, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x40x40xf32>
-    %391 = stablehlo.add %389, %390 : tensor<1x240x40x40xf32>
-    %392 = stablehlo.broadcast_in_dim %391, dims = [0, 1, 2, 3] : (tensor<1x240x40x40xf32>) -> tensor<1x240x40x40xf32>
-    %393 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x240x40x40xf32>
-    %394 = stablehlo.add %392, %393 : tensor<1x240x40x40xf32>
-    %395 = stablehlo.maximum %394, %cst_13 : tensor<1x240x40x40xf32>
-    %396 = stablehlo.broadcast_in_dim %395, dims = [0, 1, 2, 3] : (tensor<1x240x40x40xf32>) -> tensor<1x240x40x40xf32>
-    %397 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x240x40x40xf32>
-    %398 = stablehlo.minimum %396, %397 : tensor<1x240x40x40xf32>
-    %399 = stablehlo.broadcast_in_dim %398, dims = [0, 1, 2, 3] : (tensor<1x240x40x40xf32>) -> tensor<1x240x40x40xf32>
-    %400 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x240x40x40xf32>
-    %401 = stablehlo.divide %399, %400 : tensor<1x240x40x40xf32>
-    %402 = stablehlo.multiply %401, %391 : tensor<1x240x40x40xf32>
-    %403 = stablehlo.convolution(%402, %arg32) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 240 : i64} : (tensor<1x240x40x40xf32>, tensor<240x1x3x3xf32>) -> tensor<1x240x20x20xf32>
-    %404 = stablehlo.broadcast_in_dim %403, dims = [0, 1, 2, 3] : (tensor<1x240x20x20xf32>) -> tensor<1x240x20x20xf32>
-    %405 = stablehlo.broadcast_in_dim %arg207, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x20x20xf32>
-    %406 = stablehlo.subtract %404, %405 : tensor<1x240x20x20xf32>
-    %407 = stablehlo.broadcast_in_dim %406, dims = [0, 1, 2, 3] : (tensor<1x240x20x20xf32>) -> tensor<1x240x20x20xf32>
-    %408 = stablehlo.broadcast_in_dim %arg208, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x20x20xf32>
-    %409 = stablehlo.multiply %407, %408 : tensor<1x240x20x20xf32>
-    %410 = stablehlo.broadcast_in_dim %409, dims = [0, 1, 2, 3] : (tensor<1x240x20x20xf32>) -> tensor<1x240x20x20xf32>
-    %411 = stablehlo.broadcast_in_dim %arg209, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x20x20xf32>
-    %412 = stablehlo.multiply %410, %411 : tensor<1x240x20x20xf32>
-    %413 = stablehlo.broadcast_in_dim %412, dims = [0, 1, 2, 3] : (tensor<1x240x20x20xf32>) -> tensor<1x240x20x20xf32>
-    %414 = stablehlo.broadcast_in_dim %arg210, dims = [1, 2, 3] : (tensor<240x1x1xf32>) -> tensor<1x240x20x20xf32>
-    %415 = stablehlo.add %413, %414 : tensor<1x240x20x20xf32>
-    %416 = stablehlo.broadcast_in_dim %415, dims = [0, 1, 2, 3] : (tensor<1x240x20x20xf32>) -> tensor<1x240x20x20xf32>
-    %417 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x240x20x20xf32>
-    %418 = stablehlo.add %416, %417 : tensor<1x240x20x20xf32>
-    %419 = stablehlo.maximum %418, %cst_14 : tensor<1x240x20x20xf32>
-    %420 = stablehlo.broadcast_in_dim %419, dims = [0, 1, 2, 3] : (tensor<1x240x20x20xf32>) -> tensor<1x240x20x20xf32>
-    %421 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x240x20x20xf32>
-    %422 = stablehlo.minimum %420, %421 : tensor<1x240x20x20xf32>
-    %423 = stablehlo.broadcast_in_dim %422, dims = [0, 1, 2, 3] : (tensor<1x240x20x20xf32>) -> tensor<1x240x20x20xf32>
-    %424 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x240x20x20xf32>
-    %425 = stablehlo.divide %423, %424 : tensor<1x240x20x20xf32>
-    %426 = stablehlo.multiply %425, %415 : tensor<1x240x20x20xf32>
-    %427 = stablehlo.convolution(%426, %arg33) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x240x20x20xf32>, tensor<80x240x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %428 = stablehlo.broadcast_in_dim %427, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %429 = stablehlo.broadcast_in_dim %arg211, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %430 = stablehlo.subtract %428, %429 : tensor<1x80x20x20xf32>
-    %431 = stablehlo.broadcast_in_dim %430, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %432 = stablehlo.broadcast_in_dim %arg212, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %433 = stablehlo.multiply %431, %432 : tensor<1x80x20x20xf32>
-    %434 = stablehlo.broadcast_in_dim %433, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %435 = stablehlo.broadcast_in_dim %arg213, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %436 = stablehlo.multiply %434, %435 : tensor<1x80x20x20xf32>
-    %437 = stablehlo.broadcast_in_dim %436, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %438 = stablehlo.broadcast_in_dim %arg214, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %439 = stablehlo.add %437, %438 : tensor<1x80x20x20xf32>
-    %440 = stablehlo.convolution(%439, %arg34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x80x20x20xf32>, tensor<200x80x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %441 = stablehlo.broadcast_in_dim %440, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %442 = stablehlo.broadcast_in_dim %arg215, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %443 = stablehlo.subtract %441, %442 : tensor<1x200x20x20xf32>
-    %444 = stablehlo.broadcast_in_dim %443, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %445 = stablehlo.broadcast_in_dim %arg216, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %446 = stablehlo.multiply %444, %445 : tensor<1x200x20x20xf32>
-    %447 = stablehlo.broadcast_in_dim %446, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %448 = stablehlo.broadcast_in_dim %arg217, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %449 = stablehlo.multiply %447, %448 : tensor<1x200x20x20xf32>
-    %450 = stablehlo.broadcast_in_dim %449, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %451 = stablehlo.broadcast_in_dim %arg218, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %452 = stablehlo.add %450, %451 : tensor<1x200x20x20xf32>
-    %453 = stablehlo.broadcast_in_dim %452, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %454 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x200x20x20xf32>
-    %455 = stablehlo.add %453, %454 : tensor<1x200x20x20xf32>
-    %456 = stablehlo.maximum %455, %cst_15 : tensor<1x200x20x20xf32>
-    %457 = stablehlo.broadcast_in_dim %456, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %458 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x200x20x20xf32>
-    %459 = stablehlo.minimum %457, %458 : tensor<1x200x20x20xf32>
-    %460 = stablehlo.broadcast_in_dim %459, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %461 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x200x20x20xf32>
-    %462 = stablehlo.divide %460, %461 : tensor<1x200x20x20xf32>
-    %463 = stablehlo.multiply %462, %452 : tensor<1x200x20x20xf32>
-    %464 = stablehlo.convolution(%463, %arg35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 200 : i64} : (tensor<1x200x20x20xf32>, tensor<200x1x3x3xf32>) -> tensor<1x200x20x20xf32>
-    %465 = stablehlo.broadcast_in_dim %464, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %466 = stablehlo.broadcast_in_dim %arg219, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %467 = stablehlo.subtract %465, %466 : tensor<1x200x20x20xf32>
-    %468 = stablehlo.broadcast_in_dim %467, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %469 = stablehlo.broadcast_in_dim %arg220, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %470 = stablehlo.multiply %468, %469 : tensor<1x200x20x20xf32>
-    %471 = stablehlo.broadcast_in_dim %470, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %472 = stablehlo.broadcast_in_dim %arg221, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %473 = stablehlo.multiply %471, %472 : tensor<1x200x20x20xf32>
-    %474 = stablehlo.broadcast_in_dim %473, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %475 = stablehlo.broadcast_in_dim %arg222, dims = [1, 2, 3] : (tensor<200x1x1xf32>) -> tensor<1x200x20x20xf32>
-    %476 = stablehlo.add %474, %475 : tensor<1x200x20x20xf32>
-    %477 = stablehlo.broadcast_in_dim %476, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %478 = stablehlo.add %477, %454 : tensor<1x200x20x20xf32>
-    %479 = stablehlo.maximum %478, %cst_15 : tensor<1x200x20x20xf32>
-    %480 = stablehlo.broadcast_in_dim %479, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %481 = stablehlo.minimum %480, %458 : tensor<1x200x20x20xf32>
-    %482 = stablehlo.broadcast_in_dim %481, dims = [0, 1, 2, 3] : (tensor<1x200x20x20xf32>) -> tensor<1x200x20x20xf32>
-    %483 = stablehlo.divide %482, %461 : tensor<1x200x20x20xf32>
-    %484 = stablehlo.multiply %483, %476 : tensor<1x200x20x20xf32>
-    %485 = stablehlo.convolution(%484, %arg36) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x200x20x20xf32>, tensor<80x200x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %486 = stablehlo.broadcast_in_dim %485, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %487 = stablehlo.broadcast_in_dim %arg223, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %488 = stablehlo.subtract %486, %487 : tensor<1x80x20x20xf32>
-    %489 = stablehlo.broadcast_in_dim %488, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %490 = stablehlo.broadcast_in_dim %arg224, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %491 = stablehlo.multiply %489, %490 : tensor<1x80x20x20xf32>
-    %492 = stablehlo.broadcast_in_dim %491, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %493 = stablehlo.broadcast_in_dim %arg225, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %494 = stablehlo.multiply %492, %493 : tensor<1x80x20x20xf32>
-    %495 = stablehlo.broadcast_in_dim %494, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %496 = stablehlo.broadcast_in_dim %arg226, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %497 = stablehlo.add %495, %496 : tensor<1x80x20x20xf32>
-    %498 = stablehlo.add %497, %439 : tensor<1x80x20x20xf32>
-    %499 = stablehlo.convolution(%498, %arg37) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x80x20x20xf32>, tensor<184x80x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %500 = stablehlo.broadcast_in_dim %499, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %501 = stablehlo.broadcast_in_dim %arg227, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %502 = stablehlo.subtract %500, %501 : tensor<1x184x20x20xf32>
-    %503 = stablehlo.broadcast_in_dim %502, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %504 = stablehlo.broadcast_in_dim %arg228, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %505 = stablehlo.multiply %503, %504 : tensor<1x184x20x20xf32>
-    %506 = stablehlo.broadcast_in_dim %505, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %507 = stablehlo.broadcast_in_dim %arg229, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %508 = stablehlo.multiply %506, %507 : tensor<1x184x20x20xf32>
-    %509 = stablehlo.broadcast_in_dim %508, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %510 = stablehlo.broadcast_in_dim %arg230, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %511 = stablehlo.add %509, %510 : tensor<1x184x20x20xf32>
-    %512 = stablehlo.broadcast_in_dim %511, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %513 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x184x20x20xf32>
-    %514 = stablehlo.add %512, %513 : tensor<1x184x20x20xf32>
-    %515 = stablehlo.maximum %514, %cst_16 : tensor<1x184x20x20xf32>
-    %516 = stablehlo.broadcast_in_dim %515, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %517 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x184x20x20xf32>
-    %518 = stablehlo.minimum %516, %517 : tensor<1x184x20x20xf32>
-    %519 = stablehlo.broadcast_in_dim %518, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %520 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x184x20x20xf32>
-    %521 = stablehlo.divide %519, %520 : tensor<1x184x20x20xf32>
-    %522 = stablehlo.multiply %521, %511 : tensor<1x184x20x20xf32>
-    %523 = stablehlo.convolution(%522, %arg38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 184 : i64} : (tensor<1x184x20x20xf32>, tensor<184x1x3x3xf32>) -> tensor<1x184x20x20xf32>
-    %524 = stablehlo.broadcast_in_dim %523, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %525 = stablehlo.broadcast_in_dim %arg231, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %526 = stablehlo.subtract %524, %525 : tensor<1x184x20x20xf32>
-    %527 = stablehlo.broadcast_in_dim %526, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %528 = stablehlo.broadcast_in_dim %arg232, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %529 = stablehlo.multiply %527, %528 : tensor<1x184x20x20xf32>
-    %530 = stablehlo.broadcast_in_dim %529, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %531 = stablehlo.broadcast_in_dim %arg233, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %532 = stablehlo.multiply %530, %531 : tensor<1x184x20x20xf32>
-    %533 = stablehlo.broadcast_in_dim %532, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %534 = stablehlo.broadcast_in_dim %arg234, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %535 = stablehlo.add %533, %534 : tensor<1x184x20x20xf32>
-    %536 = stablehlo.broadcast_in_dim %535, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %537 = stablehlo.add %536, %513 : tensor<1x184x20x20xf32>
-    %538 = stablehlo.maximum %537, %cst_16 : tensor<1x184x20x20xf32>
-    %539 = stablehlo.broadcast_in_dim %538, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %540 = stablehlo.minimum %539, %517 : tensor<1x184x20x20xf32>
-    %541 = stablehlo.broadcast_in_dim %540, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %542 = stablehlo.divide %541, %520 : tensor<1x184x20x20xf32>
-    %543 = stablehlo.multiply %542, %535 : tensor<1x184x20x20xf32>
-    %544 = stablehlo.convolution(%543, %arg39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x184x20x20xf32>, tensor<80x184x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %545 = stablehlo.broadcast_in_dim %544, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %546 = stablehlo.broadcast_in_dim %arg235, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %547 = stablehlo.subtract %545, %546 : tensor<1x80x20x20xf32>
-    %548 = stablehlo.broadcast_in_dim %547, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %549 = stablehlo.broadcast_in_dim %arg236, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %550 = stablehlo.multiply %548, %549 : tensor<1x80x20x20xf32>
-    %551 = stablehlo.broadcast_in_dim %550, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %552 = stablehlo.broadcast_in_dim %arg237, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %553 = stablehlo.multiply %551, %552 : tensor<1x80x20x20xf32>
-    %554 = stablehlo.broadcast_in_dim %553, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %555 = stablehlo.broadcast_in_dim %arg238, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %556 = stablehlo.add %554, %555 : tensor<1x80x20x20xf32>
-    %557 = stablehlo.add %556, %498 : tensor<1x80x20x20xf32>
-    %558 = stablehlo.convolution(%557, %arg40) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x80x20x20xf32>, tensor<184x80x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %559 = stablehlo.broadcast_in_dim %558, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %560 = stablehlo.broadcast_in_dim %arg239, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %561 = stablehlo.subtract %559, %560 : tensor<1x184x20x20xf32>
-    %562 = stablehlo.broadcast_in_dim %561, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %563 = stablehlo.broadcast_in_dim %arg240, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %564 = stablehlo.multiply %562, %563 : tensor<1x184x20x20xf32>
-    %565 = stablehlo.broadcast_in_dim %564, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %566 = stablehlo.broadcast_in_dim %arg241, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %567 = stablehlo.multiply %565, %566 : tensor<1x184x20x20xf32>
-    %568 = stablehlo.broadcast_in_dim %567, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %569 = stablehlo.broadcast_in_dim %arg242, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %570 = stablehlo.add %568, %569 : tensor<1x184x20x20xf32>
-    %571 = stablehlo.broadcast_in_dim %570, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %572 = stablehlo.add %571, %513 : tensor<1x184x20x20xf32>
-    %573 = stablehlo.maximum %572, %cst_16 : tensor<1x184x20x20xf32>
-    %574 = stablehlo.broadcast_in_dim %573, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %575 = stablehlo.minimum %574, %517 : tensor<1x184x20x20xf32>
-    %576 = stablehlo.broadcast_in_dim %575, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %577 = stablehlo.divide %576, %520 : tensor<1x184x20x20xf32>
-    %578 = stablehlo.multiply %577, %570 : tensor<1x184x20x20xf32>
-    %579 = stablehlo.convolution(%578, %arg41) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 184 : i64} : (tensor<1x184x20x20xf32>, tensor<184x1x3x3xf32>) -> tensor<1x184x20x20xf32>
-    %580 = stablehlo.broadcast_in_dim %579, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %581 = stablehlo.broadcast_in_dim %arg243, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %582 = stablehlo.subtract %580, %581 : tensor<1x184x20x20xf32>
-    %583 = stablehlo.broadcast_in_dim %582, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %584 = stablehlo.broadcast_in_dim %arg244, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %585 = stablehlo.multiply %583, %584 : tensor<1x184x20x20xf32>
-    %586 = stablehlo.broadcast_in_dim %585, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %587 = stablehlo.broadcast_in_dim %arg245, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %588 = stablehlo.multiply %586, %587 : tensor<1x184x20x20xf32>
-    %589 = stablehlo.broadcast_in_dim %588, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %590 = stablehlo.broadcast_in_dim %arg246, dims = [1, 2, 3] : (tensor<184x1x1xf32>) -> tensor<1x184x20x20xf32>
-    %591 = stablehlo.add %589, %590 : tensor<1x184x20x20xf32>
-    %592 = stablehlo.broadcast_in_dim %591, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %593 = stablehlo.add %592, %513 : tensor<1x184x20x20xf32>
-    %594 = stablehlo.maximum %593, %cst_16 : tensor<1x184x20x20xf32>
-    %595 = stablehlo.broadcast_in_dim %594, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %596 = stablehlo.minimum %595, %517 : tensor<1x184x20x20xf32>
-    %597 = stablehlo.broadcast_in_dim %596, dims = [0, 1, 2, 3] : (tensor<1x184x20x20xf32>) -> tensor<1x184x20x20xf32>
-    %598 = stablehlo.divide %597, %520 : tensor<1x184x20x20xf32>
-    %599 = stablehlo.multiply %598, %591 : tensor<1x184x20x20xf32>
-    %600 = stablehlo.convolution(%599, %arg42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x184x20x20xf32>, tensor<80x184x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %601 = stablehlo.broadcast_in_dim %600, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %602 = stablehlo.broadcast_in_dim %arg247, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %603 = stablehlo.subtract %601, %602 : tensor<1x80x20x20xf32>
-    %604 = stablehlo.broadcast_in_dim %603, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %605 = stablehlo.broadcast_in_dim %arg248, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %606 = stablehlo.multiply %604, %605 : tensor<1x80x20x20xf32>
-    %607 = stablehlo.broadcast_in_dim %606, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %608 = stablehlo.broadcast_in_dim %arg249, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %609 = stablehlo.multiply %607, %608 : tensor<1x80x20x20xf32>
-    %610 = stablehlo.broadcast_in_dim %609, dims = [0, 1, 2, 3] : (tensor<1x80x20x20xf32>) -> tensor<1x80x20x20xf32>
-    %611 = stablehlo.broadcast_in_dim %arg250, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x20x20xf32>
-    %612 = stablehlo.add %610, %611 : tensor<1x80x20x20xf32>
-    %613 = stablehlo.add %612, %557 : tensor<1x80x20x20xf32>
-    %614 = stablehlo.convolution(%613, %arg43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x80x20x20xf32>, tensor<480x80x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %615 = stablehlo.broadcast_in_dim %614, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %616 = stablehlo.broadcast_in_dim %arg251, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %617 = stablehlo.subtract %615, %616 : tensor<1x480x20x20xf32>
-    %618 = stablehlo.broadcast_in_dim %617, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %619 = stablehlo.broadcast_in_dim %arg252, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %620 = stablehlo.multiply %618, %619 : tensor<1x480x20x20xf32>
-    %621 = stablehlo.broadcast_in_dim %620, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %622 = stablehlo.broadcast_in_dim %arg253, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %623 = stablehlo.multiply %621, %622 : tensor<1x480x20x20xf32>
-    %624 = stablehlo.broadcast_in_dim %623, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %625 = stablehlo.broadcast_in_dim %arg254, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %626 = stablehlo.add %624, %625 : tensor<1x480x20x20xf32>
-    %627 = stablehlo.broadcast_in_dim %626, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %628 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x480x20x20xf32>
-    %629 = stablehlo.add %627, %628 : tensor<1x480x20x20xf32>
-    %630 = stablehlo.maximum %629, %cst_17 : tensor<1x480x20x20xf32>
-    %631 = stablehlo.broadcast_in_dim %630, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %632 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x480x20x20xf32>
-    %633 = stablehlo.minimum %631, %632 : tensor<1x480x20x20xf32>
-    %634 = stablehlo.broadcast_in_dim %633, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %635 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x480x20x20xf32>
-    %636 = stablehlo.divide %634, %635 : tensor<1x480x20x20xf32>
-    %637 = stablehlo.multiply %636, %626 : tensor<1x480x20x20xf32>
-    %638 = stablehlo.convolution(%637, %arg44) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 480 : i64} : (tensor<1x480x20x20xf32>, tensor<480x1x3x3xf32>) -> tensor<1x480x20x20xf32>
-    %639 = stablehlo.broadcast_in_dim %638, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %640 = stablehlo.broadcast_in_dim %arg255, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %641 = stablehlo.subtract %639, %640 : tensor<1x480x20x20xf32>
-    %642 = stablehlo.broadcast_in_dim %641, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %643 = stablehlo.broadcast_in_dim %arg256, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %644 = stablehlo.multiply %642, %643 : tensor<1x480x20x20xf32>
-    %645 = stablehlo.broadcast_in_dim %644, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %646 = stablehlo.broadcast_in_dim %arg257, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %647 = stablehlo.multiply %645, %646 : tensor<1x480x20x20xf32>
-    %648 = stablehlo.broadcast_in_dim %647, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %649 = stablehlo.broadcast_in_dim %arg258, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %650 = stablehlo.add %648, %649 : tensor<1x480x20x20xf32>
-    %651 = stablehlo.broadcast_in_dim %650, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %652 = stablehlo.add %651, %628 : tensor<1x480x20x20xf32>
-    %653 = stablehlo.maximum %652, %cst_17 : tensor<1x480x20x20xf32>
-    %654 = stablehlo.broadcast_in_dim %653, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %655 = stablehlo.minimum %654, %632 : tensor<1x480x20x20xf32>
-    %656 = stablehlo.broadcast_in_dim %655, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %657 = stablehlo.divide %656, %635 : tensor<1x480x20x20xf32>
-    %658 = stablehlo.multiply %657, %650 : tensor<1x480x20x20xf32>
-    %659 = stablehlo.reduce(%658 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x480x20x20xf32>, tensor<f32>) -> tensor<1x480xf32>
-    %660 = stablehlo.reshape %659 : (tensor<1x480xf32>) -> tensor<1x480x1x1xf32>
-    %661 = stablehlo.convert %cst_26 : (tensor<1xi64>) -> tensor<1xf32>
-    %662 = stablehlo.reshape %661 : (tensor<1xf32>) -> tensor<f32>
-    %663 = stablehlo.broadcast_in_dim %660, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %664 = stablehlo.broadcast_in_dim %662, dims = [] : (tensor<f32>) -> tensor<1x480x1x1xf32>
-    %665 = stablehlo.divide %663, %664 : tensor<1x480x1x1xf32>
-    %666 = stablehlo.convolution(%665, %arg45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x1x1xf32>, tensor<120x480x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %667 = stablehlo.reshape %arg46 : (tensor<120xf32>) -> tensor<120x1x1xf32>
-    %668 = stablehlo.broadcast_in_dim %666, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %669 = stablehlo.broadcast_in_dim %667, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %670 = stablehlo.add %668, %669 : tensor<1x120x1x1xf32>
-    %671 = stablehlo.maximum %670, %cst_18 : tensor<1x120x1x1xf32>
-    %672 = stablehlo.convolution(%671, %arg47) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x120x1x1xf32>, tensor<480x120x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %673 = stablehlo.reshape %arg48 : (tensor<480xf32>) -> tensor<480x1x1xf32>
-    %674 = stablehlo.broadcast_in_dim %672, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %675 = stablehlo.broadcast_in_dim %673, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %676 = stablehlo.add %674, %675 : tensor<1x480x1x1xf32>
-    %677 = stablehlo.broadcast_in_dim %676, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %678 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x480x1x1xf32>
-    %679 = stablehlo.add %677, %678 : tensor<1x480x1x1xf32>
-    %680 = stablehlo.broadcast_in_dim %679, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %681 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x480x1x1xf32>
-    %682 = stablehlo.divide %680, %681 : tensor<1x480x1x1xf32>
-    %683 = stablehlo.broadcast_in_dim %215, dims = [] : (tensor<f32>) -> tensor<1x480x1x1xf32>
-    %684 = stablehlo.broadcast_in_dim %682, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %685 = stablehlo.minimum %683, %684 : tensor<1x480x1x1xf32>
-    %686 = stablehlo.broadcast_in_dim %19, dims = [] : (tensor<f32>) -> tensor<1x480x1x1xf32>
-    %687 = stablehlo.broadcast_in_dim %685, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %688 = stablehlo.maximum %686, %687 : tensor<1x480x1x1xf32>
-    %689 = stablehlo.broadcast_in_dim %688, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x20x20xf32>
-    %690 = stablehlo.broadcast_in_dim %658, dims = [0, 1, 2, 3] : (tensor<1x480x20x20xf32>) -> tensor<1x480x20x20xf32>
-    %691 = stablehlo.multiply %689, %690 : tensor<1x480x20x20xf32>
-    %692 = stablehlo.convolution(%691, %arg49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x20x20xf32>, tensor<112x480x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %693 = stablehlo.broadcast_in_dim %692, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %694 = stablehlo.broadcast_in_dim %arg259, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %695 = stablehlo.subtract %693, %694 : tensor<1x112x20x20xf32>
-    %696 = stablehlo.broadcast_in_dim %695, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %697 = stablehlo.broadcast_in_dim %arg260, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %698 = stablehlo.multiply %696, %697 : tensor<1x112x20x20xf32>
-    %699 = stablehlo.broadcast_in_dim %698, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %700 = stablehlo.broadcast_in_dim %arg261, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %701 = stablehlo.multiply %699, %700 : tensor<1x112x20x20xf32>
-    %702 = stablehlo.broadcast_in_dim %701, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %703 = stablehlo.broadcast_in_dim %arg262, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %704 = stablehlo.add %702, %703 : tensor<1x112x20x20xf32>
-    %705 = stablehlo.convolution(%704, %arg50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x112x20x20xf32>, tensor<672x112x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %706 = stablehlo.broadcast_in_dim %705, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %707 = stablehlo.broadcast_in_dim %arg263, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %708 = stablehlo.subtract %706, %707 : tensor<1x672x20x20xf32>
-    %709 = stablehlo.broadcast_in_dim %708, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %710 = stablehlo.broadcast_in_dim %arg264, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %711 = stablehlo.multiply %709, %710 : tensor<1x672x20x20xf32>
-    %712 = stablehlo.broadcast_in_dim %711, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %713 = stablehlo.broadcast_in_dim %arg265, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %714 = stablehlo.multiply %712, %713 : tensor<1x672x20x20xf32>
-    %715 = stablehlo.broadcast_in_dim %714, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %716 = stablehlo.broadcast_in_dim %arg266, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %717 = stablehlo.add %715, %716 : tensor<1x672x20x20xf32>
-    %718 = stablehlo.broadcast_in_dim %717, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %719 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x672x20x20xf32>
-    %720 = stablehlo.add %718, %719 : tensor<1x672x20x20xf32>
-    %721 = stablehlo.maximum %720, %cst_19 : tensor<1x672x20x20xf32>
-    %722 = stablehlo.broadcast_in_dim %721, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %723 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x672x20x20xf32>
-    %724 = stablehlo.minimum %722, %723 : tensor<1x672x20x20xf32>
-    %725 = stablehlo.broadcast_in_dim %724, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %726 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x672x20x20xf32>
-    %727 = stablehlo.divide %725, %726 : tensor<1x672x20x20xf32>
-    %728 = stablehlo.multiply %727, %717 : tensor<1x672x20x20xf32>
-    %729 = stablehlo.convolution(%728, %arg51) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 672 : i64} : (tensor<1x672x20x20xf32>, tensor<672x1x3x3xf32>) -> tensor<1x672x20x20xf32>
-    %730 = stablehlo.broadcast_in_dim %729, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %731 = stablehlo.broadcast_in_dim %arg267, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %732 = stablehlo.subtract %730, %731 : tensor<1x672x20x20xf32>
-    %733 = stablehlo.broadcast_in_dim %732, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %734 = stablehlo.broadcast_in_dim %arg268, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %735 = stablehlo.multiply %733, %734 : tensor<1x672x20x20xf32>
-    %736 = stablehlo.broadcast_in_dim %735, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %737 = stablehlo.broadcast_in_dim %arg269, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %738 = stablehlo.multiply %736, %737 : tensor<1x672x20x20xf32>
-    %739 = stablehlo.broadcast_in_dim %738, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %740 = stablehlo.broadcast_in_dim %arg270, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %741 = stablehlo.add %739, %740 : tensor<1x672x20x20xf32>
-    %742 = stablehlo.broadcast_in_dim %741, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %743 = stablehlo.add %742, %719 : tensor<1x672x20x20xf32>
-    %744 = stablehlo.maximum %743, %cst_19 : tensor<1x672x20x20xf32>
-    %745 = stablehlo.broadcast_in_dim %744, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %746 = stablehlo.minimum %745, %723 : tensor<1x672x20x20xf32>
-    %747 = stablehlo.broadcast_in_dim %746, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %748 = stablehlo.divide %747, %726 : tensor<1x672x20x20xf32>
-    %749 = stablehlo.multiply %748, %741 : tensor<1x672x20x20xf32>
-    %750 = stablehlo.reduce(%749 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x672x20x20xf32>, tensor<f32>) -> tensor<1x672xf32>
-    %751 = stablehlo.reshape %750 : (tensor<1x672xf32>) -> tensor<1x672x1x1xf32>
-    %752 = stablehlo.broadcast_in_dim %751, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %753 = stablehlo.broadcast_in_dim %662, dims = [] : (tensor<f32>) -> tensor<1x672x1x1xf32>
-    %754 = stablehlo.divide %752, %753 : tensor<1x672x1x1xf32>
-    %755 = stablehlo.convolution(%754, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x672x1x1xf32>, tensor<168x672x1x1xf32>) -> tensor<1x168x1x1xf32>
-    %756 = stablehlo.reshape %arg53 : (tensor<168xf32>) -> tensor<168x1x1xf32>
-    %757 = stablehlo.broadcast_in_dim %755, dims = [0, 1, 2, 3] : (tensor<1x168x1x1xf32>) -> tensor<1x168x1x1xf32>
-    %758 = stablehlo.broadcast_in_dim %756, dims = [1, 2, 3] : (tensor<168x1x1xf32>) -> tensor<1x168x1x1xf32>
-    %759 = stablehlo.add %757, %758 : tensor<1x168x1x1xf32>
-    %760 = stablehlo.maximum %759, %cst_20 : tensor<1x168x1x1xf32>
-    %761 = stablehlo.convolution(%760, %arg54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x168x1x1xf32>, tensor<672x168x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %762 = stablehlo.reshape %arg55 : (tensor<672xf32>) -> tensor<672x1x1xf32>
-    %763 = stablehlo.broadcast_in_dim %761, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %764 = stablehlo.broadcast_in_dim %762, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %765 = stablehlo.add %763, %764 : tensor<1x672x1x1xf32>
-    %766 = stablehlo.broadcast_in_dim %765, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %767 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x672x1x1xf32>
-    %768 = stablehlo.add %766, %767 : tensor<1x672x1x1xf32>
-    %769 = stablehlo.broadcast_in_dim %768, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %770 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x672x1x1xf32>
-    %771 = stablehlo.divide %769, %770 : tensor<1x672x1x1xf32>
-    %772 = stablehlo.broadcast_in_dim %215, dims = [] : (tensor<f32>) -> tensor<1x672x1x1xf32>
-    %773 = stablehlo.broadcast_in_dim %771, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %774 = stablehlo.minimum %772, %773 : tensor<1x672x1x1xf32>
-    %775 = stablehlo.broadcast_in_dim %19, dims = [] : (tensor<f32>) -> tensor<1x672x1x1xf32>
-    %776 = stablehlo.broadcast_in_dim %774, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %777 = stablehlo.maximum %775, %776 : tensor<1x672x1x1xf32>
-    %778 = stablehlo.broadcast_in_dim %777, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %779 = stablehlo.broadcast_in_dim %749, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %780 = stablehlo.multiply %778, %779 : tensor<1x672x20x20xf32>
-    %781 = stablehlo.convolution(%780, %arg56) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x672x20x20xf32>, tensor<112x672x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %782 = stablehlo.broadcast_in_dim %781, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %783 = stablehlo.broadcast_in_dim %arg271, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %784 = stablehlo.subtract %782, %783 : tensor<1x112x20x20xf32>
-    %785 = stablehlo.broadcast_in_dim %784, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %786 = stablehlo.broadcast_in_dim %arg272, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %787 = stablehlo.multiply %785, %786 : tensor<1x112x20x20xf32>
-    %788 = stablehlo.broadcast_in_dim %787, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %789 = stablehlo.broadcast_in_dim %arg273, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %790 = stablehlo.multiply %788, %789 : tensor<1x112x20x20xf32>
-    %791 = stablehlo.broadcast_in_dim %790, dims = [0, 1, 2, 3] : (tensor<1x112x20x20xf32>) -> tensor<1x112x20x20xf32>
-    %792 = stablehlo.broadcast_in_dim %arg274, dims = [1, 2, 3] : (tensor<112x1x1xf32>) -> tensor<1x112x20x20xf32>
-    %793 = stablehlo.add %791, %792 : tensor<1x112x20x20xf32>
-    %794 = stablehlo.add %793, %704 : tensor<1x112x20x20xf32>
-    %795 = stablehlo.convolution(%794, %arg57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x112x20x20xf32>, tensor<672x112x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %796 = stablehlo.broadcast_in_dim %795, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %797 = stablehlo.broadcast_in_dim %arg275, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %798 = stablehlo.subtract %796, %797 : tensor<1x672x20x20xf32>
-    %799 = stablehlo.broadcast_in_dim %798, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %800 = stablehlo.broadcast_in_dim %arg276, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %801 = stablehlo.multiply %799, %800 : tensor<1x672x20x20xf32>
-    %802 = stablehlo.broadcast_in_dim %801, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %803 = stablehlo.broadcast_in_dim %arg277, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %804 = stablehlo.multiply %802, %803 : tensor<1x672x20x20xf32>
-    %805 = stablehlo.broadcast_in_dim %804, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %806 = stablehlo.broadcast_in_dim %arg278, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %807 = stablehlo.add %805, %806 : tensor<1x672x20x20xf32>
-    %808 = stablehlo.broadcast_in_dim %807, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %809 = stablehlo.add %808, %719 : tensor<1x672x20x20xf32>
-    %810 = stablehlo.maximum %809, %cst_19 : tensor<1x672x20x20xf32>
-    %811 = stablehlo.broadcast_in_dim %810, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %812 = stablehlo.minimum %811, %723 : tensor<1x672x20x20xf32>
-    %813 = stablehlo.broadcast_in_dim %812, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %814 = stablehlo.divide %813, %726 : tensor<1x672x20x20xf32>
-    %815 = stablehlo.multiply %814, %807 : tensor<1x672x20x20xf32>
-    %816 = stablehlo.convolution(%815, %arg58) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[2, 2], [2, 2]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 672 : i64} : (tensor<1x672x20x20xf32>, tensor<672x1x5x5xf32>) -> tensor<1x672x10x10xf32>
-    %817 = stablehlo.broadcast_in_dim %816, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %818 = stablehlo.broadcast_in_dim %arg279, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x10x10xf32>
-    %819 = stablehlo.subtract %817, %818 : tensor<1x672x10x10xf32>
-    %820 = stablehlo.broadcast_in_dim %819, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %821 = stablehlo.broadcast_in_dim %arg280, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x10x10xf32>
-    %822 = stablehlo.multiply %820, %821 : tensor<1x672x10x10xf32>
-    %823 = stablehlo.broadcast_in_dim %822, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %824 = stablehlo.broadcast_in_dim %arg281, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x10x10xf32>
-    %825 = stablehlo.multiply %823, %824 : tensor<1x672x10x10xf32>
-    %826 = stablehlo.broadcast_in_dim %825, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %827 = stablehlo.broadcast_in_dim %arg282, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x10x10xf32>
-    %828 = stablehlo.add %826, %827 : tensor<1x672x10x10xf32>
-    %829 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %830 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x672x10x10xf32>
-    %831 = stablehlo.add %829, %830 : tensor<1x672x10x10xf32>
-    %832 = stablehlo.maximum %831, %cst_21 : tensor<1x672x10x10xf32>
-    %833 = stablehlo.broadcast_in_dim %832, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %834 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x672x10x10xf32>
-    %835 = stablehlo.minimum %833, %834 : tensor<1x672x10x10xf32>
-    %836 = stablehlo.broadcast_in_dim %835, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %837 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x672x10x10xf32>
-    %838 = stablehlo.divide %836, %837 : tensor<1x672x10x10xf32>
-    %839 = stablehlo.multiply %838, %828 : tensor<1x672x10x10xf32>
-    %840 = stablehlo.reduce(%839 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x672x10x10xf32>, tensor<f32>) -> tensor<1x672xf32>
-    %841 = stablehlo.reshape %840 : (tensor<1x672xf32>) -> tensor<1x672x1x1xf32>
-    %842 = stablehlo.convert %cst_27 : (tensor<1xi64>) -> tensor<1xf32>
-    %843 = stablehlo.reshape %842 : (tensor<1xf32>) -> tensor<f32>
-    %844 = stablehlo.broadcast_in_dim %841, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %845 = stablehlo.broadcast_in_dim %843, dims = [] : (tensor<f32>) -> tensor<1x672x1x1xf32>
-    %846 = stablehlo.divide %844, %845 : tensor<1x672x1x1xf32>
-    %847 = stablehlo.convolution(%846, %arg59) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x672x1x1xf32>, tensor<168x672x1x1xf32>) -> tensor<1x168x1x1xf32>
-    %848 = stablehlo.reshape %arg60 : (tensor<168xf32>) -> tensor<168x1x1xf32>
-    %849 = stablehlo.broadcast_in_dim %847, dims = [0, 1, 2, 3] : (tensor<1x168x1x1xf32>) -> tensor<1x168x1x1xf32>
-    %850 = stablehlo.broadcast_in_dim %848, dims = [1, 2, 3] : (tensor<168x1x1xf32>) -> tensor<1x168x1x1xf32>
-    %851 = stablehlo.add %849, %850 : tensor<1x168x1x1xf32>
-    %852 = stablehlo.maximum %851, %cst_20 : tensor<1x168x1x1xf32>
-    %853 = stablehlo.convolution(%852, %arg61) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x168x1x1xf32>, tensor<672x168x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %854 = stablehlo.reshape %arg62 : (tensor<672xf32>) -> tensor<672x1x1xf32>
-    %855 = stablehlo.broadcast_in_dim %853, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %856 = stablehlo.broadcast_in_dim %854, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %857 = stablehlo.add %855, %856 : tensor<1x672x1x1xf32>
-    %858 = stablehlo.broadcast_in_dim %857, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %859 = stablehlo.add %858, %767 : tensor<1x672x1x1xf32>
-    %860 = stablehlo.broadcast_in_dim %859, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %861 = stablehlo.divide %860, %770 : tensor<1x672x1x1xf32>
-    %862 = stablehlo.broadcast_in_dim %861, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %863 = stablehlo.minimum %772, %862 : tensor<1x672x1x1xf32>
-    %864 = stablehlo.broadcast_in_dim %863, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x1x1xf32>
-    %865 = stablehlo.maximum %775, %864 : tensor<1x672x1x1xf32>
-    %866 = stablehlo.broadcast_in_dim %865, dims = [0, 1, 2, 3] : (tensor<1x672x1x1xf32>) -> tensor<1x672x10x10xf32>
-    %867 = stablehlo.broadcast_in_dim %839, dims = [0, 1, 2, 3] : (tensor<1x672x10x10xf32>) -> tensor<1x672x10x10xf32>
-    %868 = stablehlo.multiply %866, %867 : tensor<1x672x10x10xf32>
-    %869 = stablehlo.convolution(%868, %arg63) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x672x10x10xf32>, tensor<80x672x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %870 = stablehlo.broadcast_in_dim %869, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %871 = stablehlo.broadcast_in_dim %arg283, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %872 = stablehlo.subtract %870, %871 : tensor<1x80x10x10xf32>
-    %873 = stablehlo.broadcast_in_dim %872, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %874 = stablehlo.broadcast_in_dim %arg284, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %875 = stablehlo.multiply %873, %874 : tensor<1x80x10x10xf32>
-    %876 = stablehlo.broadcast_in_dim %875, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %877 = stablehlo.broadcast_in_dim %arg285, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %878 = stablehlo.multiply %876, %877 : tensor<1x80x10x10xf32>
-    %879 = stablehlo.broadcast_in_dim %878, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %880 = stablehlo.broadcast_in_dim %arg286, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %881 = stablehlo.add %879, %880 : tensor<1x80x10x10xf32>
-    %882 = stablehlo.convolution(%881, %arg64) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x80x10x10xf32>, tensor<480x80x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %883 = stablehlo.broadcast_in_dim %882, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %884 = stablehlo.broadcast_in_dim %arg287, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %885 = stablehlo.subtract %883, %884 : tensor<1x480x10x10xf32>
-    %886 = stablehlo.broadcast_in_dim %885, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %887 = stablehlo.broadcast_in_dim %arg288, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %888 = stablehlo.multiply %886, %887 : tensor<1x480x10x10xf32>
-    %889 = stablehlo.broadcast_in_dim %888, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %890 = stablehlo.broadcast_in_dim %arg289, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %891 = stablehlo.multiply %889, %890 : tensor<1x480x10x10xf32>
-    %892 = stablehlo.broadcast_in_dim %891, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %893 = stablehlo.broadcast_in_dim %arg290, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %894 = stablehlo.add %892, %893 : tensor<1x480x10x10xf32>
-    %895 = stablehlo.broadcast_in_dim %894, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %896 = stablehlo.broadcast_in_dim %37, dims = [] : (tensor<f32>) -> tensor<1x480x10x10xf32>
-    %897 = stablehlo.add %895, %896 : tensor<1x480x10x10xf32>
-    %898 = stablehlo.maximum %897, %cst_22 : tensor<1x480x10x10xf32>
-    %899 = stablehlo.broadcast_in_dim %898, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %900 = stablehlo.broadcast_in_dim %42, dims = [] : (tensor<f32>) -> tensor<1x480x10x10xf32>
-    %901 = stablehlo.minimum %899, %900 : tensor<1x480x10x10xf32>
-    %902 = stablehlo.broadcast_in_dim %901, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %903 = stablehlo.broadcast_in_dim %47, dims = [] : (tensor<f32>) -> tensor<1x480x10x10xf32>
-    %904 = stablehlo.divide %902, %903 : tensor<1x480x10x10xf32>
-    %905 = stablehlo.multiply %904, %894 : tensor<1x480x10x10xf32>
-    %906 = stablehlo.convolution(%905, %arg65) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 480 : i64} : (tensor<1x480x10x10xf32>, tensor<480x1x5x5xf32>) -> tensor<1x480x10x10xf32>
-    %907 = stablehlo.broadcast_in_dim %906, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %908 = stablehlo.broadcast_in_dim %arg291, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %909 = stablehlo.subtract %907, %908 : tensor<1x480x10x10xf32>
-    %910 = stablehlo.broadcast_in_dim %909, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %911 = stablehlo.broadcast_in_dim %arg292, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %912 = stablehlo.multiply %910, %911 : tensor<1x480x10x10xf32>
-    %913 = stablehlo.broadcast_in_dim %912, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %914 = stablehlo.broadcast_in_dim %arg293, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %915 = stablehlo.multiply %913, %914 : tensor<1x480x10x10xf32>
-    %916 = stablehlo.broadcast_in_dim %915, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %917 = stablehlo.broadcast_in_dim %arg294, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %918 = stablehlo.add %916, %917 : tensor<1x480x10x10xf32>
-    %919 = stablehlo.broadcast_in_dim %918, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %920 = stablehlo.add %919, %896 : tensor<1x480x10x10xf32>
-    %921 = stablehlo.maximum %920, %cst_22 : tensor<1x480x10x10xf32>
-    %922 = stablehlo.broadcast_in_dim %921, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %923 = stablehlo.minimum %922, %900 : tensor<1x480x10x10xf32>
-    %924 = stablehlo.broadcast_in_dim %923, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %925 = stablehlo.divide %924, %903 : tensor<1x480x10x10xf32>
-    %926 = stablehlo.multiply %925, %918 : tensor<1x480x10x10xf32>
-    %927 = stablehlo.reduce(%926 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x480x10x10xf32>, tensor<f32>) -> tensor<1x480xf32>
-    %928 = stablehlo.reshape %927 : (tensor<1x480xf32>) -> tensor<1x480x1x1xf32>
-    %929 = stablehlo.broadcast_in_dim %928, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %930 = stablehlo.broadcast_in_dim %843, dims = [] : (tensor<f32>) -> tensor<1x480x1x1xf32>
-    %931 = stablehlo.divide %929, %930 : tensor<1x480x1x1xf32>
-    %932 = stablehlo.convolution(%931, %arg66) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x1x1xf32>, tensor<120x480x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %933 = stablehlo.reshape %arg67 : (tensor<120xf32>) -> tensor<120x1x1xf32>
-    %934 = stablehlo.broadcast_in_dim %932, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %935 = stablehlo.broadcast_in_dim %933, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %936 = stablehlo.add %934, %935 : tensor<1x120x1x1xf32>
-    %937 = stablehlo.maximum %936, %cst_18 : tensor<1x120x1x1xf32>
-    %938 = stablehlo.convolution(%937, %arg68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x120x1x1xf32>, tensor<480x120x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %939 = stablehlo.reshape %arg69 : (tensor<480xf32>) -> tensor<480x1x1xf32>
-    %940 = stablehlo.broadcast_in_dim %938, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %941 = stablehlo.broadcast_in_dim %939, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %942 = stablehlo.add %940, %941 : tensor<1x480x1x1xf32>
-    %943 = stablehlo.broadcast_in_dim %942, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %944 = stablehlo.add %943, %678 : tensor<1x480x1x1xf32>
-    %945 = stablehlo.broadcast_in_dim %944, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %946 = stablehlo.divide %945, %681 : tensor<1x480x1x1xf32>
-    %947 = stablehlo.broadcast_in_dim %946, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %948 = stablehlo.minimum %683, %947 : tensor<1x480x1x1xf32>
-    %949 = stablehlo.broadcast_in_dim %948, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %950 = stablehlo.maximum %686, %949 : tensor<1x480x1x1xf32>
-    %951 = stablehlo.broadcast_in_dim %950, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %952 = stablehlo.broadcast_in_dim %926, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %953 = stablehlo.multiply %951, %952 : tensor<1x480x10x10xf32>
-    %954 = stablehlo.convolution(%953, %arg70) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x10x10xf32>, tensor<80x480x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %955 = stablehlo.broadcast_in_dim %954, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %956 = stablehlo.broadcast_in_dim %arg295, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %957 = stablehlo.subtract %955, %956 : tensor<1x80x10x10xf32>
-    %958 = stablehlo.broadcast_in_dim %957, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %959 = stablehlo.broadcast_in_dim %arg296, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %960 = stablehlo.multiply %958, %959 : tensor<1x80x10x10xf32>
-    %961 = stablehlo.broadcast_in_dim %960, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %962 = stablehlo.broadcast_in_dim %arg297, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %963 = stablehlo.multiply %961, %962 : tensor<1x80x10x10xf32>
-    %964 = stablehlo.broadcast_in_dim %963, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %965 = stablehlo.broadcast_in_dim %arg298, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %966 = stablehlo.add %964, %965 : tensor<1x80x10x10xf32>
-    %967 = stablehlo.add %966, %881 : tensor<1x80x10x10xf32>
-    %968 = stablehlo.convolution(%967, %arg71) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x80x10x10xf32>, tensor<480x80x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %969 = stablehlo.broadcast_in_dim %968, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %970 = stablehlo.broadcast_in_dim %arg299, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %971 = stablehlo.subtract %969, %970 : tensor<1x480x10x10xf32>
-    %972 = stablehlo.broadcast_in_dim %971, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %973 = stablehlo.broadcast_in_dim %arg300, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %974 = stablehlo.multiply %972, %973 : tensor<1x480x10x10xf32>
-    %975 = stablehlo.broadcast_in_dim %974, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %976 = stablehlo.broadcast_in_dim %arg301, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %977 = stablehlo.multiply %975, %976 : tensor<1x480x10x10xf32>
-    %978 = stablehlo.broadcast_in_dim %977, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %979 = stablehlo.broadcast_in_dim %arg302, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %980 = stablehlo.add %978, %979 : tensor<1x480x10x10xf32>
-    %981 = stablehlo.broadcast_in_dim %980, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %982 = stablehlo.add %981, %896 : tensor<1x480x10x10xf32>
-    %983 = stablehlo.maximum %982, %cst_22 : tensor<1x480x10x10xf32>
-    %984 = stablehlo.broadcast_in_dim %983, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %985 = stablehlo.minimum %984, %900 : tensor<1x480x10x10xf32>
-    %986 = stablehlo.broadcast_in_dim %985, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %987 = stablehlo.divide %986, %903 : tensor<1x480x10x10xf32>
-    %988 = stablehlo.multiply %987, %980 : tensor<1x480x10x10xf32>
-    %989 = stablehlo.convolution(%988, %arg72) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 480 : i64} : (tensor<1x480x10x10xf32>, tensor<480x1x5x5xf32>) -> tensor<1x480x10x10xf32>
-    %990 = stablehlo.broadcast_in_dim %989, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %991 = stablehlo.broadcast_in_dim %arg303, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %992 = stablehlo.subtract %990, %991 : tensor<1x480x10x10xf32>
-    %993 = stablehlo.broadcast_in_dim %992, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %994 = stablehlo.broadcast_in_dim %arg304, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %995 = stablehlo.multiply %993, %994 : tensor<1x480x10x10xf32>
-    %996 = stablehlo.broadcast_in_dim %995, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %997 = stablehlo.broadcast_in_dim %arg305, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %998 = stablehlo.multiply %996, %997 : tensor<1x480x10x10xf32>
-    %999 = stablehlo.broadcast_in_dim %998, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1000 = stablehlo.broadcast_in_dim %arg306, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1001 = stablehlo.add %999, %1000 : tensor<1x480x10x10xf32>
-    %1002 = stablehlo.broadcast_in_dim %1001, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1003 = stablehlo.add %1002, %896 : tensor<1x480x10x10xf32>
-    %1004 = stablehlo.maximum %1003, %cst_22 : tensor<1x480x10x10xf32>
-    %1005 = stablehlo.broadcast_in_dim %1004, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1006 = stablehlo.minimum %1005, %900 : tensor<1x480x10x10xf32>
-    %1007 = stablehlo.broadcast_in_dim %1006, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1008 = stablehlo.divide %1007, %903 : tensor<1x480x10x10xf32>
-    %1009 = stablehlo.multiply %1008, %1001 : tensor<1x480x10x10xf32>
-    %1010 = stablehlo.reduce(%1009 init: %cst_9) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x480x10x10xf32>, tensor<f32>) -> tensor<1x480xf32>
-    %1011 = stablehlo.reshape %1010 : (tensor<1x480xf32>) -> tensor<1x480x1x1xf32>
-    %1012 = stablehlo.broadcast_in_dim %1011, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1013 = stablehlo.divide %1012, %930 : tensor<1x480x1x1xf32>
-    %1014 = stablehlo.convolution(%1013, %arg73) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x1x1xf32>, tensor<120x480x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %1015 = stablehlo.reshape %arg74 : (tensor<120xf32>) -> tensor<120x1x1xf32>
-    %1016 = stablehlo.broadcast_in_dim %1014, dims = [0, 1, 2, 3] : (tensor<1x120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %1017 = stablehlo.broadcast_in_dim %1015, dims = [1, 2, 3] : (tensor<120x1x1xf32>) -> tensor<1x120x1x1xf32>
-    %1018 = stablehlo.add %1016, %1017 : tensor<1x120x1x1xf32>
-    %1019 = stablehlo.maximum %1018, %cst_18 : tensor<1x120x1x1xf32>
-    %1020 = stablehlo.convolution(%1019, %arg75) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x120x1x1xf32>, tensor<480x120x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1021 = stablehlo.reshape %arg76 : (tensor<480xf32>) -> tensor<480x1x1xf32>
-    %1022 = stablehlo.broadcast_in_dim %1020, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1023 = stablehlo.broadcast_in_dim %1021, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1024 = stablehlo.add %1022, %1023 : tensor<1x480x1x1xf32>
-    %1025 = stablehlo.broadcast_in_dim %1024, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1026 = stablehlo.add %1025, %678 : tensor<1x480x1x1xf32>
-    %1027 = stablehlo.broadcast_in_dim %1026, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1028 = stablehlo.divide %1027, %681 : tensor<1x480x1x1xf32>
-    %1029 = stablehlo.broadcast_in_dim %1028, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1030 = stablehlo.minimum %683, %1029 : tensor<1x480x1x1xf32>
-    %1031 = stablehlo.broadcast_in_dim %1030, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x1x1xf32>
-    %1032 = stablehlo.maximum %686, %1031 : tensor<1x480x1x1xf32>
-    %1033 = stablehlo.broadcast_in_dim %1032, dims = [0, 1, 2, 3] : (tensor<1x480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1034 = stablehlo.broadcast_in_dim %1009, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1035 = stablehlo.multiply %1033, %1034 : tensor<1x480x10x10xf32>
-    %1036 = stablehlo.convolution(%1035, %arg77) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x10x10xf32>, tensor<80x480x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %1037 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %1038 = stablehlo.broadcast_in_dim %arg307, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %1039 = stablehlo.subtract %1037, %1038 : tensor<1x80x10x10xf32>
-    %1040 = stablehlo.broadcast_in_dim %1039, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %1041 = stablehlo.broadcast_in_dim %arg308, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %1042 = stablehlo.multiply %1040, %1041 : tensor<1x80x10x10xf32>
-    %1043 = stablehlo.broadcast_in_dim %1042, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %1044 = stablehlo.broadcast_in_dim %arg309, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %1045 = stablehlo.multiply %1043, %1044 : tensor<1x80x10x10xf32>
-    %1046 = stablehlo.broadcast_in_dim %1045, dims = [0, 1, 2, 3] : (tensor<1x80x10x10xf32>) -> tensor<1x80x10x10xf32>
-    %1047 = stablehlo.broadcast_in_dim %arg310, dims = [1, 2, 3] : (tensor<80x1x1xf32>) -> tensor<1x80x10x10xf32>
-    %1048 = stablehlo.add %1046, %1047 : tensor<1x80x10x10xf32>
-    %1049 = stablehlo.add %1048, %967 : tensor<1x80x10x10xf32>
-    %1050 = stablehlo.convolution(%1049, %arg78) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x80x10x10xf32>, tensor<480x80x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1051 = stablehlo.broadcast_in_dim %1050, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1052 = stablehlo.broadcast_in_dim %arg311, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1053 = stablehlo.subtract %1051, %1052 : tensor<1x480x10x10xf32>
-    %1054 = stablehlo.broadcast_in_dim %1053, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1055 = stablehlo.broadcast_in_dim %arg312, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1056 = stablehlo.multiply %1054, %1055 : tensor<1x480x10x10xf32>
-    %1057 = stablehlo.broadcast_in_dim %1056, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1058 = stablehlo.broadcast_in_dim %arg313, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1059 = stablehlo.multiply %1057, %1058 : tensor<1x480x10x10xf32>
-    %1060 = stablehlo.broadcast_in_dim %1059, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1061 = stablehlo.broadcast_in_dim %arg314, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1062 = stablehlo.add %1060, %1061 : tensor<1x480x10x10xf32>
-    %1063 = stablehlo.broadcast_in_dim %1062, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1064 = stablehlo.add %1063, %896 : tensor<1x480x10x10xf32>
-    %1065 = stablehlo.maximum %1064, %cst_22 : tensor<1x480x10x10xf32>
-    %1066 = stablehlo.broadcast_in_dim %1065, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1067 = stablehlo.minimum %1066, %900 : tensor<1x480x10x10xf32>
-    %1068 = stablehlo.broadcast_in_dim %1067, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1069 = stablehlo.divide %1068, %903 : tensor<1x480x10x10xf32>
-    %1070 = stablehlo.multiply %1069, %1062 : tensor<1x480x10x10xf32>
-    %1071 = stablehlo.convolution(%1070, %arg79) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x10x10xf32>, tensor<256x480x1x1xf32>) -> tensor<1x256x10x10xf32>
-    %1072 = stablehlo.broadcast_in_dim %1071, dims = [0, 1, 2, 3] : (tensor<1x256x10x10xf32>) -> tensor<1x256x10x10xf32>
-    %1073 = stablehlo.broadcast_in_dim %arg315, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x10x10xf32>
-    %1074 = stablehlo.subtract %1072, %1073 : tensor<1x256x10x10xf32>
-    %1075 = stablehlo.broadcast_in_dim %1074, dims = [0, 1, 2, 3] : (tensor<1x256x10x10xf32>) -> tensor<1x256x10x10xf32>
-    %1076 = stablehlo.broadcast_in_dim %arg316, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x10x10xf32>
-    %1077 = stablehlo.multiply %1075, %1076 : tensor<1x256x10x10xf32>
-    %1078 = stablehlo.broadcast_in_dim %1077, dims = [0, 1, 2, 3] : (tensor<1x256x10x10xf32>) -> tensor<1x256x10x10xf32>
-    %1079 = stablehlo.broadcast_in_dim %arg317, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x10x10xf32>
-    %1080 = stablehlo.multiply %1078, %1079 : tensor<1x256x10x10xf32>
-    %1081 = stablehlo.broadcast_in_dim %1080, dims = [0, 1, 2, 3] : (tensor<1x256x10x10xf32>) -> tensor<1x256x10x10xf32>
-    %1082 = stablehlo.broadcast_in_dim %arg318, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x10x10xf32>
-    %1083 = stablehlo.add %1081, %1082 : tensor<1x256x10x10xf32>
-    %1084 = stablehlo.convert %cst_0 : (tensor<f64>) -> tensor<f32>
-    %1085 = stablehlo.broadcast_in_dim %1083, dims = [0, 1, 2, 3] : (tensor<1x256x10x10xf32>) -> tensor<1x256x10x10xf32>
-    %1086 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x256x10x10xf32>
-    %1087 = stablehlo.maximum %1085, %1086 : tensor<1x256x10x10xf32>
-    %1088 = stablehlo.convert %cst : (tensor<f64>) -> tensor<f32>
-    %1089 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x256x10x10xf32>
-    %1090 = stablehlo.broadcast_in_dim %1087, dims = [0, 1, 2, 3] : (tensor<1x256x10x10xf32>) -> tensor<1x256x10x10xf32>
-    %1091 = stablehlo.minimum %1089, %1090 : tensor<1x256x10x10xf32>
-    %1092 = stablehlo.convolution(%1091, %arg80) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x10x10xf32>, tensor<256x1x3x3xf32>) -> tensor<1x256x5x5xf32>
-    %1093 = stablehlo.broadcast_in_dim %1092, dims = [0, 1, 2, 3] : (tensor<1x256x5x5xf32>) -> tensor<1x256x5x5xf32>
-    %1094 = stablehlo.broadcast_in_dim %arg319, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x5x5xf32>
-    %1095 = stablehlo.subtract %1093, %1094 : tensor<1x256x5x5xf32>
-    %1096 = stablehlo.broadcast_in_dim %1095, dims = [0, 1, 2, 3] : (tensor<1x256x5x5xf32>) -> tensor<1x256x5x5xf32>
-    %1097 = stablehlo.broadcast_in_dim %arg320, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x5x5xf32>
-    %1098 = stablehlo.multiply %1096, %1097 : tensor<1x256x5x5xf32>
-    %1099 = stablehlo.broadcast_in_dim %1098, dims = [0, 1, 2, 3] : (tensor<1x256x5x5xf32>) -> tensor<1x256x5x5xf32>
-    %1100 = stablehlo.broadcast_in_dim %arg321, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x5x5xf32>
-    %1101 = stablehlo.multiply %1099, %1100 : tensor<1x256x5x5xf32>
-    %1102 = stablehlo.broadcast_in_dim %1101, dims = [0, 1, 2, 3] : (tensor<1x256x5x5xf32>) -> tensor<1x256x5x5xf32>
-    %1103 = stablehlo.broadcast_in_dim %arg322, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x5x5xf32>
-    %1104 = stablehlo.add %1102, %1103 : tensor<1x256x5x5xf32>
-    %1105 = stablehlo.broadcast_in_dim %1104, dims = [0, 1, 2, 3] : (tensor<1x256x5x5xf32>) -> tensor<1x256x5x5xf32>
-    %1106 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x256x5x5xf32>
-    %1107 = stablehlo.maximum %1105, %1106 : tensor<1x256x5x5xf32>
-    %1108 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x256x5x5xf32>
-    %1109 = stablehlo.broadcast_in_dim %1107, dims = [0, 1, 2, 3] : (tensor<1x256x5x5xf32>) -> tensor<1x256x5x5xf32>
-    %1110 = stablehlo.minimum %1108, %1109 : tensor<1x256x5x5xf32>
-    %1111 = stablehlo.convolution(%1110, %arg81) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x5x5xf32>, tensor<512x256x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1112 = stablehlo.broadcast_in_dim %1111, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1113 = stablehlo.broadcast_in_dim %arg323, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1114 = stablehlo.subtract %1112, %1113 : tensor<1x512x5x5xf32>
-    %1115 = stablehlo.broadcast_in_dim %1114, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1116 = stablehlo.broadcast_in_dim %arg324, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1117 = stablehlo.multiply %1115, %1116 : tensor<1x512x5x5xf32>
-    %1118 = stablehlo.broadcast_in_dim %1117, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1119 = stablehlo.broadcast_in_dim %arg325, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1120 = stablehlo.multiply %1118, %1119 : tensor<1x512x5x5xf32>
-    %1121 = stablehlo.broadcast_in_dim %1120, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1122 = stablehlo.broadcast_in_dim %arg326, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1123 = stablehlo.add %1121, %1122 : tensor<1x512x5x5xf32>
-    %1124 = stablehlo.broadcast_in_dim %1123, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1125 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x512x5x5xf32>
-    %1126 = stablehlo.maximum %1124, %1125 : tensor<1x512x5x5xf32>
-    %1127 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x512x5x5xf32>
-    %1128 = stablehlo.broadcast_in_dim %1126, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1129 = stablehlo.minimum %1127, %1128 : tensor<1x512x5x5xf32>
-    %1130 = stablehlo.convolution(%1129, %arg82) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x5x5xf32>, tensor<128x512x1x1xf32>) -> tensor<1x128x5x5xf32>
-    %1131 = stablehlo.broadcast_in_dim %1130, dims = [0, 1, 2, 3] : (tensor<1x128x5x5xf32>) -> tensor<1x128x5x5xf32>
-    %1132 = stablehlo.broadcast_in_dim %arg327, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x5x5xf32>
-    %1133 = stablehlo.subtract %1131, %1132 : tensor<1x128x5x5xf32>
-    %1134 = stablehlo.broadcast_in_dim %1133, dims = [0, 1, 2, 3] : (tensor<1x128x5x5xf32>) -> tensor<1x128x5x5xf32>
-    %1135 = stablehlo.broadcast_in_dim %arg328, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x5x5xf32>
-    %1136 = stablehlo.multiply %1134, %1135 : tensor<1x128x5x5xf32>
-    %1137 = stablehlo.broadcast_in_dim %1136, dims = [0, 1, 2, 3] : (tensor<1x128x5x5xf32>) -> tensor<1x128x5x5xf32>
-    %1138 = stablehlo.broadcast_in_dim %arg329, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x5x5xf32>
-    %1139 = stablehlo.multiply %1137, %1138 : tensor<1x128x5x5xf32>
-    %1140 = stablehlo.broadcast_in_dim %1139, dims = [0, 1, 2, 3] : (tensor<1x128x5x5xf32>) -> tensor<1x128x5x5xf32>
-    %1141 = stablehlo.broadcast_in_dim %arg330, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x5x5xf32>
-    %1142 = stablehlo.add %1140, %1141 : tensor<1x128x5x5xf32>
-    %1143 = stablehlo.broadcast_in_dim %1142, dims = [0, 1, 2, 3] : (tensor<1x128x5x5xf32>) -> tensor<1x128x5x5xf32>
-    %1144 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x128x5x5xf32>
-    %1145 = stablehlo.maximum %1143, %1144 : tensor<1x128x5x5xf32>
-    %1146 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x128x5x5xf32>
-    %1147 = stablehlo.broadcast_in_dim %1145, dims = [0, 1, 2, 3] : (tensor<1x128x5x5xf32>) -> tensor<1x128x5x5xf32>
-    %1148 = stablehlo.minimum %1146, %1147 : tensor<1x128x5x5xf32>
-    %1149 = stablehlo.convolution(%1148, %arg83) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x5x5xf32>, tensor<128x1x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1150 = stablehlo.broadcast_in_dim %1149, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1151 = stablehlo.broadcast_in_dim %arg331, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1152 = stablehlo.subtract %1150, %1151 : tensor<1x128x3x3xf32>
-    %1153 = stablehlo.broadcast_in_dim %1152, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1154 = stablehlo.broadcast_in_dim %arg332, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1155 = stablehlo.multiply %1153, %1154 : tensor<1x128x3x3xf32>
-    %1156 = stablehlo.broadcast_in_dim %1155, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1157 = stablehlo.broadcast_in_dim %arg333, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1158 = stablehlo.multiply %1156, %1157 : tensor<1x128x3x3xf32>
-    %1159 = stablehlo.broadcast_in_dim %1158, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1160 = stablehlo.broadcast_in_dim %arg334, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1161 = stablehlo.add %1159, %1160 : tensor<1x128x3x3xf32>
-    %1162 = stablehlo.broadcast_in_dim %1161, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1163 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x128x3x3xf32>
-    %1164 = stablehlo.maximum %1162, %1163 : tensor<1x128x3x3xf32>
-    %1165 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x128x3x3xf32>
-    %1166 = stablehlo.broadcast_in_dim %1164, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1167 = stablehlo.minimum %1165, %1166 : tensor<1x128x3x3xf32>
-    %1168 = stablehlo.convolution(%1167, %arg84) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x3x3xf32>, tensor<256x128x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1169 = stablehlo.broadcast_in_dim %1168, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1170 = stablehlo.broadcast_in_dim %arg335, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1171 = stablehlo.subtract %1169, %1170 : tensor<1x256x3x3xf32>
-    %1172 = stablehlo.broadcast_in_dim %1171, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1173 = stablehlo.broadcast_in_dim %arg336, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1174 = stablehlo.multiply %1172, %1173 : tensor<1x256x3x3xf32>
-    %1175 = stablehlo.broadcast_in_dim %1174, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1176 = stablehlo.broadcast_in_dim %arg337, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1177 = stablehlo.multiply %1175, %1176 : tensor<1x256x3x3xf32>
-    %1178 = stablehlo.broadcast_in_dim %1177, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1179 = stablehlo.broadcast_in_dim %arg338, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1180 = stablehlo.add %1178, %1179 : tensor<1x256x3x3xf32>
-    %1181 = stablehlo.broadcast_in_dim %1180, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1182 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x256x3x3xf32>
-    %1183 = stablehlo.maximum %1181, %1182 : tensor<1x256x3x3xf32>
-    %1184 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x256x3x3xf32>
-    %1185 = stablehlo.broadcast_in_dim %1183, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1186 = stablehlo.minimum %1184, %1185 : tensor<1x256x3x3xf32>
-    %1187 = stablehlo.convolution(%1186, %arg85) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x3x3xf32>, tensor<128x256x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1188 = stablehlo.broadcast_in_dim %1187, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1189 = stablehlo.broadcast_in_dim %arg339, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1190 = stablehlo.subtract %1188, %1189 : tensor<1x128x3x3xf32>
-    %1191 = stablehlo.broadcast_in_dim %1190, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1192 = stablehlo.broadcast_in_dim %arg340, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1193 = stablehlo.multiply %1191, %1192 : tensor<1x128x3x3xf32>
-    %1194 = stablehlo.broadcast_in_dim %1193, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1195 = stablehlo.broadcast_in_dim %arg341, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1196 = stablehlo.multiply %1194, %1195 : tensor<1x128x3x3xf32>
-    %1197 = stablehlo.broadcast_in_dim %1196, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1198 = stablehlo.broadcast_in_dim %arg342, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x3x3xf32>
-    %1199 = stablehlo.add %1197, %1198 : tensor<1x128x3x3xf32>
-    %1200 = stablehlo.broadcast_in_dim %1199, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1201 = stablehlo.maximum %1200, %1163 : tensor<1x128x3x3xf32>
-    %1202 = stablehlo.broadcast_in_dim %1201, dims = [0, 1, 2, 3] : (tensor<1x128x3x3xf32>) -> tensor<1x128x3x3xf32>
-    %1203 = stablehlo.minimum %1165, %1202 : tensor<1x128x3x3xf32>
-    %1204 = stablehlo.convolution(%1203, %arg86) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x3x3xf32>, tensor<128x1x3x3xf32>) -> tensor<1x128x2x2xf32>
-    %1205 = stablehlo.broadcast_in_dim %1204, dims = [0, 1, 2, 3] : (tensor<1x128x2x2xf32>) -> tensor<1x128x2x2xf32>
-    %1206 = stablehlo.broadcast_in_dim %arg343, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x2x2xf32>
-    %1207 = stablehlo.subtract %1205, %1206 : tensor<1x128x2x2xf32>
-    %1208 = stablehlo.broadcast_in_dim %1207, dims = [0, 1, 2, 3] : (tensor<1x128x2x2xf32>) -> tensor<1x128x2x2xf32>
-    %1209 = stablehlo.broadcast_in_dim %arg344, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x2x2xf32>
-    %1210 = stablehlo.multiply %1208, %1209 : tensor<1x128x2x2xf32>
-    %1211 = stablehlo.broadcast_in_dim %1210, dims = [0, 1, 2, 3] : (tensor<1x128x2x2xf32>) -> tensor<1x128x2x2xf32>
-    %1212 = stablehlo.broadcast_in_dim %arg345, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x2x2xf32>
-    %1213 = stablehlo.multiply %1211, %1212 : tensor<1x128x2x2xf32>
-    %1214 = stablehlo.broadcast_in_dim %1213, dims = [0, 1, 2, 3] : (tensor<1x128x2x2xf32>) -> tensor<1x128x2x2xf32>
-    %1215 = stablehlo.broadcast_in_dim %arg346, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x2x2xf32>
-    %1216 = stablehlo.add %1214, %1215 : tensor<1x128x2x2xf32>
-    %1217 = stablehlo.broadcast_in_dim %1216, dims = [0, 1, 2, 3] : (tensor<1x128x2x2xf32>) -> tensor<1x128x2x2xf32>
-    %1218 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x128x2x2xf32>
-    %1219 = stablehlo.maximum %1217, %1218 : tensor<1x128x2x2xf32>
-    %1220 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x128x2x2xf32>
-    %1221 = stablehlo.broadcast_in_dim %1219, dims = [0, 1, 2, 3] : (tensor<1x128x2x2xf32>) -> tensor<1x128x2x2xf32>
-    %1222 = stablehlo.minimum %1220, %1221 : tensor<1x128x2x2xf32>
-    %1223 = stablehlo.convolution(%1222, %arg87) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x2x2xf32>, tensor<256x128x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1224 = stablehlo.broadcast_in_dim %1223, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1225 = stablehlo.broadcast_in_dim %arg347, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1226 = stablehlo.subtract %1224, %1225 : tensor<1x256x2x2xf32>
-    %1227 = stablehlo.broadcast_in_dim %1226, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1228 = stablehlo.broadcast_in_dim %arg348, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1229 = stablehlo.multiply %1227, %1228 : tensor<1x256x2x2xf32>
-    %1230 = stablehlo.broadcast_in_dim %1229, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1231 = stablehlo.broadcast_in_dim %arg349, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1232 = stablehlo.multiply %1230, %1231 : tensor<1x256x2x2xf32>
-    %1233 = stablehlo.broadcast_in_dim %1232, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1234 = stablehlo.broadcast_in_dim %arg350, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1235 = stablehlo.add %1233, %1234 : tensor<1x256x2x2xf32>
-    %1236 = stablehlo.broadcast_in_dim %1235, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1237 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x256x2x2xf32>
-    %1238 = stablehlo.maximum %1236, %1237 : tensor<1x256x2x2xf32>
-    %1239 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x256x2x2xf32>
-    %1240 = stablehlo.broadcast_in_dim %1238, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1241 = stablehlo.minimum %1239, %1240 : tensor<1x256x2x2xf32>
-    %1242 = stablehlo.convolution(%1241, %arg88) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x2x2xf32>, tensor<64x256x1x1xf32>) -> tensor<1x64x2x2xf32>
-    %1243 = stablehlo.broadcast_in_dim %1242, dims = [0, 1, 2, 3] : (tensor<1x64x2x2xf32>) -> tensor<1x64x2x2xf32>
-    %1244 = stablehlo.broadcast_in_dim %arg351, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x2x2xf32>
-    %1245 = stablehlo.subtract %1243, %1244 : tensor<1x64x2x2xf32>
-    %1246 = stablehlo.broadcast_in_dim %1245, dims = [0, 1, 2, 3] : (tensor<1x64x2x2xf32>) -> tensor<1x64x2x2xf32>
-    %1247 = stablehlo.broadcast_in_dim %arg352, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x2x2xf32>
-    %1248 = stablehlo.multiply %1246, %1247 : tensor<1x64x2x2xf32>
-    %1249 = stablehlo.broadcast_in_dim %1248, dims = [0, 1, 2, 3] : (tensor<1x64x2x2xf32>) -> tensor<1x64x2x2xf32>
-    %1250 = stablehlo.broadcast_in_dim %arg353, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x2x2xf32>
-    %1251 = stablehlo.multiply %1249, %1250 : tensor<1x64x2x2xf32>
-    %1252 = stablehlo.broadcast_in_dim %1251, dims = [0, 1, 2, 3] : (tensor<1x64x2x2xf32>) -> tensor<1x64x2x2xf32>
-    %1253 = stablehlo.broadcast_in_dim %arg354, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x2x2xf32>
-    %1254 = stablehlo.add %1252, %1253 : tensor<1x64x2x2xf32>
-    %1255 = stablehlo.broadcast_in_dim %1254, dims = [0, 1, 2, 3] : (tensor<1x64x2x2xf32>) -> tensor<1x64x2x2xf32>
-    %1256 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x64x2x2xf32>
-    %1257 = stablehlo.maximum %1255, %1256 : tensor<1x64x2x2xf32>
-    %1258 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x64x2x2xf32>
-    %1259 = stablehlo.broadcast_in_dim %1257, dims = [0, 1, 2, 3] : (tensor<1x64x2x2xf32>) -> tensor<1x64x2x2xf32>
-    %1260 = stablehlo.minimum %1258, %1259 : tensor<1x64x2x2xf32>
-    %1261 = stablehlo.convolution(%1260, %arg89) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 64 : i64} : (tensor<1x64x2x2xf32>, tensor<64x1x3x3xf32>) -> tensor<1x64x1x1xf32>
-    %1262 = stablehlo.broadcast_in_dim %1261, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1263 = stablehlo.broadcast_in_dim %arg355, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1264 = stablehlo.subtract %1262, %1263 : tensor<1x64x1x1xf32>
-    %1265 = stablehlo.broadcast_in_dim %1264, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1266 = stablehlo.broadcast_in_dim %arg356, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1267 = stablehlo.multiply %1265, %1266 : tensor<1x64x1x1xf32>
-    %1268 = stablehlo.broadcast_in_dim %1267, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1269 = stablehlo.broadcast_in_dim %arg357, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1270 = stablehlo.multiply %1268, %1269 : tensor<1x64x1x1xf32>
-    %1271 = stablehlo.broadcast_in_dim %1270, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1272 = stablehlo.broadcast_in_dim %arg358, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1273 = stablehlo.add %1271, %1272 : tensor<1x64x1x1xf32>
-    %1274 = stablehlo.broadcast_in_dim %1273, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1275 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x64x1x1xf32>
-    %1276 = stablehlo.maximum %1274, %1275 : tensor<1x64x1x1xf32>
-    %1277 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x64x1x1xf32>
-    %1278 = stablehlo.broadcast_in_dim %1276, dims = [0, 1, 2, 3] : (tensor<1x64x1x1xf32>) -> tensor<1x64x1x1xf32>
-    %1279 = stablehlo.minimum %1277, %1278 : tensor<1x64x1x1xf32>
-    %1280 = stablehlo.convolution(%1279, %arg90) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x1x1xf32>, tensor<128x64x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1281 = stablehlo.broadcast_in_dim %1280, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1282 = stablehlo.broadcast_in_dim %arg359, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1283 = stablehlo.subtract %1281, %1282 : tensor<1x128x1x1xf32>
-    %1284 = stablehlo.broadcast_in_dim %1283, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1285 = stablehlo.broadcast_in_dim %arg360, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1286 = stablehlo.multiply %1284, %1285 : tensor<1x128x1x1xf32>
-    %1287 = stablehlo.broadcast_in_dim %1286, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1288 = stablehlo.broadcast_in_dim %arg361, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1289 = stablehlo.multiply %1287, %1288 : tensor<1x128x1x1xf32>
-    %1290 = stablehlo.broadcast_in_dim %1289, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1291 = stablehlo.broadcast_in_dim %arg362, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1292 = stablehlo.add %1290, %1291 : tensor<1x128x1x1xf32>
-    %1293 = stablehlo.broadcast_in_dim %1292, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1294 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x128x1x1xf32>
-    %1295 = stablehlo.maximum %1293, %1294 : tensor<1x128x1x1xf32>
-    %1296 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x128x1x1xf32>
-    %1297 = stablehlo.broadcast_in_dim %1295, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1298 = stablehlo.minimum %1296, %1297 : tensor<1x128x1x1xf32>
-    %1299 = stablehlo.convolution(%815, %arg91) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 672 : i64} : (tensor<1x672x20x20xf32>, tensor<672x1x3x3xf32>) -> tensor<1x672x20x20xf32>
-    %1300 = stablehlo.broadcast_in_dim %1299, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1301 = stablehlo.broadcast_in_dim %arg363, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1302 = stablehlo.subtract %1300, %1301 : tensor<1x672x20x20xf32>
-    %1303 = stablehlo.broadcast_in_dim %1302, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1304 = stablehlo.broadcast_in_dim %arg364, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1305 = stablehlo.multiply %1303, %1304 : tensor<1x672x20x20xf32>
-    %1306 = stablehlo.broadcast_in_dim %1305, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1307 = stablehlo.broadcast_in_dim %arg365, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1308 = stablehlo.multiply %1306, %1307 : tensor<1x672x20x20xf32>
-    %1309 = stablehlo.broadcast_in_dim %1308, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1310 = stablehlo.broadcast_in_dim %arg366, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1311 = stablehlo.add %1309, %1310 : tensor<1x672x20x20xf32>
-    %1312 = stablehlo.broadcast_in_dim %1311, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1313 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x672x20x20xf32>
-    %1314 = stablehlo.maximum %1312, %1313 : tensor<1x672x20x20xf32>
-    %1315 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x672x20x20xf32>
-    %1316 = stablehlo.broadcast_in_dim %1314, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1317 = stablehlo.minimum %1315, %1316 : tensor<1x672x20x20xf32>
-    %1318 = stablehlo.convolution(%1317, %arg92) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x672x20x20xf32>, tensor<24x672x1x1xf32>) -> tensor<1x24x20x20xf32>
-    %1319 = stablehlo.reshape %arg93 : (tensor<24xf32>) -> tensor<24x1x1xf32>
-    %1320 = stablehlo.broadcast_in_dim %1318, dims = [0, 1, 2, 3] : (tensor<1x24x20x20xf32>) -> tensor<1x24x20x20xf32>
-    %1321 = stablehlo.broadcast_in_dim %1319, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x20x20xf32>
-    %1322 = stablehlo.add %1320, %1321 : tensor<1x24x20x20xf32>
-    %1323 = stablehlo.reshape %1322 : (tensor<1x24x20x20xf32>) -> tensor<1x6x4x20x20xf32>
-    %1324 = stablehlo.transpose %1323, dims = [0, 3, 4, 1, 2] : (tensor<1x6x4x20x20xf32>) -> tensor<1x20x20x6x4xf32>
-    %1325 = stablehlo.reshape %1324 : (tensor<1x20x20x6x4xf32>) -> tensor<1x2400x4xf32>
-    %1326 = stablehlo.convolution(%1070, %arg94) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 480 : i64} : (tensor<1x480x10x10xf32>, tensor<480x1x3x3xf32>) -> tensor<1x480x10x10xf32>
-    %1327 = stablehlo.broadcast_in_dim %1326, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1328 = stablehlo.broadcast_in_dim %arg367, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1329 = stablehlo.subtract %1327, %1328 : tensor<1x480x10x10xf32>
-    %1330 = stablehlo.broadcast_in_dim %1329, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1331 = stablehlo.broadcast_in_dim %arg368, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1332 = stablehlo.multiply %1330, %1331 : tensor<1x480x10x10xf32>
-    %1333 = stablehlo.broadcast_in_dim %1332, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1334 = stablehlo.broadcast_in_dim %arg369, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1335 = stablehlo.multiply %1333, %1334 : tensor<1x480x10x10xf32>
-    %1336 = stablehlo.broadcast_in_dim %1335, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1337 = stablehlo.broadcast_in_dim %arg370, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1338 = stablehlo.add %1336, %1337 : tensor<1x480x10x10xf32>
-    %1339 = stablehlo.broadcast_in_dim %1338, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1340 = stablehlo.broadcast_in_dim %1084, dims = [] : (tensor<f32>) -> tensor<1x480x10x10xf32>
-    %1341 = stablehlo.maximum %1339, %1340 : tensor<1x480x10x10xf32>
-    %1342 = stablehlo.broadcast_in_dim %1088, dims = [] : (tensor<f32>) -> tensor<1x480x10x10xf32>
-    %1343 = stablehlo.broadcast_in_dim %1341, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1344 = stablehlo.minimum %1342, %1343 : tensor<1x480x10x10xf32>
-    %1345 = stablehlo.convolution(%1344, %arg95) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x10x10xf32>, tensor<24x480x1x1xf32>) -> tensor<1x24x10x10xf32>
-    %1346 = stablehlo.reshape %arg96 : (tensor<24xf32>) -> tensor<24x1x1xf32>
-    %1347 = stablehlo.broadcast_in_dim %1345, dims = [0, 1, 2, 3] : (tensor<1x24x10x10xf32>) -> tensor<1x24x10x10xf32>
-    %1348 = stablehlo.broadcast_in_dim %1346, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x10x10xf32>
-    %1349 = stablehlo.add %1347, %1348 : tensor<1x24x10x10xf32>
-    %1350 = stablehlo.reshape %1349 : (tensor<1x24x10x10xf32>) -> tensor<1x6x4x10x10xf32>
-    %1351 = stablehlo.transpose %1350, dims = [0, 3, 4, 1, 2] : (tensor<1x6x4x10x10xf32>) -> tensor<1x10x10x6x4xf32>
-    %1352 = stablehlo.reshape %1351 : (tensor<1x10x10x6x4xf32>) -> tensor<1x600x4xf32>
-    %1353 = stablehlo.convolution(%1129, %arg97) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x5x5xf32>, tensor<512x1x3x3xf32>) -> tensor<1x512x5x5xf32>
-    %1354 = stablehlo.broadcast_in_dim %1353, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1355 = stablehlo.broadcast_in_dim %arg371, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1356 = stablehlo.subtract %1354, %1355 : tensor<1x512x5x5xf32>
-    %1357 = stablehlo.broadcast_in_dim %1356, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1358 = stablehlo.broadcast_in_dim %arg372, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1359 = stablehlo.multiply %1357, %1358 : tensor<1x512x5x5xf32>
-    %1360 = stablehlo.broadcast_in_dim %1359, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1361 = stablehlo.broadcast_in_dim %arg373, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1362 = stablehlo.multiply %1360, %1361 : tensor<1x512x5x5xf32>
-    %1363 = stablehlo.broadcast_in_dim %1362, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1364 = stablehlo.broadcast_in_dim %arg374, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1365 = stablehlo.add %1363, %1364 : tensor<1x512x5x5xf32>
-    %1366 = stablehlo.broadcast_in_dim %1365, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1367 = stablehlo.maximum %1366, %1125 : tensor<1x512x5x5xf32>
-    %1368 = stablehlo.broadcast_in_dim %1367, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1369 = stablehlo.minimum %1127, %1368 : tensor<1x512x5x5xf32>
-    %1370 = stablehlo.convolution(%1369, %arg98) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x5x5xf32>, tensor<24x512x1x1xf32>) -> tensor<1x24x5x5xf32>
-    %1371 = stablehlo.reshape %arg99 : (tensor<24xf32>) -> tensor<24x1x1xf32>
-    %1372 = stablehlo.broadcast_in_dim %1370, dims = [0, 1, 2, 3] : (tensor<1x24x5x5xf32>) -> tensor<1x24x5x5xf32>
-    %1373 = stablehlo.broadcast_in_dim %1371, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x5x5xf32>
-    %1374 = stablehlo.add %1372, %1373 : tensor<1x24x5x5xf32>
-    %1375 = stablehlo.reshape %1374 : (tensor<1x24x5x5xf32>) -> tensor<1x6x4x5x5xf32>
-    %1376 = stablehlo.transpose %1375, dims = [0, 3, 4, 1, 2] : (tensor<1x6x4x5x5xf32>) -> tensor<1x5x5x6x4xf32>
-    %1377 = stablehlo.reshape %1376 : (tensor<1x5x5x6x4xf32>) -> tensor<1x150x4xf32>
-    %1378 = stablehlo.convolution(%1186, %arg100) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x3x3xf32>, tensor<256x1x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1379 = stablehlo.broadcast_in_dim %1378, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1380 = stablehlo.broadcast_in_dim %arg375, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1381 = stablehlo.subtract %1379, %1380 : tensor<1x256x3x3xf32>
-    %1382 = stablehlo.broadcast_in_dim %1381, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1383 = stablehlo.broadcast_in_dim %arg376, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1384 = stablehlo.multiply %1382, %1383 : tensor<1x256x3x3xf32>
-    %1385 = stablehlo.broadcast_in_dim %1384, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1386 = stablehlo.broadcast_in_dim %arg377, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1387 = stablehlo.multiply %1385, %1386 : tensor<1x256x3x3xf32>
-    %1388 = stablehlo.broadcast_in_dim %1387, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1389 = stablehlo.broadcast_in_dim %arg378, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1390 = stablehlo.add %1388, %1389 : tensor<1x256x3x3xf32>
-    %1391 = stablehlo.broadcast_in_dim %1390, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1392 = stablehlo.maximum %1391, %1182 : tensor<1x256x3x3xf32>
-    %1393 = stablehlo.broadcast_in_dim %1392, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1394 = stablehlo.minimum %1184, %1393 : tensor<1x256x3x3xf32>
-    %1395 = stablehlo.convolution(%1394, %arg101) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x3x3xf32>, tensor<24x256x1x1xf32>) -> tensor<1x24x3x3xf32>
-    %1396 = stablehlo.reshape %arg102 : (tensor<24xf32>) -> tensor<24x1x1xf32>
-    %1397 = stablehlo.broadcast_in_dim %1395, dims = [0, 1, 2, 3] : (tensor<1x24x3x3xf32>) -> tensor<1x24x3x3xf32>
-    %1398 = stablehlo.broadcast_in_dim %1396, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x3x3xf32>
-    %1399 = stablehlo.add %1397, %1398 : tensor<1x24x3x3xf32>
-    %1400 = stablehlo.reshape %1399 : (tensor<1x24x3x3xf32>) -> tensor<1x6x4x3x3xf32>
-    %1401 = stablehlo.transpose %1400, dims = [0, 3, 4, 1, 2] : (tensor<1x6x4x3x3xf32>) -> tensor<1x3x3x6x4xf32>
-    %1402 = stablehlo.reshape %1401 : (tensor<1x3x3x6x4xf32>) -> tensor<1x54x4xf32>
-    %1403 = stablehlo.convolution(%1241, %arg103) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x2x2xf32>, tensor<256x1x3x3xf32>) -> tensor<1x256x2x2xf32>
-    %1404 = stablehlo.broadcast_in_dim %1403, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1405 = stablehlo.broadcast_in_dim %arg379, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1406 = stablehlo.subtract %1404, %1405 : tensor<1x256x2x2xf32>
-    %1407 = stablehlo.broadcast_in_dim %1406, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1408 = stablehlo.broadcast_in_dim %arg380, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1409 = stablehlo.multiply %1407, %1408 : tensor<1x256x2x2xf32>
-    %1410 = stablehlo.broadcast_in_dim %1409, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1411 = stablehlo.broadcast_in_dim %arg381, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1412 = stablehlo.multiply %1410, %1411 : tensor<1x256x2x2xf32>
-    %1413 = stablehlo.broadcast_in_dim %1412, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1414 = stablehlo.broadcast_in_dim %arg382, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1415 = stablehlo.add %1413, %1414 : tensor<1x256x2x2xf32>
-    %1416 = stablehlo.broadcast_in_dim %1415, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1417 = stablehlo.maximum %1416, %1237 : tensor<1x256x2x2xf32>
-    %1418 = stablehlo.broadcast_in_dim %1417, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1419 = stablehlo.minimum %1239, %1418 : tensor<1x256x2x2xf32>
-    %1420 = stablehlo.convolution(%1419, %arg104) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x2x2xf32>, tensor<24x256x1x1xf32>) -> tensor<1x24x2x2xf32>
-    %1421 = stablehlo.reshape %arg105 : (tensor<24xf32>) -> tensor<24x1x1xf32>
-    %1422 = stablehlo.broadcast_in_dim %1420, dims = [0, 1, 2, 3] : (tensor<1x24x2x2xf32>) -> tensor<1x24x2x2xf32>
-    %1423 = stablehlo.broadcast_in_dim %1421, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x2x2xf32>
-    %1424 = stablehlo.add %1422, %1423 : tensor<1x24x2x2xf32>
-    %1425 = stablehlo.reshape %1424 : (tensor<1x24x2x2xf32>) -> tensor<1x6x4x2x2xf32>
-    %1426 = stablehlo.transpose %1425, dims = [0, 3, 4, 1, 2] : (tensor<1x6x4x2x2xf32>) -> tensor<1x2x2x6x4xf32>
-    %1427 = stablehlo.reshape %1426 : (tensor<1x2x2x6x4xf32>) -> tensor<1x24x4xf32>
-    %1428 = stablehlo.convolution(%1298, %arg106) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x1x1xf32>, tensor<128x1x3x3xf32>) -> tensor<1x128x1x1xf32>
-    %1429 = stablehlo.broadcast_in_dim %1428, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1430 = stablehlo.broadcast_in_dim %arg383, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1431 = stablehlo.subtract %1429, %1430 : tensor<1x128x1x1xf32>
-    %1432 = stablehlo.broadcast_in_dim %1431, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1433 = stablehlo.broadcast_in_dim %arg384, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1434 = stablehlo.multiply %1432, %1433 : tensor<1x128x1x1xf32>
-    %1435 = stablehlo.broadcast_in_dim %1434, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1436 = stablehlo.broadcast_in_dim %arg385, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1437 = stablehlo.multiply %1435, %1436 : tensor<1x128x1x1xf32>
-    %1438 = stablehlo.broadcast_in_dim %1437, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1439 = stablehlo.broadcast_in_dim %arg386, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1440 = stablehlo.add %1438, %1439 : tensor<1x128x1x1xf32>
-    %1441 = stablehlo.broadcast_in_dim %1440, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1442 = stablehlo.maximum %1441, %1294 : tensor<1x128x1x1xf32>
-    %1443 = stablehlo.broadcast_in_dim %1442, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1444 = stablehlo.minimum %1296, %1443 : tensor<1x128x1x1xf32>
-    %1445 = stablehlo.convolution(%1444, %arg107) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x1x1xf32>, tensor<24x128x1x1xf32>) -> tensor<1x24x1x1xf32>
-    %1446 = stablehlo.reshape %arg108 : (tensor<24xf32>) -> tensor<24x1x1xf32>
-    %1447 = stablehlo.broadcast_in_dim %1445, dims = [0, 1, 2, 3] : (tensor<1x24x1x1xf32>) -> tensor<1x24x1x1xf32>
-    %1448 = stablehlo.broadcast_in_dim %1446, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x1x1xf32>
-    %1449 = stablehlo.add %1447, %1448 : tensor<1x24x1x1xf32>
-    %1450 = stablehlo.reshape %1449 : (tensor<1x24x1x1xf32>) -> tensor<1x6x4x1x1xf32>
-    %1451 = stablehlo.transpose %1450, dims = [0, 3, 4, 1, 2] : (tensor<1x6x4x1x1xf32>) -> tensor<1x1x1x6x4xf32>
-    %1452 = stablehlo.reshape %1451 : (tensor<1x1x1x6x4xf32>) -> tensor<1x6x4xf32>
-    %1453 = stablehlo.concatenate %1325, %1352, %1377, %1402, %1427, %1452, dim = 1 : (tensor<1x2400x4xf32>, tensor<1x600x4xf32>, tensor<1x150x4xf32>, tensor<1x54x4xf32>, tensor<1x24x4xf32>, tensor<1x6x4xf32>) -> tensor<1x3234x4xf32>
-    %1454 = stablehlo.convolution(%815, %arg109) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 672 : i64} : (tensor<1x672x20x20xf32>, tensor<672x1x3x3xf32>) -> tensor<1x672x20x20xf32>
-    %1455 = stablehlo.broadcast_in_dim %1454, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1456 = stablehlo.broadcast_in_dim %arg387, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1457 = stablehlo.subtract %1455, %1456 : tensor<1x672x20x20xf32>
-    %1458 = stablehlo.broadcast_in_dim %1457, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1459 = stablehlo.broadcast_in_dim %arg388, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1460 = stablehlo.multiply %1458, %1459 : tensor<1x672x20x20xf32>
-    %1461 = stablehlo.broadcast_in_dim %1460, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1462 = stablehlo.broadcast_in_dim %arg389, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1463 = stablehlo.multiply %1461, %1462 : tensor<1x672x20x20xf32>
-    %1464 = stablehlo.broadcast_in_dim %1463, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1465 = stablehlo.broadcast_in_dim %arg390, dims = [1, 2, 3] : (tensor<672x1x1xf32>) -> tensor<1x672x20x20xf32>
-    %1466 = stablehlo.add %1464, %1465 : tensor<1x672x20x20xf32>
-    %1467 = stablehlo.broadcast_in_dim %1466, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1468 = stablehlo.maximum %1467, %1313 : tensor<1x672x20x20xf32>
-    %1469 = stablehlo.broadcast_in_dim %1468, dims = [0, 1, 2, 3] : (tensor<1x672x20x20xf32>) -> tensor<1x672x20x20xf32>
-    %1470 = stablehlo.minimum %1315, %1469 : tensor<1x672x20x20xf32>
-    %1471 = stablehlo.convolution(%1470, %arg110) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x672x20x20xf32>, tensor<546x672x1x1xf32>) -> tensor<1x546x20x20xf32>
-    %1472 = stablehlo.reshape %arg111 : (tensor<546xf32>) -> tensor<546x1x1xf32>
-    %1473 = stablehlo.broadcast_in_dim %1471, dims = [0, 1, 2, 3] : (tensor<1x546x20x20xf32>) -> tensor<1x546x20x20xf32>
-    %1474 = stablehlo.broadcast_in_dim %1472, dims = [1, 2, 3] : (tensor<546x1x1xf32>) -> tensor<1x546x20x20xf32>
-    %1475 = stablehlo.add %1473, %1474 : tensor<1x546x20x20xf32>
-    %1476 = stablehlo.reshape %1475 : (tensor<1x546x20x20xf32>) -> tensor<1x6x91x20x20xf32>
-    %1477 = stablehlo.transpose %1476, dims = [0, 3, 4, 1, 2] : (tensor<1x6x91x20x20xf32>) -> tensor<1x20x20x6x91xf32>
-    %1478 = stablehlo.reshape %1477 : (tensor<1x20x20x6x91xf32>) -> tensor<1x2400x91xf32>
-    %1479 = stablehlo.convolution(%1070, %arg112) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 480 : i64} : (tensor<1x480x10x10xf32>, tensor<480x1x3x3xf32>) -> tensor<1x480x10x10xf32>
-    %1480 = stablehlo.broadcast_in_dim %1479, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1481 = stablehlo.broadcast_in_dim %arg391, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1482 = stablehlo.subtract %1480, %1481 : tensor<1x480x10x10xf32>
-    %1483 = stablehlo.broadcast_in_dim %1482, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1484 = stablehlo.broadcast_in_dim %arg392, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1485 = stablehlo.multiply %1483, %1484 : tensor<1x480x10x10xf32>
-    %1486 = stablehlo.broadcast_in_dim %1485, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1487 = stablehlo.broadcast_in_dim %arg393, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1488 = stablehlo.multiply %1486, %1487 : tensor<1x480x10x10xf32>
-    %1489 = stablehlo.broadcast_in_dim %1488, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1490 = stablehlo.broadcast_in_dim %arg394, dims = [1, 2, 3] : (tensor<480x1x1xf32>) -> tensor<1x480x10x10xf32>
-    %1491 = stablehlo.add %1489, %1490 : tensor<1x480x10x10xf32>
-    %1492 = stablehlo.broadcast_in_dim %1491, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1493 = stablehlo.maximum %1492, %1340 : tensor<1x480x10x10xf32>
-    %1494 = stablehlo.broadcast_in_dim %1493, dims = [0, 1, 2, 3] : (tensor<1x480x10x10xf32>) -> tensor<1x480x10x10xf32>
-    %1495 = stablehlo.minimum %1342, %1494 : tensor<1x480x10x10xf32>
-    %1496 = stablehlo.convolution(%1495, %arg113) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x480x10x10xf32>, tensor<546x480x1x1xf32>) -> tensor<1x546x10x10xf32>
-    %1497 = stablehlo.reshape %arg114 : (tensor<546xf32>) -> tensor<546x1x1xf32>
-    %1498 = stablehlo.broadcast_in_dim %1496, dims = [0, 1, 2, 3] : (tensor<1x546x10x10xf32>) -> tensor<1x546x10x10xf32>
-    %1499 = stablehlo.broadcast_in_dim %1497, dims = [1, 2, 3] : (tensor<546x1x1xf32>) -> tensor<1x546x10x10xf32>
-    %1500 = stablehlo.add %1498, %1499 : tensor<1x546x10x10xf32>
-    %1501 = stablehlo.reshape %1500 : (tensor<1x546x10x10xf32>) -> tensor<1x6x91x10x10xf32>
-    %1502 = stablehlo.transpose %1501, dims = [0, 3, 4, 1, 2] : (tensor<1x6x91x10x10xf32>) -> tensor<1x10x10x6x91xf32>
-    %1503 = stablehlo.reshape %1502 : (tensor<1x10x10x6x91xf32>) -> tensor<1x600x91xf32>
-    %1504 = stablehlo.convolution(%1129, %arg115) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x5x5xf32>, tensor<512x1x3x3xf32>) -> tensor<1x512x5x5xf32>
-    %1505 = stablehlo.broadcast_in_dim %1504, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1506 = stablehlo.broadcast_in_dim %arg395, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1507 = stablehlo.subtract %1505, %1506 : tensor<1x512x5x5xf32>
-    %1508 = stablehlo.broadcast_in_dim %1507, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1509 = stablehlo.broadcast_in_dim %arg396, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1510 = stablehlo.multiply %1508, %1509 : tensor<1x512x5x5xf32>
-    %1511 = stablehlo.broadcast_in_dim %1510, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1512 = stablehlo.broadcast_in_dim %arg397, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1513 = stablehlo.multiply %1511, %1512 : tensor<1x512x5x5xf32>
-    %1514 = stablehlo.broadcast_in_dim %1513, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1515 = stablehlo.broadcast_in_dim %arg398, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x5x5xf32>
-    %1516 = stablehlo.add %1514, %1515 : tensor<1x512x5x5xf32>
-    %1517 = stablehlo.broadcast_in_dim %1516, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1518 = stablehlo.maximum %1517, %1125 : tensor<1x512x5x5xf32>
-    %1519 = stablehlo.broadcast_in_dim %1518, dims = [0, 1, 2, 3] : (tensor<1x512x5x5xf32>) -> tensor<1x512x5x5xf32>
-    %1520 = stablehlo.minimum %1127, %1519 : tensor<1x512x5x5xf32>
-    %1521 = stablehlo.convolution(%1520, %arg116) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x5x5xf32>, tensor<546x512x1x1xf32>) -> tensor<1x546x5x5xf32>
-    %1522 = stablehlo.reshape %arg117 : (tensor<546xf32>) -> tensor<546x1x1xf32>
-    %1523 = stablehlo.broadcast_in_dim %1521, dims = [0, 1, 2, 3] : (tensor<1x546x5x5xf32>) -> tensor<1x546x5x5xf32>
-    %1524 = stablehlo.broadcast_in_dim %1522, dims = [1, 2, 3] : (tensor<546x1x1xf32>) -> tensor<1x546x5x5xf32>
-    %1525 = stablehlo.add %1523, %1524 : tensor<1x546x5x5xf32>
-    %1526 = stablehlo.reshape %1525 : (tensor<1x546x5x5xf32>) -> tensor<1x6x91x5x5xf32>
-    %1527 = stablehlo.transpose %1526, dims = [0, 3, 4, 1, 2] : (tensor<1x6x91x5x5xf32>) -> tensor<1x5x5x6x91xf32>
-    %1528 = stablehlo.reshape %1527 : (tensor<1x5x5x6x91xf32>) -> tensor<1x150x91xf32>
-    %1529 = stablehlo.convolution(%1186, %arg118) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x3x3xf32>, tensor<256x1x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1530 = stablehlo.broadcast_in_dim %1529, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1531 = stablehlo.broadcast_in_dim %arg399, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1532 = stablehlo.subtract %1530, %1531 : tensor<1x256x3x3xf32>
-    %1533 = stablehlo.broadcast_in_dim %1532, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1534 = stablehlo.broadcast_in_dim %arg400, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1535 = stablehlo.multiply %1533, %1534 : tensor<1x256x3x3xf32>
-    %1536 = stablehlo.broadcast_in_dim %1535, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1537 = stablehlo.broadcast_in_dim %arg401, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1538 = stablehlo.multiply %1536, %1537 : tensor<1x256x3x3xf32>
-    %1539 = stablehlo.broadcast_in_dim %1538, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1540 = stablehlo.broadcast_in_dim %arg402, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x3x3xf32>
-    %1541 = stablehlo.add %1539, %1540 : tensor<1x256x3x3xf32>
-    %1542 = stablehlo.broadcast_in_dim %1541, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1543 = stablehlo.maximum %1542, %1182 : tensor<1x256x3x3xf32>
-    %1544 = stablehlo.broadcast_in_dim %1543, dims = [0, 1, 2, 3] : (tensor<1x256x3x3xf32>) -> tensor<1x256x3x3xf32>
-    %1545 = stablehlo.minimum %1184, %1544 : tensor<1x256x3x3xf32>
-    %1546 = stablehlo.convolution(%1545, %arg119) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x3x3xf32>, tensor<546x256x1x1xf32>) -> tensor<1x546x3x3xf32>
-    %1547 = stablehlo.reshape %arg120 : (tensor<546xf32>) -> tensor<546x1x1xf32>
-    %1548 = stablehlo.broadcast_in_dim %1546, dims = [0, 1, 2, 3] : (tensor<1x546x3x3xf32>) -> tensor<1x546x3x3xf32>
-    %1549 = stablehlo.broadcast_in_dim %1547, dims = [1, 2, 3] : (tensor<546x1x1xf32>) -> tensor<1x546x3x3xf32>
-    %1550 = stablehlo.add %1548, %1549 : tensor<1x546x3x3xf32>
-    %1551 = stablehlo.reshape %1550 : (tensor<1x546x3x3xf32>) -> tensor<1x6x91x3x3xf32>
-    %1552 = stablehlo.transpose %1551, dims = [0, 3, 4, 1, 2] : (tensor<1x6x91x3x3xf32>) -> tensor<1x3x3x6x91xf32>
-    %1553 = stablehlo.reshape %1552 : (tensor<1x3x3x6x91xf32>) -> tensor<1x54x91xf32>
-    %1554 = stablehlo.convolution(%1241, %arg121) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x2x2xf32>, tensor<256x1x3x3xf32>) -> tensor<1x256x2x2xf32>
-    %1555 = stablehlo.broadcast_in_dim %1554, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1556 = stablehlo.broadcast_in_dim %arg403, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1557 = stablehlo.subtract %1555, %1556 : tensor<1x256x2x2xf32>
-    %1558 = stablehlo.broadcast_in_dim %1557, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1559 = stablehlo.broadcast_in_dim %arg404, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1560 = stablehlo.multiply %1558, %1559 : tensor<1x256x2x2xf32>
-    %1561 = stablehlo.broadcast_in_dim %1560, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1562 = stablehlo.broadcast_in_dim %arg405, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1563 = stablehlo.multiply %1561, %1562 : tensor<1x256x2x2xf32>
-    %1564 = stablehlo.broadcast_in_dim %1563, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1565 = stablehlo.broadcast_in_dim %arg406, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x2x2xf32>
-    %1566 = stablehlo.add %1564, %1565 : tensor<1x256x2x2xf32>
-    %1567 = stablehlo.broadcast_in_dim %1566, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1568 = stablehlo.maximum %1567, %1237 : tensor<1x256x2x2xf32>
-    %1569 = stablehlo.broadcast_in_dim %1568, dims = [0, 1, 2, 3] : (tensor<1x256x2x2xf32>) -> tensor<1x256x2x2xf32>
-    %1570 = stablehlo.minimum %1239, %1569 : tensor<1x256x2x2xf32>
-    %1571 = stablehlo.convolution(%1570, %arg122) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x2x2xf32>, tensor<546x256x1x1xf32>) -> tensor<1x546x2x2xf32>
-    %1572 = stablehlo.reshape %arg123 : (tensor<546xf32>) -> tensor<546x1x1xf32>
-    %1573 = stablehlo.broadcast_in_dim %1571, dims = [0, 1, 2, 3] : (tensor<1x546x2x2xf32>) -> tensor<1x546x2x2xf32>
-    %1574 = stablehlo.broadcast_in_dim %1572, dims = [1, 2, 3] : (tensor<546x1x1xf32>) -> tensor<1x546x2x2xf32>
-    %1575 = stablehlo.add %1573, %1574 : tensor<1x546x2x2xf32>
-    %1576 = stablehlo.reshape %1575 : (tensor<1x546x2x2xf32>) -> tensor<1x6x91x2x2xf32>
-    %1577 = stablehlo.transpose %1576, dims = [0, 3, 4, 1, 2] : (tensor<1x6x91x2x2xf32>) -> tensor<1x2x2x6x91xf32>
-    %1578 = stablehlo.reshape %1577 : (tensor<1x2x2x6x91xf32>) -> tensor<1x24x91xf32>
-    %1579 = stablehlo.convolution(%1298, %arg124) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x1x1xf32>, tensor<128x1x3x3xf32>) -> tensor<1x128x1x1xf32>
-    %1580 = stablehlo.broadcast_in_dim %1579, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1581 = stablehlo.broadcast_in_dim %arg407, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1582 = stablehlo.subtract %1580, %1581 : tensor<1x128x1x1xf32>
-    %1583 = stablehlo.broadcast_in_dim %1582, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1584 = stablehlo.broadcast_in_dim %arg408, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1585 = stablehlo.multiply %1583, %1584 : tensor<1x128x1x1xf32>
-    %1586 = stablehlo.broadcast_in_dim %1585, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1587 = stablehlo.broadcast_in_dim %arg409, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1588 = stablehlo.multiply %1586, %1587 : tensor<1x128x1x1xf32>
-    %1589 = stablehlo.broadcast_in_dim %1588, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1590 = stablehlo.broadcast_in_dim %arg410, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1591 = stablehlo.add %1589, %1590 : tensor<1x128x1x1xf32>
-    %1592 = stablehlo.broadcast_in_dim %1591, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1593 = stablehlo.maximum %1592, %1294 : tensor<1x128x1x1xf32>
-    %1594 = stablehlo.broadcast_in_dim %1593, dims = [0, 1, 2, 3] : (tensor<1x128x1x1xf32>) -> tensor<1x128x1x1xf32>
-    %1595 = stablehlo.minimum %1296, %1594 : tensor<1x128x1x1xf32>
-    %1596 = stablehlo.convolution(%1595, %arg125) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x1x1xf32>, tensor<546x128x1x1xf32>) -> tensor<1x546x1x1xf32>
-    %1597 = stablehlo.reshape %arg126 : (tensor<546xf32>) -> tensor<546x1x1xf32>
-    %1598 = stablehlo.broadcast_in_dim %1596, dims = [0, 1, 2, 3] : (tensor<1x546x1x1xf32>) -> tensor<1x546x1x1xf32>
-    %1599 = stablehlo.broadcast_in_dim %1597, dims = [1, 2, 3] : (tensor<546x1x1xf32>) -> tensor<1x546x1x1xf32>
-    %1600 = stablehlo.add %1598, %1599 : tensor<1x546x1x1xf32>
-    %1601 = stablehlo.reshape %1600 : (tensor<1x546x1x1xf32>) -> tensor<1x6x91x1x1xf32>
-    %1602 = stablehlo.transpose %1601, dims = [0, 3, 4, 1, 2] : (tensor<1x6x91x1x1xf32>) -> tensor<1x1x1x6x91xf32>
-    %1603 = stablehlo.reshape %1602 : (tensor<1x1x1x6x91xf32>) -> tensor<1x6x91xf32>
-    %1604 = stablehlo.concatenate %1478, %1503, %1528, %1553, %1578, %1603, dim = 1 : (tensor<1x2400x91xf32>, tensor<1x600x91xf32>, tensor<1x150x91xf32>, tensor<1x54x91xf32>, tensor<1x24x91xf32>, tensor<1x6x91xf32>) -> tensor<1x3234x91xf32>
-    return %1453, %1604, %arg411, %22 : tensor<1x3234x4xf32>, tensor<1x3234x91xf32>, tensor<3234x4xf32>, tensor<1x3x320x320xf32>
-  }
-}
diff --git a/mlir_tests/MobileNetV2.mlir b/mlir_tests/MobileNetV2.mlir
deleted file mode 100644
index d2e9b1e2..00000000
--- a/mlir_tests/MobileNetV2.mlir
+++ /dev/null
@@ -1,1089 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x224x224xbf16>, %arg1: tensor<32x3x3x3xbf16>, %arg2: tensor<32x1x3x3xbf16>, %arg3: tensor<16x32x1x1xbf16>, %arg4: tensor<96x16x1x1xbf16>, %arg5: tensor<96x1x3x3xbf16>, %arg6: tensor<24x96x1x1xbf16>, %arg7: tensor<144x24x1x1xbf16>, %arg8: tensor<144x1x3x3xbf16>, %arg9: tensor<24x144x1x1xbf16>, %arg10: tensor<144x24x1x1xbf16>, %arg11: tensor<144x1x3x3xbf16>, %arg12: tensor<32x144x1x1xbf16>, %arg13: tensor<192x32x1x1xbf16>, %arg14: tensor<192x1x3x3xbf16>, %arg15: tensor<32x192x1x1xbf16>, %arg16: tensor<192x32x1x1xbf16>, %arg17: tensor<192x1x3x3xbf16>, %arg18: tensor<32x192x1x1xbf16>, %arg19: tensor<192x32x1x1xbf16>, %arg20: tensor<192x1x3x3xbf16>, %arg21: tensor<64x192x1x1xbf16>, %arg22: tensor<384x64x1x1xbf16>, %arg23: tensor<384x1x3x3xbf16>, %arg24: tensor<64x384x1x1xbf16>, %arg25: tensor<384x64x1x1xbf16>, %arg26: tensor<384x1x3x3xbf16>, %arg27: tensor<64x384x1x1xbf16>, %arg28: tensor<384x64x1x1xbf16>, %arg29: tensor<384x1x3x3xbf16>, %arg30: tensor<64x384x1x1xbf16>, %arg31: tensor<384x64x1x1xbf16>, %arg32: tensor<384x1x3x3xbf16>, %arg33: tensor<96x384x1x1xbf16>, %arg34: tensor<576x96x1x1xbf16>, %arg35: tensor<576x1x3x3xbf16>, %arg36: tensor<96x576x1x1xbf16>, %arg37: tensor<576x96x1x1xbf16>, %arg38: tensor<576x1x3x3xbf16>, %arg39: tensor<96x576x1x1xbf16>, %arg40: tensor<576x96x1x1xbf16>, %arg41: tensor<576x1x3x3xbf16>, %arg42: tensor<160x576x1x1xbf16>, %arg43: tensor<960x160x1x1xbf16>, %arg44: tensor<960x1x3x3xbf16>, %arg45: tensor<160x960x1x1xbf16>, %arg46: tensor<960x160x1x1xbf16>, %arg47: tensor<960x1x3x3xbf16>, %arg48: tensor<160x960x1x1xbf16>, %arg49: tensor<960x160x1x1xbf16>, %arg50: tensor<960x1x3x3xbf16>, %arg51: tensor<320x960x1x1xbf16>, %arg52: tensor<1280x320x1x1xbf16>, %arg53: tensor<32x1x1xf32>, %arg54: tensor<32x1x1xf32>, %arg55: tensor<32x1x1xbf16>, %arg56: tensor<32x1x1xbf16>, %arg57: tensor<32x1x1xf32>, %arg58: tensor<32x1x1xf32>, %arg59: tensor<32x1x1xbf16>, %arg60: tensor<32x1x1xbf16>, %arg61: tensor<16x1x1xf32>, %arg62: tensor<16x1x1xf32>, %arg63: tensor<16x1x1xbf16>, %arg64: tensor<16x1x1xbf16>, %arg65: tensor<96x1x1xf32>, %arg66: tensor<96x1x1xf32>, %arg67: tensor<96x1x1xbf16>, %arg68: tensor<96x1x1xbf16>, %arg69: tensor<96x1x1xf32>, %arg70: tensor<96x1x1xf32>, %arg71: tensor<96x1x1xbf16>, %arg72: tensor<96x1x1xbf16>, %arg73: tensor<24x1x1xf32>, %arg74: tensor<24x1x1xf32>, %arg75: tensor<24x1x1xbf16>, %arg76: tensor<24x1x1xbf16>, %arg77: tensor<144x1x1xf32>, %arg78: tensor<144x1x1xf32>, %arg79: tensor<144x1x1xbf16>, %arg80: tensor<144x1x1xbf16>, %arg81: tensor<144x1x1xf32>, %arg82: tensor<144x1x1xf32>, %arg83: tensor<144x1x1xbf16>, %arg84: tensor<144x1x1xbf16>, %arg85: tensor<24x1x1xf32>, %arg86: tensor<24x1x1xf32>, %arg87: tensor<24x1x1xbf16>, %arg88: tensor<24x1x1xbf16>, %arg89: tensor<144x1x1xf32>, %arg90: tensor<144x1x1xf32>, %arg91: tensor<144x1x1xbf16>, %arg92: tensor<144x1x1xbf16>, %arg93: tensor<144x1x1xf32>, %arg94: tensor<144x1x1xf32>, %arg95: tensor<144x1x1xbf16>, %arg96: tensor<144x1x1xbf16>, %arg97: tensor<32x1x1xf32>, %arg98: tensor<32x1x1xf32>, %arg99: tensor<32x1x1xbf16>, %arg100: tensor<32x1x1xbf16>, %arg101: tensor<192x1x1xf32>, %arg102: tensor<192x1x1xf32>, %arg103: tensor<192x1x1xbf16>, %arg104: tensor<192x1x1xbf16>, %arg105: tensor<192x1x1xf32>, %arg106: tensor<192x1x1xf32>, %arg107: tensor<192x1x1xbf16>, %arg108: tensor<192x1x1xbf16>, %arg109: tensor<32x1x1xf32>, %arg110: tensor<32x1x1xf32>, %arg111: tensor<32x1x1xbf16>, %arg112: tensor<32x1x1xbf16>, %arg113: tensor<192x1x1xf32>, %arg114: tensor<192x1x1xf32>, %arg115: tensor<192x1x1xbf16>, %arg116: tensor<192x1x1xbf16>, %arg117: tensor<192x1x1xf32>, %arg118: tensor<192x1x1xf32>, %arg119: tensor<192x1x1xbf16>, %arg120: tensor<192x1x1xbf16>, %arg121: tensor<32x1x1xf32>, %arg122: tensor<32x1x1xf32>, %arg123: tensor<32x1x1xbf16>, %arg124: tensor<32x1x1xbf16>, %arg125: tensor<192x1x1xf32>, %arg126: tensor<192x1x1xf32>, %arg127: tensor<192x1x1xbf16>, %arg128: tensor<192x1x1xbf16>, %arg129: tensor<192x1x1xf32>, %arg130: tensor<192x1x1xf32>, %arg131: tensor<192x1x1xbf16>, %arg132: tensor<192x1x1xbf16>, %arg133: tensor<64x1x1xf32>, %arg134: tensor<64x1x1xf32>, %arg135: tensor<64x1x1xbf16>, %arg136: tensor<64x1x1xbf16>, %arg137: tensor<384x1x1xf32>, %arg138: tensor<384x1x1xf32>, %arg139: tensor<384x1x1xbf16>, %arg140: tensor<384x1x1xbf16>, %arg141: tensor<384x1x1xf32>, %arg142: tensor<384x1x1xf32>, %arg143: tensor<384x1x1xbf16>, %arg144: tensor<384x1x1xbf16>, %arg145: tensor<64x1x1xf32>, %arg146: tensor<64x1x1xf32>, %arg147: tensor<64x1x1xbf16>, %arg148: tensor<64x1x1xbf16>, %arg149: tensor<384x1x1xf32>, %arg150: tensor<384x1x1xf32>, %arg151: tensor<384x1x1xbf16>, %arg152: tensor<384x1x1xbf16>, %arg153: tensor<384x1x1xf32>, %arg154: tensor<384x1x1xf32>, %arg155: tensor<384x1x1xbf16>, %arg156: tensor<384x1x1xbf16>, %arg157: tensor<64x1x1xf32>, %arg158: tensor<64x1x1xf32>, %arg159: tensor<64x1x1xbf16>, %arg160: tensor<64x1x1xbf16>, %arg161: tensor<384x1x1xf32>, %arg162: tensor<384x1x1xf32>, %arg163: tensor<384x1x1xbf16>, %arg164: tensor<384x1x1xbf16>, %arg165: tensor<384x1x1xf32>, %arg166: tensor<384x1x1xf32>, %arg167: tensor<384x1x1xbf16>, %arg168: tensor<384x1x1xbf16>, %arg169: tensor<64x1x1xf32>, %arg170: tensor<64x1x1xf32>, %arg171: tensor<64x1x1xbf16>, %arg172: tensor<64x1x1xbf16>, %arg173: tensor<384x1x1xf32>, %arg174: tensor<384x1x1xf32>, %arg175: tensor<384x1x1xbf16>, %arg176: tensor<384x1x1xbf16>, %arg177: tensor<384x1x1xf32>, %arg178: tensor<384x1x1xf32>, %arg179: tensor<384x1x1xbf16>, %arg180: tensor<384x1x1xbf16>, %arg181: tensor<96x1x1xf32>, %arg182: tensor<96x1x1xf32>, %arg183: tensor<96x1x1xbf16>, %arg184: tensor<96x1x1xbf16>, %arg185: tensor<576x1x1xf32>, %arg186: tensor<576x1x1xf32>, %arg187: tensor<576x1x1xbf16>, %arg188: tensor<576x1x1xbf16>, %arg189: tensor<576x1x1xf32>, %arg190: tensor<576x1x1xf32>, %arg191: tensor<576x1x1xbf16>, %arg192: tensor<576x1x1xbf16>, %arg193: tensor<96x1x1xf32>, %arg194: tensor<96x1x1xf32>, %arg195: tensor<96x1x1xbf16>, %arg196: tensor<96x1x1xbf16>, %arg197: tensor<576x1x1xf32>, %arg198: tensor<576x1x1xf32>, %arg199: tensor<576x1x1xbf16>, %arg200: tensor<576x1x1xbf16>, %arg201: tensor<576x1x1xf32>, %arg202: tensor<576x1x1xf32>, %arg203: tensor<576x1x1xbf16>, %arg204: tensor<576x1x1xbf16>, %arg205: tensor<96x1x1xf32>, %arg206: tensor<96x1x1xf32>, %arg207: tensor<96x1x1xbf16>, %arg208: tensor<96x1x1xbf16>, %arg209: tensor<576x1x1xf32>, %arg210: tensor<576x1x1xf32>, %arg211: tensor<576x1x1xbf16>, %arg212: tensor<576x1x1xbf16>, %arg213: tensor<576x1x1xf32>, %arg214: tensor<576x1x1xf32>, %arg215: tensor<576x1x1xbf16>, %arg216: tensor<576x1x1xbf16>, %arg217: tensor<160x1x1xf32>, %arg218: tensor<160x1x1xf32>, %arg219: tensor<160x1x1xbf16>, %arg220: tensor<160x1x1xbf16>, %arg221: tensor<960x1x1xf32>, %arg222: tensor<960x1x1xf32>, %arg223: tensor<960x1x1xbf16>, %arg224: tensor<960x1x1xbf16>, %arg225: tensor<960x1x1xf32>, %arg226: tensor<960x1x1xf32>, %arg227: tensor<960x1x1xbf16>, %arg228: tensor<960x1x1xbf16>, %arg229: tensor<160x1x1xf32>, %arg230: tensor<160x1x1xf32>, %arg231: tensor<160x1x1xbf16>, %arg232: tensor<160x1x1xbf16>, %arg233: tensor<960x1x1xf32>, %arg234: tensor<960x1x1xf32>, %arg235: tensor<960x1x1xbf16>, %arg236: tensor<960x1x1xbf16>, %arg237: tensor<960x1x1xf32>, %arg238: tensor<960x1x1xf32>, %arg239: tensor<960x1x1xbf16>, %arg240: tensor<960x1x1xbf16>, %arg241: tensor<160x1x1xf32>, %arg242: tensor<160x1x1xf32>, %arg243: tensor<160x1x1xbf16>, %arg244: tensor<160x1x1xbf16>, %arg245: tensor<960x1x1xf32>, %arg246: tensor<960x1x1xf32>, %arg247: tensor<960x1x1xbf16>, %arg248: tensor<960x1x1xbf16>, %arg249: tensor<960x1x1xf32>, %arg250: tensor<960x1x1xf32>, %arg251: tensor<960x1x1xbf16>, %arg252: tensor<960x1x1xbf16>, %arg253: tensor<320x1x1xf32>, %arg254: tensor<320x1x1xf32>, %arg255: tensor<320x1x1xbf16>, %arg256: tensor<320x1x1xbf16>, %arg257: tensor<1280x1x1xf32>, %arg258: tensor<1280x1x1xf32>, %arg259: tensor<1280x1x1xbf16>, %arg260: tensor<1280x1x1xbf16>, %arg261: tensor<1280x1000xf32>, %arg262: tensor<1000xf32>) -> tensor<1x1000xbf16> {
-    %cst = stablehlo.constant dense<6.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_1 = stablehlo.constant dense<0.000000e+00> : tensor<bf16>
-    %cst_2 = arith.constant dense<49> : tensor<1xi64>
-    %cst_3 = arith.constant dense<1> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x224x224xbf16>, tensor<32x3x3x3xbf16>) -> tensor<1x32x112x112xbf16>
-    %1 = stablehlo.convert %0 : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xf32>
-    %2 = stablehlo.broadcast_in_dim %1, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %3 = stablehlo.broadcast_in_dim %arg53, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %4 = stablehlo.subtract %2, %3 : tensor<1x32x112x112xf32>
-    %5 = stablehlo.broadcast_in_dim %4, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %6 = stablehlo.broadcast_in_dim %arg54, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %7 = stablehlo.multiply %5, %6 : tensor<1x32x112x112xf32>
-    %8 = stablehlo.convert %arg55 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %9 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %10 = stablehlo.broadcast_in_dim %8, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %11 = stablehlo.multiply %9, %10 : tensor<1x32x112x112xf32>
-    %12 = stablehlo.convert %arg56 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %13 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %14 = stablehlo.broadcast_in_dim %12, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %15 = stablehlo.add %13, %14 : tensor<1x32x112x112xf32>
-    %16 = stablehlo.convert %15 : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xbf16>
-    %17 = stablehlo.convert %cst_0 : (tensor<f64>) -> tensor<bf16>
-    %18 = stablehlo.broadcast_in_dim %16, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xbf16>
-    %19 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x32x112x112xbf16>
-    %20 = stablehlo.maximum %18, %19 : tensor<1x32x112x112xbf16>
-    %21 = stablehlo.convert %cst : (tensor<f64>) -> tensor<bf16>
-    %22 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x32x112x112xbf16>
-    %23 = stablehlo.broadcast_in_dim %20, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xbf16>
-    %24 = stablehlo.minimum %22, %23 : tensor<1x32x112x112xbf16>
-    %25 = stablehlo.convolution(%24, %arg2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 32 : i64} : (tensor<1x32x112x112xbf16>, tensor<32x1x3x3xbf16>) -> tensor<1x32x112x112xbf16>
-    %26 = stablehlo.convert %25 : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xf32>
-    %27 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %28 = stablehlo.broadcast_in_dim %arg57, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %29 = stablehlo.subtract %27, %28 : tensor<1x32x112x112xf32>
-    %30 = stablehlo.broadcast_in_dim %29, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %31 = stablehlo.broadcast_in_dim %arg58, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %32 = stablehlo.multiply %30, %31 : tensor<1x32x112x112xf32>
-    %33 = stablehlo.convert %arg59 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %34 = stablehlo.broadcast_in_dim %32, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %35 = stablehlo.broadcast_in_dim %33, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %36 = stablehlo.multiply %34, %35 : tensor<1x32x112x112xf32>
-    %37 = stablehlo.convert %arg60 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %38 = stablehlo.broadcast_in_dim %36, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %39 = stablehlo.broadcast_in_dim %37, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %40 = stablehlo.add %38, %39 : tensor<1x32x112x112xf32>
-    %41 = stablehlo.convert %40 : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xbf16>
-    %42 = stablehlo.broadcast_in_dim %41, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xbf16>
-    %43 = stablehlo.maximum %42, %19 : tensor<1x32x112x112xbf16>
-    %44 = stablehlo.broadcast_in_dim %43, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xbf16>
-    %45 = stablehlo.minimum %22, %44 : tensor<1x32x112x112xbf16>
-    %46 = stablehlo.convolution(%45, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x112x112xbf16>, tensor<16x32x1x1xbf16>) -> tensor<1x16x112x112xbf16>
-    %47 = stablehlo.convert %46 : (tensor<1x16x112x112xbf16>) -> tensor<1x16x112x112xf32>
-    %48 = stablehlo.broadcast_in_dim %47, dims = [0, 1, 2, 3] : (tensor<1x16x112x112xf32>) -> tensor<1x16x112x112xf32>
-    %49 = stablehlo.broadcast_in_dim %arg61, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x112x112xf32>
-    %50 = stablehlo.subtract %48, %49 : tensor<1x16x112x112xf32>
-    %51 = stablehlo.broadcast_in_dim %50, dims = [0, 1, 2, 3] : (tensor<1x16x112x112xf32>) -> tensor<1x16x112x112xf32>
-    %52 = stablehlo.broadcast_in_dim %arg62, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x112x112xf32>
-    %53 = stablehlo.multiply %51, %52 : tensor<1x16x112x112xf32>
-    %54 = stablehlo.convert %arg63 : (tensor<16x1x1xbf16>) -> tensor<16x1x1xf32>
-    %55 = stablehlo.broadcast_in_dim %53, dims = [0, 1, 2, 3] : (tensor<1x16x112x112xf32>) -> tensor<1x16x112x112xf32>
-    %56 = stablehlo.broadcast_in_dim %54, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x112x112xf32>
-    %57 = stablehlo.multiply %55, %56 : tensor<1x16x112x112xf32>
-    %58 = stablehlo.convert %arg64 : (tensor<16x1x1xbf16>) -> tensor<16x1x1xf32>
-    %59 = stablehlo.broadcast_in_dim %57, dims = [0, 1, 2, 3] : (tensor<1x16x112x112xf32>) -> tensor<1x16x112x112xf32>
-    %60 = stablehlo.broadcast_in_dim %58, dims = [1, 2, 3] : (tensor<16x1x1xf32>) -> tensor<1x16x112x112xf32>
-    %61 = stablehlo.add %59, %60 : tensor<1x16x112x112xf32>
-    %62 = stablehlo.convert %61 : (tensor<1x16x112x112xf32>) -> tensor<1x16x112x112xbf16>
-    %63 = stablehlo.convolution(%62, %arg4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x16x112x112xbf16>, tensor<96x16x1x1xbf16>) -> tensor<1x96x112x112xbf16>
-    %64 = stablehlo.convert %63 : (tensor<1x96x112x112xbf16>) -> tensor<1x96x112x112xf32>
-    %65 = stablehlo.broadcast_in_dim %64, dims = [0, 1, 2, 3] : (tensor<1x96x112x112xf32>) -> tensor<1x96x112x112xf32>
-    %66 = stablehlo.broadcast_in_dim %arg65, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x112x112xf32>
-    %67 = stablehlo.subtract %65, %66 : tensor<1x96x112x112xf32>
-    %68 = stablehlo.broadcast_in_dim %67, dims = [0, 1, 2, 3] : (tensor<1x96x112x112xf32>) -> tensor<1x96x112x112xf32>
-    %69 = stablehlo.broadcast_in_dim %arg66, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x112x112xf32>
-    %70 = stablehlo.multiply %68, %69 : tensor<1x96x112x112xf32>
-    %71 = stablehlo.convert %arg67 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %72 = stablehlo.broadcast_in_dim %70, dims = [0, 1, 2, 3] : (tensor<1x96x112x112xf32>) -> tensor<1x96x112x112xf32>
-    %73 = stablehlo.broadcast_in_dim %71, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x112x112xf32>
-    %74 = stablehlo.multiply %72, %73 : tensor<1x96x112x112xf32>
-    %75 = stablehlo.convert %arg68 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %76 = stablehlo.broadcast_in_dim %74, dims = [0, 1, 2, 3] : (tensor<1x96x112x112xf32>) -> tensor<1x96x112x112xf32>
-    %77 = stablehlo.broadcast_in_dim %75, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x112x112xf32>
-    %78 = stablehlo.add %76, %77 : tensor<1x96x112x112xf32>
-    %79 = stablehlo.convert %78 : (tensor<1x96x112x112xf32>) -> tensor<1x96x112x112xbf16>
-    %80 = stablehlo.broadcast_in_dim %79, dims = [0, 1, 2, 3] : (tensor<1x96x112x112xbf16>) -> tensor<1x96x112x112xbf16>
-    %81 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x96x112x112xbf16>
-    %82 = stablehlo.maximum %80, %81 : tensor<1x96x112x112xbf16>
-    %83 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x96x112x112xbf16>
-    %84 = stablehlo.broadcast_in_dim %82, dims = [0, 1, 2, 3] : (tensor<1x96x112x112xbf16>) -> tensor<1x96x112x112xbf16>
-    %85 = stablehlo.minimum %83, %84 : tensor<1x96x112x112xbf16>
-    %86 = stablehlo.convolution(%85, %arg5) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 96 : i64} : (tensor<1x96x112x112xbf16>, tensor<96x1x3x3xbf16>) -> tensor<1x96x56x56xbf16>
-    %87 = stablehlo.convert %86 : (tensor<1x96x56x56xbf16>) -> tensor<1x96x56x56xf32>
-    %88 = stablehlo.broadcast_in_dim %87, dims = [0, 1, 2, 3] : (tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xf32>
-    %89 = stablehlo.broadcast_in_dim %arg69, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x56x56xf32>
-    %90 = stablehlo.subtract %88, %89 : tensor<1x96x56x56xf32>
-    %91 = stablehlo.broadcast_in_dim %90, dims = [0, 1, 2, 3] : (tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xf32>
-    %92 = stablehlo.broadcast_in_dim %arg70, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x56x56xf32>
-    %93 = stablehlo.multiply %91, %92 : tensor<1x96x56x56xf32>
-    %94 = stablehlo.convert %arg71 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %95 = stablehlo.broadcast_in_dim %93, dims = [0, 1, 2, 3] : (tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xf32>
-    %96 = stablehlo.broadcast_in_dim %94, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x56x56xf32>
-    %97 = stablehlo.multiply %95, %96 : tensor<1x96x56x56xf32>
-    %98 = stablehlo.convert %arg72 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %99 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xf32>
-    %100 = stablehlo.broadcast_in_dim %98, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x56x56xf32>
-    %101 = stablehlo.add %99, %100 : tensor<1x96x56x56xf32>
-    %102 = stablehlo.convert %101 : (tensor<1x96x56x56xf32>) -> tensor<1x96x56x56xbf16>
-    %103 = stablehlo.broadcast_in_dim %102, dims = [0, 1, 2, 3] : (tensor<1x96x56x56xbf16>) -> tensor<1x96x56x56xbf16>
-    %104 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x96x56x56xbf16>
-    %105 = stablehlo.maximum %103, %104 : tensor<1x96x56x56xbf16>
-    %106 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x96x56x56xbf16>
-    %107 = stablehlo.broadcast_in_dim %105, dims = [0, 1, 2, 3] : (tensor<1x96x56x56xbf16>) -> tensor<1x96x56x56xbf16>
-    %108 = stablehlo.minimum %106, %107 : tensor<1x96x56x56xbf16>
-    %109 = stablehlo.convolution(%108, %arg6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x96x56x56xbf16>, tensor<24x96x1x1xbf16>) -> tensor<1x24x56x56xbf16>
-    %110 = stablehlo.convert %109 : (tensor<1x24x56x56xbf16>) -> tensor<1x24x56x56xf32>
-    %111 = stablehlo.broadcast_in_dim %110, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %112 = stablehlo.broadcast_in_dim %arg73, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %113 = stablehlo.subtract %111, %112 : tensor<1x24x56x56xf32>
-    %114 = stablehlo.broadcast_in_dim %113, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %115 = stablehlo.broadcast_in_dim %arg74, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %116 = stablehlo.multiply %114, %115 : tensor<1x24x56x56xf32>
-    %117 = stablehlo.convert %arg75 : (tensor<24x1x1xbf16>) -> tensor<24x1x1xf32>
-    %118 = stablehlo.broadcast_in_dim %116, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %119 = stablehlo.broadcast_in_dim %117, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %120 = stablehlo.multiply %118, %119 : tensor<1x24x56x56xf32>
-    %121 = stablehlo.convert %arg76 : (tensor<24x1x1xbf16>) -> tensor<24x1x1xf32>
-    %122 = stablehlo.broadcast_in_dim %120, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %123 = stablehlo.broadcast_in_dim %121, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %124 = stablehlo.add %122, %123 : tensor<1x24x56x56xf32>
-    %125 = stablehlo.convert %124 : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xbf16>
-    %126 = stablehlo.convolution(%125, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x24x56x56xbf16>, tensor<144x24x1x1xbf16>) -> tensor<1x144x56x56xbf16>
-    %127 = stablehlo.convert %126 : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xf32>
-    %128 = stablehlo.broadcast_in_dim %127, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %129 = stablehlo.broadcast_in_dim %arg77, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %130 = stablehlo.subtract %128, %129 : tensor<1x144x56x56xf32>
-    %131 = stablehlo.broadcast_in_dim %130, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %132 = stablehlo.broadcast_in_dim %arg78, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %133 = stablehlo.multiply %131, %132 : tensor<1x144x56x56xf32>
-    %134 = stablehlo.convert %arg79 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %135 = stablehlo.broadcast_in_dim %133, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %136 = stablehlo.broadcast_in_dim %134, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %137 = stablehlo.multiply %135, %136 : tensor<1x144x56x56xf32>
-    %138 = stablehlo.convert %arg80 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %139 = stablehlo.broadcast_in_dim %137, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %140 = stablehlo.broadcast_in_dim %138, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %141 = stablehlo.add %139, %140 : tensor<1x144x56x56xf32>
-    %142 = stablehlo.convert %141 : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xbf16>
-    %143 = stablehlo.broadcast_in_dim %142, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xbf16>
-    %144 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x144x56x56xbf16>
-    %145 = stablehlo.maximum %143, %144 : tensor<1x144x56x56xbf16>
-    %146 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x144x56x56xbf16>
-    %147 = stablehlo.broadcast_in_dim %145, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xbf16>
-    %148 = stablehlo.minimum %146, %147 : tensor<1x144x56x56xbf16>
-    %149 = stablehlo.convolution(%148, %arg8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 144 : i64} : (tensor<1x144x56x56xbf16>, tensor<144x1x3x3xbf16>) -> tensor<1x144x56x56xbf16>
-    %150 = stablehlo.convert %149 : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xf32>
-    %151 = stablehlo.broadcast_in_dim %150, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %152 = stablehlo.broadcast_in_dim %arg81, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %153 = stablehlo.subtract %151, %152 : tensor<1x144x56x56xf32>
-    %154 = stablehlo.broadcast_in_dim %153, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %155 = stablehlo.broadcast_in_dim %arg82, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %156 = stablehlo.multiply %154, %155 : tensor<1x144x56x56xf32>
-    %157 = stablehlo.convert %arg83 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %158 = stablehlo.broadcast_in_dim %156, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %159 = stablehlo.broadcast_in_dim %157, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %160 = stablehlo.multiply %158, %159 : tensor<1x144x56x56xf32>
-    %161 = stablehlo.convert %arg84 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %162 = stablehlo.broadcast_in_dim %160, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %163 = stablehlo.broadcast_in_dim %161, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %164 = stablehlo.add %162, %163 : tensor<1x144x56x56xf32>
-    %165 = stablehlo.convert %164 : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xbf16>
-    %166 = stablehlo.broadcast_in_dim %165, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xbf16>
-    %167 = stablehlo.maximum %166, %144 : tensor<1x144x56x56xbf16>
-    %168 = stablehlo.broadcast_in_dim %167, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xbf16>
-    %169 = stablehlo.minimum %146, %168 : tensor<1x144x56x56xbf16>
-    %170 = stablehlo.convolution(%169, %arg9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x144x56x56xbf16>, tensor<24x144x1x1xbf16>) -> tensor<1x24x56x56xbf16>
-    %171 = stablehlo.convert %170 : (tensor<1x24x56x56xbf16>) -> tensor<1x24x56x56xf32>
-    %172 = stablehlo.broadcast_in_dim %171, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %173 = stablehlo.broadcast_in_dim %arg85, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %174 = stablehlo.subtract %172, %173 : tensor<1x24x56x56xf32>
-    %175 = stablehlo.broadcast_in_dim %174, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %176 = stablehlo.broadcast_in_dim %arg86, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %177 = stablehlo.multiply %175, %176 : tensor<1x24x56x56xf32>
-    %178 = stablehlo.convert %arg87 : (tensor<24x1x1xbf16>) -> tensor<24x1x1xf32>
-    %179 = stablehlo.broadcast_in_dim %177, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %180 = stablehlo.broadcast_in_dim %178, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %181 = stablehlo.multiply %179, %180 : tensor<1x24x56x56xf32>
-    %182 = stablehlo.convert %arg88 : (tensor<24x1x1xbf16>) -> tensor<24x1x1xf32>
-    %183 = stablehlo.broadcast_in_dim %181, dims = [0, 1, 2, 3] : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xf32>
-    %184 = stablehlo.broadcast_in_dim %182, dims = [1, 2, 3] : (tensor<24x1x1xf32>) -> tensor<1x24x56x56xf32>
-    %185 = stablehlo.add %183, %184 : tensor<1x24x56x56xf32>
-    %186 = stablehlo.convert %185 : (tensor<1x24x56x56xf32>) -> tensor<1x24x56x56xbf16>
-    %187 = stablehlo.add %125, %186 : tensor<1x24x56x56xbf16>
-    %188 = stablehlo.convolution(%187, %arg10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x24x56x56xbf16>, tensor<144x24x1x1xbf16>) -> tensor<1x144x56x56xbf16>
-    %189 = stablehlo.convert %188 : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xf32>
-    %190 = stablehlo.broadcast_in_dim %189, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %191 = stablehlo.broadcast_in_dim %arg89, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %192 = stablehlo.subtract %190, %191 : tensor<1x144x56x56xf32>
-    %193 = stablehlo.broadcast_in_dim %192, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %194 = stablehlo.broadcast_in_dim %arg90, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %195 = stablehlo.multiply %193, %194 : tensor<1x144x56x56xf32>
-    %196 = stablehlo.convert %arg91 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %197 = stablehlo.broadcast_in_dim %195, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %198 = stablehlo.broadcast_in_dim %196, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %199 = stablehlo.multiply %197, %198 : tensor<1x144x56x56xf32>
-    %200 = stablehlo.convert %arg92 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %201 = stablehlo.broadcast_in_dim %199, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xf32>
-    %202 = stablehlo.broadcast_in_dim %200, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x56x56xf32>
-    %203 = stablehlo.add %201, %202 : tensor<1x144x56x56xf32>
-    %204 = stablehlo.convert %203 : (tensor<1x144x56x56xf32>) -> tensor<1x144x56x56xbf16>
-    %205 = stablehlo.broadcast_in_dim %204, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xbf16>
-    %206 = stablehlo.maximum %205, %144 : tensor<1x144x56x56xbf16>
-    %207 = stablehlo.broadcast_in_dim %206, dims = [0, 1, 2, 3] : (tensor<1x144x56x56xbf16>) -> tensor<1x144x56x56xbf16>
-    %208 = stablehlo.minimum %146, %207 : tensor<1x144x56x56xbf16>
-    %209 = stablehlo.convolution(%208, %arg11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 144 : i64} : (tensor<1x144x56x56xbf16>, tensor<144x1x3x3xbf16>) -> tensor<1x144x28x28xbf16>
-    %210 = stablehlo.convert %209 : (tensor<1x144x28x28xbf16>) -> tensor<1x144x28x28xf32>
-    %211 = stablehlo.broadcast_in_dim %210, dims = [0, 1, 2, 3] : (tensor<1x144x28x28xf32>) -> tensor<1x144x28x28xf32>
-    %212 = stablehlo.broadcast_in_dim %arg93, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x28x28xf32>
-    %213 = stablehlo.subtract %211, %212 : tensor<1x144x28x28xf32>
-    %214 = stablehlo.broadcast_in_dim %213, dims = [0, 1, 2, 3] : (tensor<1x144x28x28xf32>) -> tensor<1x144x28x28xf32>
-    %215 = stablehlo.broadcast_in_dim %arg94, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x28x28xf32>
-    %216 = stablehlo.multiply %214, %215 : tensor<1x144x28x28xf32>
-    %217 = stablehlo.convert %arg95 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %218 = stablehlo.broadcast_in_dim %216, dims = [0, 1, 2, 3] : (tensor<1x144x28x28xf32>) -> tensor<1x144x28x28xf32>
-    %219 = stablehlo.broadcast_in_dim %217, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x28x28xf32>
-    %220 = stablehlo.multiply %218, %219 : tensor<1x144x28x28xf32>
-    %221 = stablehlo.convert %arg96 : (tensor<144x1x1xbf16>) -> tensor<144x1x1xf32>
-    %222 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2, 3] : (tensor<1x144x28x28xf32>) -> tensor<1x144x28x28xf32>
-    %223 = stablehlo.broadcast_in_dim %221, dims = [1, 2, 3] : (tensor<144x1x1xf32>) -> tensor<1x144x28x28xf32>
-    %224 = stablehlo.add %222, %223 : tensor<1x144x28x28xf32>
-    %225 = stablehlo.convert %224 : (tensor<1x144x28x28xf32>) -> tensor<1x144x28x28xbf16>
-    %226 = stablehlo.broadcast_in_dim %225, dims = [0, 1, 2, 3] : (tensor<1x144x28x28xbf16>) -> tensor<1x144x28x28xbf16>
-    %227 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x144x28x28xbf16>
-    %228 = stablehlo.maximum %226, %227 : tensor<1x144x28x28xbf16>
-    %229 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x144x28x28xbf16>
-    %230 = stablehlo.broadcast_in_dim %228, dims = [0, 1, 2, 3] : (tensor<1x144x28x28xbf16>) -> tensor<1x144x28x28xbf16>
-    %231 = stablehlo.minimum %229, %230 : tensor<1x144x28x28xbf16>
-    %232 = stablehlo.convolution(%231, %arg12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x144x28x28xbf16>, tensor<32x144x1x1xbf16>) -> tensor<1x32x28x28xbf16>
-    %233 = stablehlo.convert %232 : (tensor<1x32x28x28xbf16>) -> tensor<1x32x28x28xf32>
-    %234 = stablehlo.broadcast_in_dim %233, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %235 = stablehlo.broadcast_in_dim %arg97, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %236 = stablehlo.subtract %234, %235 : tensor<1x32x28x28xf32>
-    %237 = stablehlo.broadcast_in_dim %236, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %238 = stablehlo.broadcast_in_dim %arg98, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %239 = stablehlo.multiply %237, %238 : tensor<1x32x28x28xf32>
-    %240 = stablehlo.convert %arg99 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %241 = stablehlo.broadcast_in_dim %239, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %242 = stablehlo.broadcast_in_dim %240, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %243 = stablehlo.multiply %241, %242 : tensor<1x32x28x28xf32>
-    %244 = stablehlo.convert %arg100 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %245 = stablehlo.broadcast_in_dim %243, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %246 = stablehlo.broadcast_in_dim %244, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %247 = stablehlo.add %245, %246 : tensor<1x32x28x28xf32>
-    %248 = stablehlo.convert %247 : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xbf16>
-    %249 = stablehlo.convolution(%248, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x28x28xbf16>, tensor<192x32x1x1xbf16>) -> tensor<1x192x28x28xbf16>
-    %250 = stablehlo.convert %249 : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xf32>
-    %251 = stablehlo.broadcast_in_dim %250, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %252 = stablehlo.broadcast_in_dim %arg101, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %253 = stablehlo.subtract %251, %252 : tensor<1x192x28x28xf32>
-    %254 = stablehlo.broadcast_in_dim %253, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %255 = stablehlo.broadcast_in_dim %arg102, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %256 = stablehlo.multiply %254, %255 : tensor<1x192x28x28xf32>
-    %257 = stablehlo.convert %arg103 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %258 = stablehlo.broadcast_in_dim %256, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %259 = stablehlo.broadcast_in_dim %257, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %260 = stablehlo.multiply %258, %259 : tensor<1x192x28x28xf32>
-    %261 = stablehlo.convert %arg104 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %262 = stablehlo.broadcast_in_dim %260, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %263 = stablehlo.broadcast_in_dim %261, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %264 = stablehlo.add %262, %263 : tensor<1x192x28x28xf32>
-    %265 = stablehlo.convert %264 : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xbf16>
-    %266 = stablehlo.broadcast_in_dim %265, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %267 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x192x28x28xbf16>
-    %268 = stablehlo.maximum %266, %267 : tensor<1x192x28x28xbf16>
-    %269 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x192x28x28xbf16>
-    %270 = stablehlo.broadcast_in_dim %268, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %271 = stablehlo.minimum %269, %270 : tensor<1x192x28x28xbf16>
-    %272 = stablehlo.convolution(%271, %arg14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 192 : i64} : (tensor<1x192x28x28xbf16>, tensor<192x1x3x3xbf16>) -> tensor<1x192x28x28xbf16>
-    %273 = stablehlo.convert %272 : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xf32>
-    %274 = stablehlo.broadcast_in_dim %273, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %275 = stablehlo.broadcast_in_dim %arg105, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %276 = stablehlo.subtract %274, %275 : tensor<1x192x28x28xf32>
-    %277 = stablehlo.broadcast_in_dim %276, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %278 = stablehlo.broadcast_in_dim %arg106, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %279 = stablehlo.multiply %277, %278 : tensor<1x192x28x28xf32>
-    %280 = stablehlo.convert %arg107 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %281 = stablehlo.broadcast_in_dim %279, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %282 = stablehlo.broadcast_in_dim %280, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %283 = stablehlo.multiply %281, %282 : tensor<1x192x28x28xf32>
-    %284 = stablehlo.convert %arg108 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %285 = stablehlo.broadcast_in_dim %283, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %286 = stablehlo.broadcast_in_dim %284, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %287 = stablehlo.add %285, %286 : tensor<1x192x28x28xf32>
-    %288 = stablehlo.convert %287 : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xbf16>
-    %289 = stablehlo.broadcast_in_dim %288, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %290 = stablehlo.maximum %289, %267 : tensor<1x192x28x28xbf16>
-    %291 = stablehlo.broadcast_in_dim %290, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %292 = stablehlo.minimum %269, %291 : tensor<1x192x28x28xbf16>
-    %293 = stablehlo.convolution(%292, %arg15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x192x28x28xbf16>, tensor<32x192x1x1xbf16>) -> tensor<1x32x28x28xbf16>
-    %294 = stablehlo.convert %293 : (tensor<1x32x28x28xbf16>) -> tensor<1x32x28x28xf32>
-    %295 = stablehlo.broadcast_in_dim %294, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %296 = stablehlo.broadcast_in_dim %arg109, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %297 = stablehlo.subtract %295, %296 : tensor<1x32x28x28xf32>
-    %298 = stablehlo.broadcast_in_dim %297, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %299 = stablehlo.broadcast_in_dim %arg110, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %300 = stablehlo.multiply %298, %299 : tensor<1x32x28x28xf32>
-    %301 = stablehlo.convert %arg111 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %302 = stablehlo.broadcast_in_dim %300, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %303 = stablehlo.broadcast_in_dim %301, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %304 = stablehlo.multiply %302, %303 : tensor<1x32x28x28xf32>
-    %305 = stablehlo.convert %arg112 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %306 = stablehlo.broadcast_in_dim %304, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %307 = stablehlo.broadcast_in_dim %305, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %308 = stablehlo.add %306, %307 : tensor<1x32x28x28xf32>
-    %309 = stablehlo.convert %308 : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xbf16>
-    %310 = stablehlo.add %248, %309 : tensor<1x32x28x28xbf16>
-    %311 = stablehlo.convolution(%310, %arg16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x28x28xbf16>, tensor<192x32x1x1xbf16>) -> tensor<1x192x28x28xbf16>
-    %312 = stablehlo.convert %311 : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xf32>
-    %313 = stablehlo.broadcast_in_dim %312, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %314 = stablehlo.broadcast_in_dim %arg113, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %315 = stablehlo.subtract %313, %314 : tensor<1x192x28x28xf32>
-    %316 = stablehlo.broadcast_in_dim %315, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %317 = stablehlo.broadcast_in_dim %arg114, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %318 = stablehlo.multiply %316, %317 : tensor<1x192x28x28xf32>
-    %319 = stablehlo.convert %arg115 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %320 = stablehlo.broadcast_in_dim %318, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %321 = stablehlo.broadcast_in_dim %319, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %322 = stablehlo.multiply %320, %321 : tensor<1x192x28x28xf32>
-    %323 = stablehlo.convert %arg116 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %324 = stablehlo.broadcast_in_dim %322, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %325 = stablehlo.broadcast_in_dim %323, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %326 = stablehlo.add %324, %325 : tensor<1x192x28x28xf32>
-    %327 = stablehlo.convert %326 : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xbf16>
-    %328 = stablehlo.broadcast_in_dim %327, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %329 = stablehlo.maximum %328, %267 : tensor<1x192x28x28xbf16>
-    %330 = stablehlo.broadcast_in_dim %329, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %331 = stablehlo.minimum %269, %330 : tensor<1x192x28x28xbf16>
-    %332 = stablehlo.convolution(%331, %arg17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 192 : i64} : (tensor<1x192x28x28xbf16>, tensor<192x1x3x3xbf16>) -> tensor<1x192x28x28xbf16>
-    %333 = stablehlo.convert %332 : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xf32>
-    %334 = stablehlo.broadcast_in_dim %333, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %335 = stablehlo.broadcast_in_dim %arg117, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %336 = stablehlo.subtract %334, %335 : tensor<1x192x28x28xf32>
-    %337 = stablehlo.broadcast_in_dim %336, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %338 = stablehlo.broadcast_in_dim %arg118, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %339 = stablehlo.multiply %337, %338 : tensor<1x192x28x28xf32>
-    %340 = stablehlo.convert %arg119 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %341 = stablehlo.broadcast_in_dim %339, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %342 = stablehlo.broadcast_in_dim %340, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %343 = stablehlo.multiply %341, %342 : tensor<1x192x28x28xf32>
-    %344 = stablehlo.convert %arg120 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %345 = stablehlo.broadcast_in_dim %343, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %346 = stablehlo.broadcast_in_dim %344, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %347 = stablehlo.add %345, %346 : tensor<1x192x28x28xf32>
-    %348 = stablehlo.convert %347 : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xbf16>
-    %349 = stablehlo.broadcast_in_dim %348, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %350 = stablehlo.maximum %349, %267 : tensor<1x192x28x28xbf16>
-    %351 = stablehlo.broadcast_in_dim %350, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %352 = stablehlo.minimum %269, %351 : tensor<1x192x28x28xbf16>
-    %353 = stablehlo.convolution(%352, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x192x28x28xbf16>, tensor<32x192x1x1xbf16>) -> tensor<1x32x28x28xbf16>
-    %354 = stablehlo.convert %353 : (tensor<1x32x28x28xbf16>) -> tensor<1x32x28x28xf32>
-    %355 = stablehlo.broadcast_in_dim %354, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %356 = stablehlo.broadcast_in_dim %arg121, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %357 = stablehlo.subtract %355, %356 : tensor<1x32x28x28xf32>
-    %358 = stablehlo.broadcast_in_dim %357, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %359 = stablehlo.broadcast_in_dim %arg122, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %360 = stablehlo.multiply %358, %359 : tensor<1x32x28x28xf32>
-    %361 = stablehlo.convert %arg123 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %362 = stablehlo.broadcast_in_dim %360, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %363 = stablehlo.broadcast_in_dim %361, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %364 = stablehlo.multiply %362, %363 : tensor<1x32x28x28xf32>
-    %365 = stablehlo.convert %arg124 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %366 = stablehlo.broadcast_in_dim %364, dims = [0, 1, 2, 3] : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xf32>
-    %367 = stablehlo.broadcast_in_dim %365, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x28x28xf32>
-    %368 = stablehlo.add %366, %367 : tensor<1x32x28x28xf32>
-    %369 = stablehlo.convert %368 : (tensor<1x32x28x28xf32>) -> tensor<1x32x28x28xbf16>
-    %370 = stablehlo.add %310, %369 : tensor<1x32x28x28xbf16>
-    %371 = stablehlo.convolution(%370, %arg19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x28x28xbf16>, tensor<192x32x1x1xbf16>) -> tensor<1x192x28x28xbf16>
-    %372 = stablehlo.convert %371 : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xf32>
-    %373 = stablehlo.broadcast_in_dim %372, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %374 = stablehlo.broadcast_in_dim %arg125, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %375 = stablehlo.subtract %373, %374 : tensor<1x192x28x28xf32>
-    %376 = stablehlo.broadcast_in_dim %375, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %377 = stablehlo.broadcast_in_dim %arg126, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %378 = stablehlo.multiply %376, %377 : tensor<1x192x28x28xf32>
-    %379 = stablehlo.convert %arg127 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %380 = stablehlo.broadcast_in_dim %378, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %381 = stablehlo.broadcast_in_dim %379, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %382 = stablehlo.multiply %380, %381 : tensor<1x192x28x28xf32>
-    %383 = stablehlo.convert %arg128 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %384 = stablehlo.broadcast_in_dim %382, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xf32>
-    %385 = stablehlo.broadcast_in_dim %383, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x28x28xf32>
-    %386 = stablehlo.add %384, %385 : tensor<1x192x28x28xf32>
-    %387 = stablehlo.convert %386 : (tensor<1x192x28x28xf32>) -> tensor<1x192x28x28xbf16>
-    %388 = stablehlo.broadcast_in_dim %387, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %389 = stablehlo.maximum %388, %267 : tensor<1x192x28x28xbf16>
-    %390 = stablehlo.broadcast_in_dim %389, dims = [0, 1, 2, 3] : (tensor<1x192x28x28xbf16>) -> tensor<1x192x28x28xbf16>
-    %391 = stablehlo.minimum %269, %390 : tensor<1x192x28x28xbf16>
-    %392 = stablehlo.convolution(%391, %arg20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 192 : i64} : (tensor<1x192x28x28xbf16>, tensor<192x1x3x3xbf16>) -> tensor<1x192x14x14xbf16>
-    %393 = stablehlo.convert %392 : (tensor<1x192x14x14xbf16>) -> tensor<1x192x14x14xf32>
-    %394 = stablehlo.broadcast_in_dim %393, dims = [0, 1, 2, 3] : (tensor<1x192x14x14xf32>) -> tensor<1x192x14x14xf32>
-    %395 = stablehlo.broadcast_in_dim %arg129, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x14x14xf32>
-    %396 = stablehlo.subtract %394, %395 : tensor<1x192x14x14xf32>
-    %397 = stablehlo.broadcast_in_dim %396, dims = [0, 1, 2, 3] : (tensor<1x192x14x14xf32>) -> tensor<1x192x14x14xf32>
-    %398 = stablehlo.broadcast_in_dim %arg130, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x14x14xf32>
-    %399 = stablehlo.multiply %397, %398 : tensor<1x192x14x14xf32>
-    %400 = stablehlo.convert %arg131 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %401 = stablehlo.broadcast_in_dim %399, dims = [0, 1, 2, 3] : (tensor<1x192x14x14xf32>) -> tensor<1x192x14x14xf32>
-    %402 = stablehlo.broadcast_in_dim %400, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x14x14xf32>
-    %403 = stablehlo.multiply %401, %402 : tensor<1x192x14x14xf32>
-    %404 = stablehlo.convert %arg132 : (tensor<192x1x1xbf16>) -> tensor<192x1x1xf32>
-    %405 = stablehlo.broadcast_in_dim %403, dims = [0, 1, 2, 3] : (tensor<1x192x14x14xf32>) -> tensor<1x192x14x14xf32>
-    %406 = stablehlo.broadcast_in_dim %404, dims = [1, 2, 3] : (tensor<192x1x1xf32>) -> tensor<1x192x14x14xf32>
-    %407 = stablehlo.add %405, %406 : tensor<1x192x14x14xf32>
-    %408 = stablehlo.convert %407 : (tensor<1x192x14x14xf32>) -> tensor<1x192x14x14xbf16>
-    %409 = stablehlo.broadcast_in_dim %408, dims = [0, 1, 2, 3] : (tensor<1x192x14x14xbf16>) -> tensor<1x192x14x14xbf16>
-    %410 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x192x14x14xbf16>
-    %411 = stablehlo.maximum %409, %410 : tensor<1x192x14x14xbf16>
-    %412 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x192x14x14xbf16>
-    %413 = stablehlo.broadcast_in_dim %411, dims = [0, 1, 2, 3] : (tensor<1x192x14x14xbf16>) -> tensor<1x192x14x14xbf16>
-    %414 = stablehlo.minimum %412, %413 : tensor<1x192x14x14xbf16>
-    %415 = stablehlo.convolution(%414, %arg21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x192x14x14xbf16>, tensor<64x192x1x1xbf16>) -> tensor<1x64x14x14xbf16>
-    %416 = stablehlo.convert %415 : (tensor<1x64x14x14xbf16>) -> tensor<1x64x14x14xf32>
-    %417 = stablehlo.broadcast_in_dim %416, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %418 = stablehlo.broadcast_in_dim %arg133, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %419 = stablehlo.subtract %417, %418 : tensor<1x64x14x14xf32>
-    %420 = stablehlo.broadcast_in_dim %419, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %421 = stablehlo.broadcast_in_dim %arg134, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %422 = stablehlo.multiply %420, %421 : tensor<1x64x14x14xf32>
-    %423 = stablehlo.convert %arg135 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %424 = stablehlo.broadcast_in_dim %422, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %425 = stablehlo.broadcast_in_dim %423, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %426 = stablehlo.multiply %424, %425 : tensor<1x64x14x14xf32>
-    %427 = stablehlo.convert %arg136 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %428 = stablehlo.broadcast_in_dim %426, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %429 = stablehlo.broadcast_in_dim %427, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %430 = stablehlo.add %428, %429 : tensor<1x64x14x14xf32>
-    %431 = stablehlo.convert %430 : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xbf16>
-    %432 = stablehlo.convolution(%431, %arg22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x14x14xbf16>, tensor<384x64x1x1xbf16>) -> tensor<1x384x14x14xbf16>
-    %433 = stablehlo.convert %432 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %434 = stablehlo.broadcast_in_dim %433, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %435 = stablehlo.broadcast_in_dim %arg137, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %436 = stablehlo.subtract %434, %435 : tensor<1x384x14x14xf32>
-    %437 = stablehlo.broadcast_in_dim %436, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %438 = stablehlo.broadcast_in_dim %arg138, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %439 = stablehlo.multiply %437, %438 : tensor<1x384x14x14xf32>
-    %440 = stablehlo.convert %arg139 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %441 = stablehlo.broadcast_in_dim %439, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %442 = stablehlo.broadcast_in_dim %440, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %443 = stablehlo.multiply %441, %442 : tensor<1x384x14x14xf32>
-    %444 = stablehlo.convert %arg140 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %445 = stablehlo.broadcast_in_dim %443, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %446 = stablehlo.broadcast_in_dim %444, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %447 = stablehlo.add %445, %446 : tensor<1x384x14x14xf32>
-    %448 = stablehlo.convert %447 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %449 = stablehlo.broadcast_in_dim %448, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %450 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x384x14x14xbf16>
-    %451 = stablehlo.maximum %449, %450 : tensor<1x384x14x14xbf16>
-    %452 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x384x14x14xbf16>
-    %453 = stablehlo.broadcast_in_dim %451, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %454 = stablehlo.minimum %452, %453 : tensor<1x384x14x14xbf16>
-    %455 = stablehlo.convolution(%454, %arg23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 384 : i64} : (tensor<1x384x14x14xbf16>, tensor<384x1x3x3xbf16>) -> tensor<1x384x14x14xbf16>
-    %456 = stablehlo.convert %455 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %457 = stablehlo.broadcast_in_dim %456, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %458 = stablehlo.broadcast_in_dim %arg141, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %459 = stablehlo.subtract %457, %458 : tensor<1x384x14x14xf32>
-    %460 = stablehlo.broadcast_in_dim %459, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %461 = stablehlo.broadcast_in_dim %arg142, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %462 = stablehlo.multiply %460, %461 : tensor<1x384x14x14xf32>
-    %463 = stablehlo.convert %arg143 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %464 = stablehlo.broadcast_in_dim %462, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %465 = stablehlo.broadcast_in_dim %463, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %466 = stablehlo.multiply %464, %465 : tensor<1x384x14x14xf32>
-    %467 = stablehlo.convert %arg144 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %468 = stablehlo.broadcast_in_dim %466, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %469 = stablehlo.broadcast_in_dim %467, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %470 = stablehlo.add %468, %469 : tensor<1x384x14x14xf32>
-    %471 = stablehlo.convert %470 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %472 = stablehlo.broadcast_in_dim %471, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %473 = stablehlo.maximum %472, %450 : tensor<1x384x14x14xbf16>
-    %474 = stablehlo.broadcast_in_dim %473, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %475 = stablehlo.minimum %452, %474 : tensor<1x384x14x14xbf16>
-    %476 = stablehlo.convolution(%475, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x384x14x14xbf16>, tensor<64x384x1x1xbf16>) -> tensor<1x64x14x14xbf16>
-    %477 = stablehlo.convert %476 : (tensor<1x64x14x14xbf16>) -> tensor<1x64x14x14xf32>
-    %478 = stablehlo.broadcast_in_dim %477, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %479 = stablehlo.broadcast_in_dim %arg145, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %480 = stablehlo.subtract %478, %479 : tensor<1x64x14x14xf32>
-    %481 = stablehlo.broadcast_in_dim %480, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %482 = stablehlo.broadcast_in_dim %arg146, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %483 = stablehlo.multiply %481, %482 : tensor<1x64x14x14xf32>
-    %484 = stablehlo.convert %arg147 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %485 = stablehlo.broadcast_in_dim %483, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %486 = stablehlo.broadcast_in_dim %484, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %487 = stablehlo.multiply %485, %486 : tensor<1x64x14x14xf32>
-    %488 = stablehlo.convert %arg148 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %489 = stablehlo.broadcast_in_dim %487, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %490 = stablehlo.broadcast_in_dim %488, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %491 = stablehlo.add %489, %490 : tensor<1x64x14x14xf32>
-    %492 = stablehlo.convert %491 : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xbf16>
-    %493 = stablehlo.add %431, %492 : tensor<1x64x14x14xbf16>
-    %494 = stablehlo.convolution(%493, %arg25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x14x14xbf16>, tensor<384x64x1x1xbf16>) -> tensor<1x384x14x14xbf16>
-    %495 = stablehlo.convert %494 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %496 = stablehlo.broadcast_in_dim %495, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %497 = stablehlo.broadcast_in_dim %arg149, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %498 = stablehlo.subtract %496, %497 : tensor<1x384x14x14xf32>
-    %499 = stablehlo.broadcast_in_dim %498, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %500 = stablehlo.broadcast_in_dim %arg150, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %501 = stablehlo.multiply %499, %500 : tensor<1x384x14x14xf32>
-    %502 = stablehlo.convert %arg151 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %503 = stablehlo.broadcast_in_dim %501, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %504 = stablehlo.broadcast_in_dim %502, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %505 = stablehlo.multiply %503, %504 : tensor<1x384x14x14xf32>
-    %506 = stablehlo.convert %arg152 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %507 = stablehlo.broadcast_in_dim %505, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %508 = stablehlo.broadcast_in_dim %506, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %509 = stablehlo.add %507, %508 : tensor<1x384x14x14xf32>
-    %510 = stablehlo.convert %509 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %511 = stablehlo.broadcast_in_dim %510, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %512 = stablehlo.maximum %511, %450 : tensor<1x384x14x14xbf16>
-    %513 = stablehlo.broadcast_in_dim %512, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %514 = stablehlo.minimum %452, %513 : tensor<1x384x14x14xbf16>
-    %515 = stablehlo.convolution(%514, %arg26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 384 : i64} : (tensor<1x384x14x14xbf16>, tensor<384x1x3x3xbf16>) -> tensor<1x384x14x14xbf16>
-    %516 = stablehlo.convert %515 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %517 = stablehlo.broadcast_in_dim %516, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %518 = stablehlo.broadcast_in_dim %arg153, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %519 = stablehlo.subtract %517, %518 : tensor<1x384x14x14xf32>
-    %520 = stablehlo.broadcast_in_dim %519, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %521 = stablehlo.broadcast_in_dim %arg154, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %522 = stablehlo.multiply %520, %521 : tensor<1x384x14x14xf32>
-    %523 = stablehlo.convert %arg155 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %524 = stablehlo.broadcast_in_dim %522, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %525 = stablehlo.broadcast_in_dim %523, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %526 = stablehlo.multiply %524, %525 : tensor<1x384x14x14xf32>
-    %527 = stablehlo.convert %arg156 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %528 = stablehlo.broadcast_in_dim %526, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %529 = stablehlo.broadcast_in_dim %527, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %530 = stablehlo.add %528, %529 : tensor<1x384x14x14xf32>
-    %531 = stablehlo.convert %530 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %532 = stablehlo.broadcast_in_dim %531, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %533 = stablehlo.maximum %532, %450 : tensor<1x384x14x14xbf16>
-    %534 = stablehlo.broadcast_in_dim %533, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %535 = stablehlo.minimum %452, %534 : tensor<1x384x14x14xbf16>
-    %536 = stablehlo.convolution(%535, %arg27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x384x14x14xbf16>, tensor<64x384x1x1xbf16>) -> tensor<1x64x14x14xbf16>
-    %537 = stablehlo.convert %536 : (tensor<1x64x14x14xbf16>) -> tensor<1x64x14x14xf32>
-    %538 = stablehlo.broadcast_in_dim %537, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %539 = stablehlo.broadcast_in_dim %arg157, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %540 = stablehlo.subtract %538, %539 : tensor<1x64x14x14xf32>
-    %541 = stablehlo.broadcast_in_dim %540, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %542 = stablehlo.broadcast_in_dim %arg158, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %543 = stablehlo.multiply %541, %542 : tensor<1x64x14x14xf32>
-    %544 = stablehlo.convert %arg159 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %545 = stablehlo.broadcast_in_dim %543, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %546 = stablehlo.broadcast_in_dim %544, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %547 = stablehlo.multiply %545, %546 : tensor<1x64x14x14xf32>
-    %548 = stablehlo.convert %arg160 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %549 = stablehlo.broadcast_in_dim %547, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %550 = stablehlo.broadcast_in_dim %548, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %551 = stablehlo.add %549, %550 : tensor<1x64x14x14xf32>
-    %552 = stablehlo.convert %551 : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xbf16>
-    %553 = stablehlo.add %493, %552 : tensor<1x64x14x14xbf16>
-    %554 = stablehlo.convolution(%553, %arg28) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x14x14xbf16>, tensor<384x64x1x1xbf16>) -> tensor<1x384x14x14xbf16>
-    %555 = stablehlo.convert %554 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %556 = stablehlo.broadcast_in_dim %555, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %557 = stablehlo.broadcast_in_dim %arg161, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %558 = stablehlo.subtract %556, %557 : tensor<1x384x14x14xf32>
-    %559 = stablehlo.broadcast_in_dim %558, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %560 = stablehlo.broadcast_in_dim %arg162, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %561 = stablehlo.multiply %559, %560 : tensor<1x384x14x14xf32>
-    %562 = stablehlo.convert %arg163 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %563 = stablehlo.broadcast_in_dim %561, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %564 = stablehlo.broadcast_in_dim %562, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %565 = stablehlo.multiply %563, %564 : tensor<1x384x14x14xf32>
-    %566 = stablehlo.convert %arg164 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %567 = stablehlo.broadcast_in_dim %565, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %568 = stablehlo.broadcast_in_dim %566, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %569 = stablehlo.add %567, %568 : tensor<1x384x14x14xf32>
-    %570 = stablehlo.convert %569 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %571 = stablehlo.broadcast_in_dim %570, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %572 = stablehlo.maximum %571, %450 : tensor<1x384x14x14xbf16>
-    %573 = stablehlo.broadcast_in_dim %572, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %574 = stablehlo.minimum %452, %573 : tensor<1x384x14x14xbf16>
-    %575 = stablehlo.convolution(%574, %arg29) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 384 : i64} : (tensor<1x384x14x14xbf16>, tensor<384x1x3x3xbf16>) -> tensor<1x384x14x14xbf16>
-    %576 = stablehlo.convert %575 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %577 = stablehlo.broadcast_in_dim %576, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %578 = stablehlo.broadcast_in_dim %arg165, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %579 = stablehlo.subtract %577, %578 : tensor<1x384x14x14xf32>
-    %580 = stablehlo.broadcast_in_dim %579, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %581 = stablehlo.broadcast_in_dim %arg166, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %582 = stablehlo.multiply %580, %581 : tensor<1x384x14x14xf32>
-    %583 = stablehlo.convert %arg167 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %584 = stablehlo.broadcast_in_dim %582, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %585 = stablehlo.broadcast_in_dim %583, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %586 = stablehlo.multiply %584, %585 : tensor<1x384x14x14xf32>
-    %587 = stablehlo.convert %arg168 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %588 = stablehlo.broadcast_in_dim %586, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %589 = stablehlo.broadcast_in_dim %587, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %590 = stablehlo.add %588, %589 : tensor<1x384x14x14xf32>
-    %591 = stablehlo.convert %590 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %592 = stablehlo.broadcast_in_dim %591, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %593 = stablehlo.maximum %592, %450 : tensor<1x384x14x14xbf16>
-    %594 = stablehlo.broadcast_in_dim %593, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %595 = stablehlo.minimum %452, %594 : tensor<1x384x14x14xbf16>
-    %596 = stablehlo.convolution(%595, %arg30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x384x14x14xbf16>, tensor<64x384x1x1xbf16>) -> tensor<1x64x14x14xbf16>
-    %597 = stablehlo.convert %596 : (tensor<1x64x14x14xbf16>) -> tensor<1x64x14x14xf32>
-    %598 = stablehlo.broadcast_in_dim %597, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %599 = stablehlo.broadcast_in_dim %arg169, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %600 = stablehlo.subtract %598, %599 : tensor<1x64x14x14xf32>
-    %601 = stablehlo.broadcast_in_dim %600, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %602 = stablehlo.broadcast_in_dim %arg170, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %603 = stablehlo.multiply %601, %602 : tensor<1x64x14x14xf32>
-    %604 = stablehlo.convert %arg171 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %605 = stablehlo.broadcast_in_dim %603, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %606 = stablehlo.broadcast_in_dim %604, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %607 = stablehlo.multiply %605, %606 : tensor<1x64x14x14xf32>
-    %608 = stablehlo.convert %arg172 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %609 = stablehlo.broadcast_in_dim %607, dims = [0, 1, 2, 3] : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xf32>
-    %610 = stablehlo.broadcast_in_dim %608, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x14x14xf32>
-    %611 = stablehlo.add %609, %610 : tensor<1x64x14x14xf32>
-    %612 = stablehlo.convert %611 : (tensor<1x64x14x14xf32>) -> tensor<1x64x14x14xbf16>
-    %613 = stablehlo.add %553, %612 : tensor<1x64x14x14xbf16>
-    %614 = stablehlo.convolution(%613, %arg31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x14x14xbf16>, tensor<384x64x1x1xbf16>) -> tensor<1x384x14x14xbf16>
-    %615 = stablehlo.convert %614 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %616 = stablehlo.broadcast_in_dim %615, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %617 = stablehlo.broadcast_in_dim %arg173, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %618 = stablehlo.subtract %616, %617 : tensor<1x384x14x14xf32>
-    %619 = stablehlo.broadcast_in_dim %618, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %620 = stablehlo.broadcast_in_dim %arg174, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %621 = stablehlo.multiply %619, %620 : tensor<1x384x14x14xf32>
-    %622 = stablehlo.convert %arg175 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %623 = stablehlo.broadcast_in_dim %621, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %624 = stablehlo.broadcast_in_dim %622, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %625 = stablehlo.multiply %623, %624 : tensor<1x384x14x14xf32>
-    %626 = stablehlo.convert %arg176 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %627 = stablehlo.broadcast_in_dim %625, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %628 = stablehlo.broadcast_in_dim %626, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %629 = stablehlo.add %627, %628 : tensor<1x384x14x14xf32>
-    %630 = stablehlo.convert %629 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %631 = stablehlo.broadcast_in_dim %630, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %632 = stablehlo.maximum %631, %450 : tensor<1x384x14x14xbf16>
-    %633 = stablehlo.broadcast_in_dim %632, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %634 = stablehlo.minimum %452, %633 : tensor<1x384x14x14xbf16>
-    %635 = stablehlo.convolution(%634, %arg32) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 384 : i64} : (tensor<1x384x14x14xbf16>, tensor<384x1x3x3xbf16>) -> tensor<1x384x14x14xbf16>
-    %636 = stablehlo.convert %635 : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xf32>
-    %637 = stablehlo.broadcast_in_dim %636, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %638 = stablehlo.broadcast_in_dim %arg177, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %639 = stablehlo.subtract %637, %638 : tensor<1x384x14x14xf32>
-    %640 = stablehlo.broadcast_in_dim %639, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %641 = stablehlo.broadcast_in_dim %arg178, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %642 = stablehlo.multiply %640, %641 : tensor<1x384x14x14xf32>
-    %643 = stablehlo.convert %arg179 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %644 = stablehlo.broadcast_in_dim %642, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %645 = stablehlo.broadcast_in_dim %643, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %646 = stablehlo.multiply %644, %645 : tensor<1x384x14x14xf32>
-    %647 = stablehlo.convert %arg180 : (tensor<384x1x1xbf16>) -> tensor<384x1x1xf32>
-    %648 = stablehlo.broadcast_in_dim %646, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xf32>
-    %649 = stablehlo.broadcast_in_dim %647, dims = [1, 2, 3] : (tensor<384x1x1xf32>) -> tensor<1x384x14x14xf32>
-    %650 = stablehlo.add %648, %649 : tensor<1x384x14x14xf32>
-    %651 = stablehlo.convert %650 : (tensor<1x384x14x14xf32>) -> tensor<1x384x14x14xbf16>
-    %652 = stablehlo.broadcast_in_dim %651, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %653 = stablehlo.maximum %652, %450 : tensor<1x384x14x14xbf16>
-    %654 = stablehlo.broadcast_in_dim %653, dims = [0, 1, 2, 3] : (tensor<1x384x14x14xbf16>) -> tensor<1x384x14x14xbf16>
-    %655 = stablehlo.minimum %452, %654 : tensor<1x384x14x14xbf16>
-    %656 = stablehlo.convolution(%655, %arg33) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x384x14x14xbf16>, tensor<96x384x1x1xbf16>) -> tensor<1x96x14x14xbf16>
-    %657 = stablehlo.convert %656 : (tensor<1x96x14x14xbf16>) -> tensor<1x96x14x14xf32>
-    %658 = stablehlo.broadcast_in_dim %657, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %659 = stablehlo.broadcast_in_dim %arg181, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %660 = stablehlo.subtract %658, %659 : tensor<1x96x14x14xf32>
-    %661 = stablehlo.broadcast_in_dim %660, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %662 = stablehlo.broadcast_in_dim %arg182, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %663 = stablehlo.multiply %661, %662 : tensor<1x96x14x14xf32>
-    %664 = stablehlo.convert %arg183 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %665 = stablehlo.broadcast_in_dim %663, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %666 = stablehlo.broadcast_in_dim %664, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %667 = stablehlo.multiply %665, %666 : tensor<1x96x14x14xf32>
-    %668 = stablehlo.convert %arg184 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %669 = stablehlo.broadcast_in_dim %667, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %670 = stablehlo.broadcast_in_dim %668, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %671 = stablehlo.add %669, %670 : tensor<1x96x14x14xf32>
-    %672 = stablehlo.convert %671 : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xbf16>
-    %673 = stablehlo.convolution(%672, %arg34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x96x14x14xbf16>, tensor<576x96x1x1xbf16>) -> tensor<1x576x14x14xbf16>
-    %674 = stablehlo.convert %673 : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xf32>
-    %675 = stablehlo.broadcast_in_dim %674, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %676 = stablehlo.broadcast_in_dim %arg185, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %677 = stablehlo.subtract %675, %676 : tensor<1x576x14x14xf32>
-    %678 = stablehlo.broadcast_in_dim %677, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %679 = stablehlo.broadcast_in_dim %arg186, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %680 = stablehlo.multiply %678, %679 : tensor<1x576x14x14xf32>
-    %681 = stablehlo.convert %arg187 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %682 = stablehlo.broadcast_in_dim %680, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %683 = stablehlo.broadcast_in_dim %681, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %684 = stablehlo.multiply %682, %683 : tensor<1x576x14x14xf32>
-    %685 = stablehlo.convert %arg188 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %686 = stablehlo.broadcast_in_dim %684, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %687 = stablehlo.broadcast_in_dim %685, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %688 = stablehlo.add %686, %687 : tensor<1x576x14x14xf32>
-    %689 = stablehlo.convert %688 : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xbf16>
-    %690 = stablehlo.broadcast_in_dim %689, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %691 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x576x14x14xbf16>
-    %692 = stablehlo.maximum %690, %691 : tensor<1x576x14x14xbf16>
-    %693 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x576x14x14xbf16>
-    %694 = stablehlo.broadcast_in_dim %692, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %695 = stablehlo.minimum %693, %694 : tensor<1x576x14x14xbf16>
-    %696 = stablehlo.convolution(%695, %arg35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 576 : i64} : (tensor<1x576x14x14xbf16>, tensor<576x1x3x3xbf16>) -> tensor<1x576x14x14xbf16>
-    %697 = stablehlo.convert %696 : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xf32>
-    %698 = stablehlo.broadcast_in_dim %697, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %699 = stablehlo.broadcast_in_dim %arg189, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %700 = stablehlo.subtract %698, %699 : tensor<1x576x14x14xf32>
-    %701 = stablehlo.broadcast_in_dim %700, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %702 = stablehlo.broadcast_in_dim %arg190, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %703 = stablehlo.multiply %701, %702 : tensor<1x576x14x14xf32>
-    %704 = stablehlo.convert %arg191 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %705 = stablehlo.broadcast_in_dim %703, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %706 = stablehlo.broadcast_in_dim %704, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %707 = stablehlo.multiply %705, %706 : tensor<1x576x14x14xf32>
-    %708 = stablehlo.convert %arg192 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %709 = stablehlo.broadcast_in_dim %707, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %710 = stablehlo.broadcast_in_dim %708, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %711 = stablehlo.add %709, %710 : tensor<1x576x14x14xf32>
-    %712 = stablehlo.convert %711 : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xbf16>
-    %713 = stablehlo.broadcast_in_dim %712, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %714 = stablehlo.maximum %713, %691 : tensor<1x576x14x14xbf16>
-    %715 = stablehlo.broadcast_in_dim %714, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %716 = stablehlo.minimum %693, %715 : tensor<1x576x14x14xbf16>
-    %717 = stablehlo.convolution(%716, %arg36) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x576x14x14xbf16>, tensor<96x576x1x1xbf16>) -> tensor<1x96x14x14xbf16>
-    %718 = stablehlo.convert %717 : (tensor<1x96x14x14xbf16>) -> tensor<1x96x14x14xf32>
-    %719 = stablehlo.broadcast_in_dim %718, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %720 = stablehlo.broadcast_in_dim %arg193, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %721 = stablehlo.subtract %719, %720 : tensor<1x96x14x14xf32>
-    %722 = stablehlo.broadcast_in_dim %721, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %723 = stablehlo.broadcast_in_dim %arg194, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %724 = stablehlo.multiply %722, %723 : tensor<1x96x14x14xf32>
-    %725 = stablehlo.convert %arg195 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %726 = stablehlo.broadcast_in_dim %724, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %727 = stablehlo.broadcast_in_dim %725, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %728 = stablehlo.multiply %726, %727 : tensor<1x96x14x14xf32>
-    %729 = stablehlo.convert %arg196 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %730 = stablehlo.broadcast_in_dim %728, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %731 = stablehlo.broadcast_in_dim %729, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %732 = stablehlo.add %730, %731 : tensor<1x96x14x14xf32>
-    %733 = stablehlo.convert %732 : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xbf16>
-    %734 = stablehlo.add %672, %733 : tensor<1x96x14x14xbf16>
-    %735 = stablehlo.convolution(%734, %arg37) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x96x14x14xbf16>, tensor<576x96x1x1xbf16>) -> tensor<1x576x14x14xbf16>
-    %736 = stablehlo.convert %735 : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xf32>
-    %737 = stablehlo.broadcast_in_dim %736, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %738 = stablehlo.broadcast_in_dim %arg197, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %739 = stablehlo.subtract %737, %738 : tensor<1x576x14x14xf32>
-    %740 = stablehlo.broadcast_in_dim %739, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %741 = stablehlo.broadcast_in_dim %arg198, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %742 = stablehlo.multiply %740, %741 : tensor<1x576x14x14xf32>
-    %743 = stablehlo.convert %arg199 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %744 = stablehlo.broadcast_in_dim %742, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %745 = stablehlo.broadcast_in_dim %743, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %746 = stablehlo.multiply %744, %745 : tensor<1x576x14x14xf32>
-    %747 = stablehlo.convert %arg200 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %748 = stablehlo.broadcast_in_dim %746, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %749 = stablehlo.broadcast_in_dim %747, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %750 = stablehlo.add %748, %749 : tensor<1x576x14x14xf32>
-    %751 = stablehlo.convert %750 : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xbf16>
-    %752 = stablehlo.broadcast_in_dim %751, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %753 = stablehlo.maximum %752, %691 : tensor<1x576x14x14xbf16>
-    %754 = stablehlo.broadcast_in_dim %753, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %755 = stablehlo.minimum %693, %754 : tensor<1x576x14x14xbf16>
-    %756 = stablehlo.convolution(%755, %arg38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 576 : i64} : (tensor<1x576x14x14xbf16>, tensor<576x1x3x3xbf16>) -> tensor<1x576x14x14xbf16>
-    %757 = stablehlo.convert %756 : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xf32>
-    %758 = stablehlo.broadcast_in_dim %757, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %759 = stablehlo.broadcast_in_dim %arg201, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %760 = stablehlo.subtract %758, %759 : tensor<1x576x14x14xf32>
-    %761 = stablehlo.broadcast_in_dim %760, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %762 = stablehlo.broadcast_in_dim %arg202, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %763 = stablehlo.multiply %761, %762 : tensor<1x576x14x14xf32>
-    %764 = stablehlo.convert %arg203 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %765 = stablehlo.broadcast_in_dim %763, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %766 = stablehlo.broadcast_in_dim %764, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %767 = stablehlo.multiply %765, %766 : tensor<1x576x14x14xf32>
-    %768 = stablehlo.convert %arg204 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %769 = stablehlo.broadcast_in_dim %767, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %770 = stablehlo.broadcast_in_dim %768, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %771 = stablehlo.add %769, %770 : tensor<1x576x14x14xf32>
-    %772 = stablehlo.convert %771 : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xbf16>
-    %773 = stablehlo.broadcast_in_dim %772, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %774 = stablehlo.maximum %773, %691 : tensor<1x576x14x14xbf16>
-    %775 = stablehlo.broadcast_in_dim %774, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %776 = stablehlo.minimum %693, %775 : tensor<1x576x14x14xbf16>
-    %777 = stablehlo.convolution(%776, %arg39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x576x14x14xbf16>, tensor<96x576x1x1xbf16>) -> tensor<1x96x14x14xbf16>
-    %778 = stablehlo.convert %777 : (tensor<1x96x14x14xbf16>) -> tensor<1x96x14x14xf32>
-    %779 = stablehlo.broadcast_in_dim %778, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %780 = stablehlo.broadcast_in_dim %arg205, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %781 = stablehlo.subtract %779, %780 : tensor<1x96x14x14xf32>
-    %782 = stablehlo.broadcast_in_dim %781, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %783 = stablehlo.broadcast_in_dim %arg206, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %784 = stablehlo.multiply %782, %783 : tensor<1x96x14x14xf32>
-    %785 = stablehlo.convert %arg207 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %786 = stablehlo.broadcast_in_dim %784, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %787 = stablehlo.broadcast_in_dim %785, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %788 = stablehlo.multiply %786, %787 : tensor<1x96x14x14xf32>
-    %789 = stablehlo.convert %arg208 : (tensor<96x1x1xbf16>) -> tensor<96x1x1xf32>
-    %790 = stablehlo.broadcast_in_dim %788, dims = [0, 1, 2, 3] : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xf32>
-    %791 = stablehlo.broadcast_in_dim %789, dims = [1, 2, 3] : (tensor<96x1x1xf32>) -> tensor<1x96x14x14xf32>
-    %792 = stablehlo.add %790, %791 : tensor<1x96x14x14xf32>
-    %793 = stablehlo.convert %792 : (tensor<1x96x14x14xf32>) -> tensor<1x96x14x14xbf16>
-    %794 = stablehlo.add %734, %793 : tensor<1x96x14x14xbf16>
-    %795 = stablehlo.convolution(%794, %arg40) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x96x14x14xbf16>, tensor<576x96x1x1xbf16>) -> tensor<1x576x14x14xbf16>
-    %796 = stablehlo.convert %795 : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xf32>
-    %797 = stablehlo.broadcast_in_dim %796, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %798 = stablehlo.broadcast_in_dim %arg209, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %799 = stablehlo.subtract %797, %798 : tensor<1x576x14x14xf32>
-    %800 = stablehlo.broadcast_in_dim %799, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %801 = stablehlo.broadcast_in_dim %arg210, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %802 = stablehlo.multiply %800, %801 : tensor<1x576x14x14xf32>
-    %803 = stablehlo.convert %arg211 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %804 = stablehlo.broadcast_in_dim %802, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %805 = stablehlo.broadcast_in_dim %803, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %806 = stablehlo.multiply %804, %805 : tensor<1x576x14x14xf32>
-    %807 = stablehlo.convert %arg212 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %808 = stablehlo.broadcast_in_dim %806, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xf32>
-    %809 = stablehlo.broadcast_in_dim %807, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x14x14xf32>
-    %810 = stablehlo.add %808, %809 : tensor<1x576x14x14xf32>
-    %811 = stablehlo.convert %810 : (tensor<1x576x14x14xf32>) -> tensor<1x576x14x14xbf16>
-    %812 = stablehlo.broadcast_in_dim %811, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %813 = stablehlo.maximum %812, %691 : tensor<1x576x14x14xbf16>
-    %814 = stablehlo.broadcast_in_dim %813, dims = [0, 1, 2, 3] : (tensor<1x576x14x14xbf16>) -> tensor<1x576x14x14xbf16>
-    %815 = stablehlo.minimum %693, %814 : tensor<1x576x14x14xbf16>
-    %816 = stablehlo.convolution(%815, %arg41) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 576 : i64} : (tensor<1x576x14x14xbf16>, tensor<576x1x3x3xbf16>) -> tensor<1x576x7x7xbf16>
-    %817 = stablehlo.convert %816 : (tensor<1x576x7x7xbf16>) -> tensor<1x576x7x7xf32>
-    %818 = stablehlo.broadcast_in_dim %817, dims = [0, 1, 2, 3] : (tensor<1x576x7x7xf32>) -> tensor<1x576x7x7xf32>
-    %819 = stablehlo.broadcast_in_dim %arg213, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x7x7xf32>
-    %820 = stablehlo.subtract %818, %819 : tensor<1x576x7x7xf32>
-    %821 = stablehlo.broadcast_in_dim %820, dims = [0, 1, 2, 3] : (tensor<1x576x7x7xf32>) -> tensor<1x576x7x7xf32>
-    %822 = stablehlo.broadcast_in_dim %arg214, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x7x7xf32>
-    %823 = stablehlo.multiply %821, %822 : tensor<1x576x7x7xf32>
-    %824 = stablehlo.convert %arg215 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %825 = stablehlo.broadcast_in_dim %823, dims = [0, 1, 2, 3] : (tensor<1x576x7x7xf32>) -> tensor<1x576x7x7xf32>
-    %826 = stablehlo.broadcast_in_dim %824, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x7x7xf32>
-    %827 = stablehlo.multiply %825, %826 : tensor<1x576x7x7xf32>
-    %828 = stablehlo.convert %arg216 : (tensor<576x1x1xbf16>) -> tensor<576x1x1xf32>
-    %829 = stablehlo.broadcast_in_dim %827, dims = [0, 1, 2, 3] : (tensor<1x576x7x7xf32>) -> tensor<1x576x7x7xf32>
-    %830 = stablehlo.broadcast_in_dim %828, dims = [1, 2, 3] : (tensor<576x1x1xf32>) -> tensor<1x576x7x7xf32>
-    %831 = stablehlo.add %829, %830 : tensor<1x576x7x7xf32>
-    %832 = stablehlo.convert %831 : (tensor<1x576x7x7xf32>) -> tensor<1x576x7x7xbf16>
-    %833 = stablehlo.broadcast_in_dim %832, dims = [0, 1, 2, 3] : (tensor<1x576x7x7xbf16>) -> tensor<1x576x7x7xbf16>
-    %834 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x576x7x7xbf16>
-    %835 = stablehlo.maximum %833, %834 : tensor<1x576x7x7xbf16>
-    %836 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x576x7x7xbf16>
-    %837 = stablehlo.broadcast_in_dim %835, dims = [0, 1, 2, 3] : (tensor<1x576x7x7xbf16>) -> tensor<1x576x7x7xbf16>
-    %838 = stablehlo.minimum %836, %837 : tensor<1x576x7x7xbf16>
-    %839 = stablehlo.convolution(%838, %arg42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x576x7x7xbf16>, tensor<160x576x1x1xbf16>) -> tensor<1x160x7x7xbf16>
-    %840 = stablehlo.convert %839 : (tensor<1x160x7x7xbf16>) -> tensor<1x160x7x7xf32>
-    %841 = stablehlo.broadcast_in_dim %840, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %842 = stablehlo.broadcast_in_dim %arg217, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %843 = stablehlo.subtract %841, %842 : tensor<1x160x7x7xf32>
-    %844 = stablehlo.broadcast_in_dim %843, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %845 = stablehlo.broadcast_in_dim %arg218, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %846 = stablehlo.multiply %844, %845 : tensor<1x160x7x7xf32>
-    %847 = stablehlo.convert %arg219 : (tensor<160x1x1xbf16>) -> tensor<160x1x1xf32>
-    %848 = stablehlo.broadcast_in_dim %846, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %849 = stablehlo.broadcast_in_dim %847, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %850 = stablehlo.multiply %848, %849 : tensor<1x160x7x7xf32>
-    %851 = stablehlo.convert %arg220 : (tensor<160x1x1xbf16>) -> tensor<160x1x1xf32>
-    %852 = stablehlo.broadcast_in_dim %850, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %853 = stablehlo.broadcast_in_dim %851, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %854 = stablehlo.add %852, %853 : tensor<1x160x7x7xf32>
-    %855 = stablehlo.convert %854 : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xbf16>
-    %856 = stablehlo.convolution(%855, %arg43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x160x7x7xbf16>, tensor<960x160x1x1xbf16>) -> tensor<1x960x7x7xbf16>
-    %857 = stablehlo.convert %856 : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xf32>
-    %858 = stablehlo.broadcast_in_dim %857, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %859 = stablehlo.broadcast_in_dim %arg221, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %860 = stablehlo.subtract %858, %859 : tensor<1x960x7x7xf32>
-    %861 = stablehlo.broadcast_in_dim %860, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %862 = stablehlo.broadcast_in_dim %arg222, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %863 = stablehlo.multiply %861, %862 : tensor<1x960x7x7xf32>
-    %864 = stablehlo.convert %arg223 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %865 = stablehlo.broadcast_in_dim %863, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %866 = stablehlo.broadcast_in_dim %864, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %867 = stablehlo.multiply %865, %866 : tensor<1x960x7x7xf32>
-    %868 = stablehlo.convert %arg224 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %869 = stablehlo.broadcast_in_dim %867, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %870 = stablehlo.broadcast_in_dim %868, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %871 = stablehlo.add %869, %870 : tensor<1x960x7x7xf32>
-    %872 = stablehlo.convert %871 : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xbf16>
-    %873 = stablehlo.broadcast_in_dim %872, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %874 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x960x7x7xbf16>
-    %875 = stablehlo.maximum %873, %874 : tensor<1x960x7x7xbf16>
-    %876 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x960x7x7xbf16>
-    %877 = stablehlo.broadcast_in_dim %875, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %878 = stablehlo.minimum %876, %877 : tensor<1x960x7x7xbf16>
-    %879 = stablehlo.convolution(%878, %arg44) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 960 : i64} : (tensor<1x960x7x7xbf16>, tensor<960x1x3x3xbf16>) -> tensor<1x960x7x7xbf16>
-    %880 = stablehlo.convert %879 : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xf32>
-    %881 = stablehlo.broadcast_in_dim %880, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %882 = stablehlo.broadcast_in_dim %arg225, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %883 = stablehlo.subtract %881, %882 : tensor<1x960x7x7xf32>
-    %884 = stablehlo.broadcast_in_dim %883, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %885 = stablehlo.broadcast_in_dim %arg226, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %886 = stablehlo.multiply %884, %885 : tensor<1x960x7x7xf32>
-    %887 = stablehlo.convert %arg227 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %888 = stablehlo.broadcast_in_dim %886, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %889 = stablehlo.broadcast_in_dim %887, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %890 = stablehlo.multiply %888, %889 : tensor<1x960x7x7xf32>
-    %891 = stablehlo.convert %arg228 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %892 = stablehlo.broadcast_in_dim %890, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %893 = stablehlo.broadcast_in_dim %891, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %894 = stablehlo.add %892, %893 : tensor<1x960x7x7xf32>
-    %895 = stablehlo.convert %894 : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xbf16>
-    %896 = stablehlo.broadcast_in_dim %895, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %897 = stablehlo.maximum %896, %874 : tensor<1x960x7x7xbf16>
-    %898 = stablehlo.broadcast_in_dim %897, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %899 = stablehlo.minimum %876, %898 : tensor<1x960x7x7xbf16>
-    %900 = stablehlo.convolution(%899, %arg45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x960x7x7xbf16>, tensor<160x960x1x1xbf16>) -> tensor<1x160x7x7xbf16>
-    %901 = stablehlo.convert %900 : (tensor<1x160x7x7xbf16>) -> tensor<1x160x7x7xf32>
-    %902 = stablehlo.broadcast_in_dim %901, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %903 = stablehlo.broadcast_in_dim %arg229, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %904 = stablehlo.subtract %902, %903 : tensor<1x160x7x7xf32>
-    %905 = stablehlo.broadcast_in_dim %904, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %906 = stablehlo.broadcast_in_dim %arg230, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %907 = stablehlo.multiply %905, %906 : tensor<1x160x7x7xf32>
-    %908 = stablehlo.convert %arg231 : (tensor<160x1x1xbf16>) -> tensor<160x1x1xf32>
-    %909 = stablehlo.broadcast_in_dim %907, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %910 = stablehlo.broadcast_in_dim %908, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %911 = stablehlo.multiply %909, %910 : tensor<1x160x7x7xf32>
-    %912 = stablehlo.convert %arg232 : (tensor<160x1x1xbf16>) -> tensor<160x1x1xf32>
-    %913 = stablehlo.broadcast_in_dim %911, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %914 = stablehlo.broadcast_in_dim %912, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %915 = stablehlo.add %913, %914 : tensor<1x160x7x7xf32>
-    %916 = stablehlo.convert %915 : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xbf16>
-    %917 = stablehlo.add %855, %916 : tensor<1x160x7x7xbf16>
-    %918 = stablehlo.convolution(%917, %arg46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x160x7x7xbf16>, tensor<960x160x1x1xbf16>) -> tensor<1x960x7x7xbf16>
-    %919 = stablehlo.convert %918 : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xf32>
-    %920 = stablehlo.broadcast_in_dim %919, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %921 = stablehlo.broadcast_in_dim %arg233, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %922 = stablehlo.subtract %920, %921 : tensor<1x960x7x7xf32>
-    %923 = stablehlo.broadcast_in_dim %922, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %924 = stablehlo.broadcast_in_dim %arg234, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %925 = stablehlo.multiply %923, %924 : tensor<1x960x7x7xf32>
-    %926 = stablehlo.convert %arg235 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %927 = stablehlo.broadcast_in_dim %925, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %928 = stablehlo.broadcast_in_dim %926, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %929 = stablehlo.multiply %927, %928 : tensor<1x960x7x7xf32>
-    %930 = stablehlo.convert %arg236 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %931 = stablehlo.broadcast_in_dim %929, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %932 = stablehlo.broadcast_in_dim %930, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %933 = stablehlo.add %931, %932 : tensor<1x960x7x7xf32>
-    %934 = stablehlo.convert %933 : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xbf16>
-    %935 = stablehlo.broadcast_in_dim %934, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %936 = stablehlo.maximum %935, %874 : tensor<1x960x7x7xbf16>
-    %937 = stablehlo.broadcast_in_dim %936, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %938 = stablehlo.minimum %876, %937 : tensor<1x960x7x7xbf16>
-    %939 = stablehlo.convolution(%938, %arg47) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 960 : i64} : (tensor<1x960x7x7xbf16>, tensor<960x1x3x3xbf16>) -> tensor<1x960x7x7xbf16>
-    %940 = stablehlo.convert %939 : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xf32>
-    %941 = stablehlo.broadcast_in_dim %940, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %942 = stablehlo.broadcast_in_dim %arg237, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %943 = stablehlo.subtract %941, %942 : tensor<1x960x7x7xf32>
-    %944 = stablehlo.broadcast_in_dim %943, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %945 = stablehlo.broadcast_in_dim %arg238, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %946 = stablehlo.multiply %944, %945 : tensor<1x960x7x7xf32>
-    %947 = stablehlo.convert %arg239 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %948 = stablehlo.broadcast_in_dim %946, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %949 = stablehlo.broadcast_in_dim %947, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %950 = stablehlo.multiply %948, %949 : tensor<1x960x7x7xf32>
-    %951 = stablehlo.convert %arg240 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %952 = stablehlo.broadcast_in_dim %950, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %953 = stablehlo.broadcast_in_dim %951, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %954 = stablehlo.add %952, %953 : tensor<1x960x7x7xf32>
-    %955 = stablehlo.convert %954 : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xbf16>
-    %956 = stablehlo.broadcast_in_dim %955, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %957 = stablehlo.maximum %956, %874 : tensor<1x960x7x7xbf16>
-    %958 = stablehlo.broadcast_in_dim %957, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %959 = stablehlo.minimum %876, %958 : tensor<1x960x7x7xbf16>
-    %960 = stablehlo.convolution(%959, %arg48) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x960x7x7xbf16>, tensor<160x960x1x1xbf16>) -> tensor<1x160x7x7xbf16>
-    %961 = stablehlo.convert %960 : (tensor<1x160x7x7xbf16>) -> tensor<1x160x7x7xf32>
-    %962 = stablehlo.broadcast_in_dim %961, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %963 = stablehlo.broadcast_in_dim %arg241, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %964 = stablehlo.subtract %962, %963 : tensor<1x160x7x7xf32>
-    %965 = stablehlo.broadcast_in_dim %964, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %966 = stablehlo.broadcast_in_dim %arg242, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %967 = stablehlo.multiply %965, %966 : tensor<1x160x7x7xf32>
-    %968 = stablehlo.convert %arg243 : (tensor<160x1x1xbf16>) -> tensor<160x1x1xf32>
-    %969 = stablehlo.broadcast_in_dim %967, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %970 = stablehlo.broadcast_in_dim %968, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %971 = stablehlo.multiply %969, %970 : tensor<1x160x7x7xf32>
-    %972 = stablehlo.convert %arg244 : (tensor<160x1x1xbf16>) -> tensor<160x1x1xf32>
-    %973 = stablehlo.broadcast_in_dim %971, dims = [0, 1, 2, 3] : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xf32>
-    %974 = stablehlo.broadcast_in_dim %972, dims = [1, 2, 3] : (tensor<160x1x1xf32>) -> tensor<1x160x7x7xf32>
-    %975 = stablehlo.add %973, %974 : tensor<1x160x7x7xf32>
-    %976 = stablehlo.convert %975 : (tensor<1x160x7x7xf32>) -> tensor<1x160x7x7xbf16>
-    %977 = stablehlo.add %917, %976 : tensor<1x160x7x7xbf16>
-    %978 = stablehlo.convolution(%977, %arg49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x160x7x7xbf16>, tensor<960x160x1x1xbf16>) -> tensor<1x960x7x7xbf16>
-    %979 = stablehlo.convert %978 : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xf32>
-    %980 = stablehlo.broadcast_in_dim %979, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %981 = stablehlo.broadcast_in_dim %arg245, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %982 = stablehlo.subtract %980, %981 : tensor<1x960x7x7xf32>
-    %983 = stablehlo.broadcast_in_dim %982, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %984 = stablehlo.broadcast_in_dim %arg246, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %985 = stablehlo.multiply %983, %984 : tensor<1x960x7x7xf32>
-    %986 = stablehlo.convert %arg247 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %987 = stablehlo.broadcast_in_dim %985, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %988 = stablehlo.broadcast_in_dim %986, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %989 = stablehlo.multiply %987, %988 : tensor<1x960x7x7xf32>
-    %990 = stablehlo.convert %arg248 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %991 = stablehlo.broadcast_in_dim %989, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %992 = stablehlo.broadcast_in_dim %990, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %993 = stablehlo.add %991, %992 : tensor<1x960x7x7xf32>
-    %994 = stablehlo.convert %993 : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xbf16>
-    %995 = stablehlo.broadcast_in_dim %994, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %996 = stablehlo.maximum %995, %874 : tensor<1x960x7x7xbf16>
-    %997 = stablehlo.broadcast_in_dim %996, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %998 = stablehlo.minimum %876, %997 : tensor<1x960x7x7xbf16>
-    %999 = stablehlo.convolution(%998, %arg50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 960 : i64} : (tensor<1x960x7x7xbf16>, tensor<960x1x3x3xbf16>) -> tensor<1x960x7x7xbf16>
-    %1000 = stablehlo.convert %999 : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xf32>
-    %1001 = stablehlo.broadcast_in_dim %1000, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %1002 = stablehlo.broadcast_in_dim %arg249, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %1003 = stablehlo.subtract %1001, %1002 : tensor<1x960x7x7xf32>
-    %1004 = stablehlo.broadcast_in_dim %1003, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %1005 = stablehlo.broadcast_in_dim %arg250, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %1006 = stablehlo.multiply %1004, %1005 : tensor<1x960x7x7xf32>
-    %1007 = stablehlo.convert %arg251 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %1008 = stablehlo.broadcast_in_dim %1006, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %1009 = stablehlo.broadcast_in_dim %1007, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %1010 = stablehlo.multiply %1008, %1009 : tensor<1x960x7x7xf32>
-    %1011 = stablehlo.convert %arg252 : (tensor<960x1x1xbf16>) -> tensor<960x1x1xf32>
-    %1012 = stablehlo.broadcast_in_dim %1010, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xf32>
-    %1013 = stablehlo.broadcast_in_dim %1011, dims = [1, 2, 3] : (tensor<960x1x1xf32>) -> tensor<1x960x7x7xf32>
-    %1014 = stablehlo.add %1012, %1013 : tensor<1x960x7x7xf32>
-    %1015 = stablehlo.convert %1014 : (tensor<1x960x7x7xf32>) -> tensor<1x960x7x7xbf16>
-    %1016 = stablehlo.broadcast_in_dim %1015, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %1017 = stablehlo.maximum %1016, %874 : tensor<1x960x7x7xbf16>
-    %1018 = stablehlo.broadcast_in_dim %1017, dims = [0, 1, 2, 3] : (tensor<1x960x7x7xbf16>) -> tensor<1x960x7x7xbf16>
-    %1019 = stablehlo.minimum %876, %1018 : tensor<1x960x7x7xbf16>
-    %1020 = stablehlo.convolution(%1019, %arg51) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x960x7x7xbf16>, tensor<320x960x1x1xbf16>) -> tensor<1x320x7x7xbf16>
-    %1021 = stablehlo.convert %1020 : (tensor<1x320x7x7xbf16>) -> tensor<1x320x7x7xf32>
-    %1022 = stablehlo.broadcast_in_dim %1021, dims = [0, 1, 2, 3] : (tensor<1x320x7x7xf32>) -> tensor<1x320x7x7xf32>
-    %1023 = stablehlo.broadcast_in_dim %arg253, dims = [1, 2, 3] : (tensor<320x1x1xf32>) -> tensor<1x320x7x7xf32>
-    %1024 = stablehlo.subtract %1022, %1023 : tensor<1x320x7x7xf32>
-    %1025 = stablehlo.broadcast_in_dim %1024, dims = [0, 1, 2, 3] : (tensor<1x320x7x7xf32>) -> tensor<1x320x7x7xf32>
-    %1026 = stablehlo.broadcast_in_dim %arg254, dims = [1, 2, 3] : (tensor<320x1x1xf32>) -> tensor<1x320x7x7xf32>
-    %1027 = stablehlo.multiply %1025, %1026 : tensor<1x320x7x7xf32>
-    %1028 = stablehlo.convert %arg255 : (tensor<320x1x1xbf16>) -> tensor<320x1x1xf32>
-    %1029 = stablehlo.broadcast_in_dim %1027, dims = [0, 1, 2, 3] : (tensor<1x320x7x7xf32>) -> tensor<1x320x7x7xf32>
-    %1030 = stablehlo.broadcast_in_dim %1028, dims = [1, 2, 3] : (tensor<320x1x1xf32>) -> tensor<1x320x7x7xf32>
-    %1031 = stablehlo.multiply %1029, %1030 : tensor<1x320x7x7xf32>
-    %1032 = stablehlo.convert %arg256 : (tensor<320x1x1xbf16>) -> tensor<320x1x1xf32>
-    %1033 = stablehlo.broadcast_in_dim %1031, dims = [0, 1, 2, 3] : (tensor<1x320x7x7xf32>) -> tensor<1x320x7x7xf32>
-    %1034 = stablehlo.broadcast_in_dim %1032, dims = [1, 2, 3] : (tensor<320x1x1xf32>) -> tensor<1x320x7x7xf32>
-    %1035 = stablehlo.add %1033, %1034 : tensor<1x320x7x7xf32>
-    %1036 = stablehlo.convert %1035 : (tensor<1x320x7x7xf32>) -> tensor<1x320x7x7xbf16>
-    %1037 = stablehlo.convolution(%1036, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x320x7x7xbf16>, tensor<1280x320x1x1xbf16>) -> tensor<1x1280x7x7xbf16>
-    %1038 = stablehlo.convert %1037 : (tensor<1x1280x7x7xbf16>) -> tensor<1x1280x7x7xf32>
-    %1039 = stablehlo.broadcast_in_dim %1038, dims = [0, 1, 2, 3] : (tensor<1x1280x7x7xf32>) -> tensor<1x1280x7x7xf32>
-    %1040 = stablehlo.broadcast_in_dim %arg257, dims = [1, 2, 3] : (tensor<1280x1x1xf32>) -> tensor<1x1280x7x7xf32>
-    %1041 = stablehlo.subtract %1039, %1040 : tensor<1x1280x7x7xf32>
-    %1042 = stablehlo.broadcast_in_dim %1041, dims = [0, 1, 2, 3] : (tensor<1x1280x7x7xf32>) -> tensor<1x1280x7x7xf32>
-    %1043 = stablehlo.broadcast_in_dim %arg258, dims = [1, 2, 3] : (tensor<1280x1x1xf32>) -> tensor<1x1280x7x7xf32>
-    %1044 = stablehlo.multiply %1042, %1043 : tensor<1x1280x7x7xf32>
-    %1045 = stablehlo.convert %arg259 : (tensor<1280x1x1xbf16>) -> tensor<1280x1x1xf32>
-    %1046 = stablehlo.broadcast_in_dim %1044, dims = [0, 1, 2, 3] : (tensor<1x1280x7x7xf32>) -> tensor<1x1280x7x7xf32>
-    %1047 = stablehlo.broadcast_in_dim %1045, dims = [1, 2, 3] : (tensor<1280x1x1xf32>) -> tensor<1x1280x7x7xf32>
-    %1048 = stablehlo.multiply %1046, %1047 : tensor<1x1280x7x7xf32>
-    %1049 = stablehlo.convert %arg260 : (tensor<1280x1x1xbf16>) -> tensor<1280x1x1xf32>
-    %1050 = stablehlo.broadcast_in_dim %1048, dims = [0, 1, 2, 3] : (tensor<1x1280x7x7xf32>) -> tensor<1x1280x7x7xf32>
-    %1051 = stablehlo.broadcast_in_dim %1049, dims = [1, 2, 3] : (tensor<1280x1x1xf32>) -> tensor<1x1280x7x7xf32>
-    %1052 = stablehlo.add %1050, %1051 : tensor<1x1280x7x7xf32>
-    %1053 = stablehlo.convert %1052 : (tensor<1x1280x7x7xf32>) -> tensor<1x1280x7x7xbf16>
-    %1054 = stablehlo.broadcast_in_dim %1053, dims = [0, 1, 2, 3] : (tensor<1x1280x7x7xbf16>) -> tensor<1x1280x7x7xbf16>
-    %1055 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x1280x7x7xbf16>
-    %1056 = stablehlo.maximum %1054, %1055 : tensor<1x1280x7x7xbf16>
-    %1057 = stablehlo.broadcast_in_dim %21, dims = [] : (tensor<bf16>) -> tensor<1x1280x7x7xbf16>
-    %1058 = stablehlo.broadcast_in_dim %1056, dims = [0, 1, 2, 3] : (tensor<1x1280x7x7xbf16>) -> tensor<1x1280x7x7xbf16>
-    %1059 = stablehlo.minimum %1057, %1058 : tensor<1x1280x7x7xbf16>
-    %1060 = stablehlo.reduce(%1059 init: %cst_1) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x1280x7x7xbf16>, tensor<bf16>) -> tensor<1x1280xbf16>
-    %1061 = stablehlo.reshape %1060 : (tensor<1x1280xbf16>) -> tensor<1x1280x1x1xbf16>
-    %1062 = stablehlo.convert %cst_2 : (tensor<1xi64>) -> tensor<1xbf16>
-    %1063 = stablehlo.reshape %1062 : (tensor<1xbf16>) -> tensor<bf16>
-    %1064 = stablehlo.broadcast_in_dim %1061, dims = [0, 1, 2, 3] : (tensor<1x1280x1x1xbf16>) -> tensor<1x1280x1x1xbf16>
-    %1065 = stablehlo.broadcast_in_dim %1063, dims = [] : (tensor<bf16>) -> tensor<1x1280x1x1xbf16>
-    %1066 = stablehlo.divide %1064, %1065 : tensor<1x1280x1x1xbf16>
-    %1067 = stablehlo.reshape %1066 : (tensor<1x1280x1x1xbf16>) -> tensor<1x1280xbf16>
-    %1068 = stablehlo.convert %1067 : (tensor<1x1280xbf16>) -> tensor<1x1280xf32>
-    %1069 = stablehlo.dot_general %1068, %arg261, contracting_dims = [1] x [0] : (tensor<1x1280xf32>, tensor<1280x1000xf32>) -> tensor<1x1000xf32>
-    %1070 = stablehlo.convert %cst_3 : (tensor<1xi64>) -> tensor<1xf32>
-    %1071 = stablehlo.reshape %1070 : (tensor<1xf32>) -> tensor<f32>
-    %1072 = stablehlo.broadcast_in_dim %1069, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %1073 = stablehlo.broadcast_in_dim %1071, dims = [] : (tensor<f32>) -> tensor<1x1000xf32>
-    %1074 = stablehlo.multiply %1072, %1073 : tensor<1x1000xf32>
-    %1075 = stablehlo.broadcast_in_dim %1074, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %1076 = stablehlo.broadcast_in_dim %arg262, dims = [1] : (tensor<1000xf32>) -> tensor<1x1000xf32>
-    %1077 = stablehlo.add %1075, %1076 : tensor<1x1000xf32>
-    %1078 = stablehlo.convert %1077 : (tensor<1x1000xf32>) -> tensor<1x1000xbf16>
-    return %1078 : tensor<1x1000xbf16>
-  }
-}
diff --git a/mlir_tests/OpenPose V2.mlir b/mlir_tests/OpenPose V2.mlir
deleted file mode 100644
index 83827d92..00000000
--- a/mlir_tests/OpenPose V2.mlir	
+++ /dev/null
@@ -1,867 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x224x224xbf16>, %arg1: tensor<32x3x3x3xbf16>, %arg2: tensor<32x1x3x3xbf16>, %arg3: tensor<64x32x1x1xbf16>, %arg4: tensor<64x1x3x3xbf16>, %arg5: tensor<128x64x1x1xbf16>, %arg6: tensor<128x1x3x3xbf16>, %arg7: tensor<128x128x1x1xbf16>, %arg8: tensor<128x1x3x3xbf16>, %arg9: tensor<256x128x1x1xbf16>, %arg10: tensor<256x1x3x3xbf16>, %arg11: tensor<256x256x1x1xbf16>, %arg12: tensor<256x1x3x3xbf16>, %arg13: tensor<512x256x1x1xbf16>, %arg14: tensor<512x1x3x3xbf16>, %arg15: tensor<512x512x1x1xbf16>, %arg16: tensor<512x1x3x3xbf16>, %arg17: tensor<512x512x1x1xbf16>, %arg18: tensor<512x1x3x3xbf16>, %arg19: tensor<512x512x1x1xbf16>, %arg20: tensor<512x1x3x3xbf16>, %arg21: tensor<512x512x1x1xbf16>, %arg22: tensor<512x1x3x3xbf16>, %arg23: tensor<512x512x1x1xbf16>, %arg24: tensor<128x512x1x1xbf16>, %arg25: tensor<128xbf16>, %arg26: tensor<128x1x3x3xbf16>, %arg27: tensor<128x128x1x1xbf16>, %arg28: tensor<128x1x3x3xbf16>, %arg29: tensor<128x128x1x1xbf16>, %arg30: tensor<128x1x3x3xbf16>, %arg31: tensor<128x128x1x1xbf16>, %arg32: tensor<128x128x3x3xbf16>, %arg33: tensor<128xbf16>, %arg34: tensor<128x128x3x3xbf16>, %arg35: tensor<128xbf16>, %arg36: tensor<128x128x3x3xbf16>, %arg37: tensor<128xbf16>, %arg38: tensor<128x128x3x3xbf16>, %arg39: tensor<128xbf16>, %arg40: tensor<512x128x1x1xbf16>, %arg41: tensor<512xbf16>, %arg42: tensor<19x512x1x1xbf16>, %arg43: tensor<19xbf16>, %arg44: tensor<512x128x1x1xbf16>, %arg45: tensor<512xbf16>, %arg46: tensor<38x512x1x1xbf16>, %arg47: tensor<38xbf16>, %arg48: tensor<128x185x1x1xbf16>, %arg49: tensor<128xbf16>, %arg50: tensor<128x128x3x3xbf16>, %arg51: tensor<128xbf16>, %arg52: tensor<128x128x3x3xbf16>, %arg53: tensor<128xbf16>, %arg54: tensor<128x128x1x1xbf16>, %arg55: tensor<128xbf16>, %arg56: tensor<128x128x3x3xbf16>, %arg57: tensor<128xbf16>, %arg58: tensor<128x128x3x3xbf16>, %arg59: tensor<128xbf16>, %arg60: tensor<128x128x1x1xbf16>, %arg61: tensor<128xbf16>, %arg62: tensor<128x128x3x3xbf16>, %arg63: tensor<128xbf16>, %arg64: tensor<128x128x3x3xbf16>, %arg65: tensor<128xbf16>, %arg66: tensor<128x128x1x1xbf16>, %arg67: tensor<128xbf16>, %arg68: tensor<128x128x3x3xbf16>, %arg69: tensor<128xbf16>, %arg70: tensor<128x128x3x3xbf16>, %arg71: tensor<128xbf16>, %arg72: tensor<128x128x1x1xbf16>, %arg73: tensor<128xbf16>, %arg74: tensor<128x128x3x3xbf16>, %arg75: tensor<128xbf16>, %arg76: tensor<128x128x3x3xbf16>, %arg77: tensor<128xbf16>, %arg78: tensor<128x128x1x1xbf16>, %arg79: tensor<128xbf16>, %arg80: tensor<19x128x1x1xbf16>, %arg81: tensor<19xbf16>, %arg82: tensor<128x128x1x1xbf16>, %arg83: tensor<128xbf16>, %arg84: tensor<38x128x1x1xbf16>, %arg85: tensor<38xbf16>, %arg86: tensor<32x1x1xf32>, %arg87: tensor<32x1x1xf32>, %arg88: tensor<32x1x1xbf16>, %arg89: tensor<32x1x1xbf16>, %arg90: tensor<32x1x1xf32>, %arg91: tensor<32x1x1xf32>, %arg92: tensor<32x1x1xbf16>, %arg93: tensor<32x1x1xbf16>, %arg94: tensor<64x1x1xf32>, %arg95: tensor<64x1x1xf32>, %arg96: tensor<64x1x1xbf16>, %arg97: tensor<64x1x1xbf16>, %arg98: tensor<64x1x1xf32>, %arg99: tensor<64x1x1xf32>, %arg100: tensor<64x1x1xbf16>, %arg101: tensor<64x1x1xbf16>, %arg102: tensor<128x1x1xf32>, %arg103: tensor<128x1x1xf32>, %arg104: tensor<128x1x1xbf16>, %arg105: tensor<128x1x1xbf16>, %arg106: tensor<128x1x1xf32>, %arg107: tensor<128x1x1xf32>, %arg108: tensor<128x1x1xbf16>, %arg109: tensor<128x1x1xbf16>, %arg110: tensor<128x1x1xf32>, %arg111: tensor<128x1x1xf32>, %arg112: tensor<128x1x1xbf16>, %arg113: tensor<128x1x1xbf16>, %arg114: tensor<128x1x1xf32>, %arg115: tensor<128x1x1xf32>, %arg116: tensor<128x1x1xbf16>, %arg117: tensor<128x1x1xbf16>, %arg118: tensor<256x1x1xf32>, %arg119: tensor<256x1x1xf32>, %arg120: tensor<256x1x1xbf16>, %arg121: tensor<256x1x1xbf16>, %arg122: tensor<256x1x1xf32>, %arg123: tensor<256x1x1xf32>, %arg124: tensor<256x1x1xbf16>, %arg125: tensor<256x1x1xbf16>, %arg126: tensor<256x1x1xf32>, %arg127: tensor<256x1x1xf32>, %arg128: tensor<256x1x1xbf16>, %arg129: tensor<256x1x1xbf16>, %arg130: tensor<256x1x1xf32>, %arg131: tensor<256x1x1xf32>, %arg132: tensor<256x1x1xbf16>, %arg133: tensor<256x1x1xbf16>, %arg134: tensor<512x1x1xf32>, %arg135: tensor<512x1x1xf32>, %arg136: tensor<512x1x1xbf16>, %arg137: tensor<512x1x1xbf16>, %arg138: tensor<512x1x1xf32>, %arg139: tensor<512x1x1xf32>, %arg140: tensor<512x1x1xbf16>, %arg141: tensor<512x1x1xbf16>, %arg142: tensor<512x1x1xf32>, %arg143: tensor<512x1x1xf32>, %arg144: tensor<512x1x1xbf16>, %arg145: tensor<512x1x1xbf16>, %arg146: tensor<512x1x1xf32>, %arg147: tensor<512x1x1xf32>, %arg148: tensor<512x1x1xbf16>, %arg149: tensor<512x1x1xbf16>, %arg150: tensor<512x1x1xf32>, %arg151: tensor<512x1x1xf32>, %arg152: tensor<512x1x1xbf16>, %arg153: tensor<512x1x1xbf16>, %arg154: tensor<512x1x1xf32>, %arg155: tensor<512x1x1xf32>, %arg156: tensor<512x1x1xbf16>, %arg157: tensor<512x1x1xbf16>, %arg158: tensor<512x1x1xf32>, %arg159: tensor<512x1x1xf32>, %arg160: tensor<512x1x1xbf16>, %arg161: tensor<512x1x1xbf16>, %arg162: tensor<512x1x1xf32>, %arg163: tensor<512x1x1xf32>, %arg164: tensor<512x1x1xbf16>, %arg165: tensor<512x1x1xbf16>, %arg166: tensor<512x1x1xf32>, %arg167: tensor<512x1x1xf32>, %arg168: tensor<512x1x1xbf16>, %arg169: tensor<512x1x1xbf16>, %arg170: tensor<512x1x1xf32>, %arg171: tensor<512x1x1xf32>, %arg172: tensor<512x1x1xbf16>, %arg173: tensor<512x1x1xbf16>, %arg174: tensor<512x1x1xf32>, %arg175: tensor<512x1x1xf32>, %arg176: tensor<512x1x1xbf16>, %arg177: tensor<512x1x1xbf16>, %arg178: tensor<128x1x1xf32>, %arg179: tensor<128x1x1xf32>, %arg180: tensor<128x1x1xbf16>, %arg181: tensor<128x1x1xbf16>, %arg182: tensor<128x1x1xf32>, %arg183: tensor<128x1x1xf32>, %arg184: tensor<128x1x1xbf16>, %arg185: tensor<128x1x1xbf16>, %arg186: tensor<128x1x1xf32>, %arg187: tensor<128x1x1xf32>, %arg188: tensor<128x1x1xbf16>, %arg189: tensor<128x1x1xbf16>, %arg190: tensor<128x1x1xf32>, %arg191: tensor<128x1x1xf32>, %arg192: tensor<128x1x1xbf16>, %arg193: tensor<128x1x1xbf16>, %arg194: tensor<128x1x1xf32>, %arg195: tensor<128x1x1xf32>, %arg196: tensor<128x1x1xbf16>, %arg197: tensor<128x1x1xbf16>, %arg198: tensor<128x1x1xf32>, %arg199: tensor<128x1x1xf32>, %arg200: tensor<128x1x1xbf16>, %arg201: tensor<128x1x1xbf16>, %arg202: tensor<128x1x1xf32>, %arg203: tensor<128x1x1xf32>, %arg204: tensor<128x1x1xbf16>, %arg205: tensor<128x1x1xbf16>, %arg206: tensor<128x1x1xf32>, %arg207: tensor<128x1x1xf32>, %arg208: tensor<128x1x1xbf16>, %arg209: tensor<128x1x1xbf16>, %arg210: tensor<128x1x1xf32>, %arg211: tensor<128x1x1xf32>, %arg212: tensor<128x1x1xbf16>, %arg213: tensor<128x1x1xbf16>, %arg214: tensor<128x1x1xf32>, %arg215: tensor<128x1x1xf32>, %arg216: tensor<128x1x1xbf16>, %arg217: tensor<128x1x1xbf16>) -> tensor<1x57x28x28xbf16> {
-    %c = stablehlo.constant dense<0> : tensor<i64>
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<1x32x112x112xbf16>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x112x112xbf16>
-    %cst_1 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x56x56xbf16>
-    %cst_2 = stablehlo.constant dense<0.000000e+00> : tensor<1x128x56x56xbf16>
-    %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor<1x128x28x28xbf16>
-    %cst_4 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x28x28xbf16>
-    %cst_5 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x28x28xbf16>
-    %cst_6 = arith.constant dense<1> : tensor<1xi64>
-    %cst_7 = arith.constant dense<1.000000e+00> : tensor<1xf64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x224x224xbf16>, tensor<32x3x3x3xbf16>) -> tensor<1x32x112x112xbf16>
-    %1 = stablehlo.convert %0 : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xf32>
-    %2 = stablehlo.broadcast_in_dim %1, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %3 = stablehlo.broadcast_in_dim %arg86, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %4 = stablehlo.subtract %2, %3 : tensor<1x32x112x112xf32>
-    %5 = stablehlo.broadcast_in_dim %4, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %6 = stablehlo.broadcast_in_dim %arg87, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %7 = stablehlo.multiply %5, %6 : tensor<1x32x112x112xf32>
-    %8 = stablehlo.convert %arg88 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %9 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %10 = stablehlo.broadcast_in_dim %8, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %11 = stablehlo.multiply %9, %10 : tensor<1x32x112x112xf32>
-    %12 = stablehlo.convert %arg89 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %13 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %14 = stablehlo.broadcast_in_dim %12, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %15 = stablehlo.add %13, %14 : tensor<1x32x112x112xf32>
-    %16 = stablehlo.convert %15 : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xbf16>
-    %17 = stablehlo.maximum %16, %cst : tensor<1x32x112x112xbf16>
-    %18 = stablehlo.convolution(%17, %arg2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 32 : i64} : (tensor<1x32x112x112xbf16>, tensor<32x1x3x3xbf16>) -> tensor<1x32x112x112xbf16>
-    %19 = stablehlo.convert %18 : (tensor<1x32x112x112xbf16>) -> tensor<1x32x112x112xf32>
-    %20 = stablehlo.broadcast_in_dim %19, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %21 = stablehlo.broadcast_in_dim %arg90, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %22 = stablehlo.subtract %20, %21 : tensor<1x32x112x112xf32>
-    %23 = stablehlo.broadcast_in_dim %22, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %24 = stablehlo.broadcast_in_dim %arg91, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %25 = stablehlo.multiply %23, %24 : tensor<1x32x112x112xf32>
-    %26 = stablehlo.convert %arg92 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %27 = stablehlo.broadcast_in_dim %25, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %28 = stablehlo.broadcast_in_dim %26, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %29 = stablehlo.multiply %27, %28 : tensor<1x32x112x112xf32>
-    %30 = stablehlo.convert %arg93 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %31 = stablehlo.broadcast_in_dim %29, dims = [0, 1, 2, 3] : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xf32>
-    %32 = stablehlo.broadcast_in_dim %30, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x112x112xf32>
-    %33 = stablehlo.add %31, %32 : tensor<1x32x112x112xf32>
-    %34 = stablehlo.convert %33 : (tensor<1x32x112x112xf32>) -> tensor<1x32x112x112xbf16>
-    %35 = stablehlo.maximum %34, %cst : tensor<1x32x112x112xbf16>
-    %36 = stablehlo.convolution(%35, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x112x112xbf16>, tensor<64x32x1x1xbf16>) -> tensor<1x64x112x112xbf16>
-    %37 = stablehlo.convert %36 : (tensor<1x64x112x112xbf16>) -> tensor<1x64x112x112xf32>
-    %38 = stablehlo.broadcast_in_dim %37, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %39 = stablehlo.broadcast_in_dim %arg94, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %40 = stablehlo.subtract %38, %39 : tensor<1x64x112x112xf32>
-    %41 = stablehlo.broadcast_in_dim %40, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %42 = stablehlo.broadcast_in_dim %arg95, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %43 = stablehlo.multiply %41, %42 : tensor<1x64x112x112xf32>
-    %44 = stablehlo.convert %arg96 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %45 = stablehlo.broadcast_in_dim %43, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %47 = stablehlo.multiply %45, %46 : tensor<1x64x112x112xf32>
-    %48 = stablehlo.convert %arg97 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %49 = stablehlo.broadcast_in_dim %47, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %50 = stablehlo.broadcast_in_dim %48, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %51 = stablehlo.add %49, %50 : tensor<1x64x112x112xf32>
-    %52 = stablehlo.convert %51 : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xbf16>
-    %53 = stablehlo.maximum %52, %cst_0 : tensor<1x64x112x112xbf16>
-    %54 = stablehlo.convolution(%53, %arg4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 64 : i64} : (tensor<1x64x112x112xbf16>, tensor<64x1x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %55 = stablehlo.convert %54 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %56 = stablehlo.broadcast_in_dim %55, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %57 = stablehlo.broadcast_in_dim %arg98, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %58 = stablehlo.subtract %56, %57 : tensor<1x64x56x56xf32>
-    %59 = stablehlo.broadcast_in_dim %58, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %60 = stablehlo.broadcast_in_dim %arg99, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %61 = stablehlo.multiply %59, %60 : tensor<1x64x56x56xf32>
-    %62 = stablehlo.convert %arg100 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %63 = stablehlo.broadcast_in_dim %61, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %64 = stablehlo.broadcast_in_dim %62, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %65 = stablehlo.multiply %63, %64 : tensor<1x64x56x56xf32>
-    %66 = stablehlo.convert %arg101 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %67 = stablehlo.broadcast_in_dim %65, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %68 = stablehlo.broadcast_in_dim %66, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %69 = stablehlo.add %67, %68 : tensor<1x64x56x56xf32>
-    %70 = stablehlo.convert %69 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %71 = stablehlo.maximum %70, %cst_1 : tensor<1x64x56x56xbf16>
-    %72 = stablehlo.convolution(%71, %arg5) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<128x64x1x1xbf16>) -> tensor<1x128x56x56xbf16>
-    %73 = stablehlo.convert %72 : (tensor<1x128x56x56xbf16>) -> tensor<1x128x56x56xf32>
-    %74 = stablehlo.broadcast_in_dim %73, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %75 = stablehlo.broadcast_in_dim %arg102, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %76 = stablehlo.subtract %74, %75 : tensor<1x128x56x56xf32>
-    %77 = stablehlo.broadcast_in_dim %76, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %78 = stablehlo.broadcast_in_dim %arg103, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %79 = stablehlo.multiply %77, %78 : tensor<1x128x56x56xf32>
-    %80 = stablehlo.convert %arg104 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %81 = stablehlo.broadcast_in_dim %79, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %82 = stablehlo.broadcast_in_dim %80, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %83 = stablehlo.multiply %81, %82 : tensor<1x128x56x56xf32>
-    %84 = stablehlo.convert %arg105 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %85 = stablehlo.broadcast_in_dim %83, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %86 = stablehlo.broadcast_in_dim %84, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %87 = stablehlo.add %85, %86 : tensor<1x128x56x56xf32>
-    %88 = stablehlo.convert %87 : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xbf16>
-    %89 = stablehlo.maximum %88, %cst_2 : tensor<1x128x56x56xbf16>
-    %90 = stablehlo.convolution(%89, %arg6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x56x56xbf16>, tensor<128x1x3x3xbf16>) -> tensor<1x128x56x56xbf16>
-    %91 = stablehlo.convert %90 : (tensor<1x128x56x56xbf16>) -> tensor<1x128x56x56xf32>
-    %92 = stablehlo.broadcast_in_dim %91, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %93 = stablehlo.broadcast_in_dim %arg106, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %94 = stablehlo.subtract %92, %93 : tensor<1x128x56x56xf32>
-    %95 = stablehlo.broadcast_in_dim %94, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %96 = stablehlo.broadcast_in_dim %arg107, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %97 = stablehlo.multiply %95, %96 : tensor<1x128x56x56xf32>
-    %98 = stablehlo.convert %arg108 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %99 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %100 = stablehlo.broadcast_in_dim %98, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %101 = stablehlo.multiply %99, %100 : tensor<1x128x56x56xf32>
-    %102 = stablehlo.convert %arg109 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %103 = stablehlo.broadcast_in_dim %101, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %104 = stablehlo.broadcast_in_dim %102, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %105 = stablehlo.add %103, %104 : tensor<1x128x56x56xf32>
-    %106 = stablehlo.convert %105 : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xbf16>
-    %107 = stablehlo.maximum %106, %cst_2 : tensor<1x128x56x56xbf16>
-    %108 = stablehlo.convolution(%107, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x56x56xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x56x56xbf16>
-    %109 = stablehlo.convert %108 : (tensor<1x128x56x56xbf16>) -> tensor<1x128x56x56xf32>
-    %110 = stablehlo.broadcast_in_dim %109, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %111 = stablehlo.broadcast_in_dim %arg110, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %112 = stablehlo.subtract %110, %111 : tensor<1x128x56x56xf32>
-    %113 = stablehlo.broadcast_in_dim %112, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %114 = stablehlo.broadcast_in_dim %arg111, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %115 = stablehlo.multiply %113, %114 : tensor<1x128x56x56xf32>
-    %116 = stablehlo.convert %arg112 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %117 = stablehlo.broadcast_in_dim %115, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %118 = stablehlo.broadcast_in_dim %116, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %119 = stablehlo.multiply %117, %118 : tensor<1x128x56x56xf32>
-    %120 = stablehlo.convert %arg113 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %121 = stablehlo.broadcast_in_dim %119, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %122 = stablehlo.broadcast_in_dim %120, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %123 = stablehlo.add %121, %122 : tensor<1x128x56x56xf32>
-    %124 = stablehlo.convert %123 : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xbf16>
-    %125 = stablehlo.maximum %124, %cst_2 : tensor<1x128x56x56xbf16>
-    %126 = stablehlo.convolution(%125, %arg8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x56x56xbf16>, tensor<128x1x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %127 = stablehlo.convert %126 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %128 = stablehlo.broadcast_in_dim %127, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %129 = stablehlo.broadcast_in_dim %arg114, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %130 = stablehlo.subtract %128, %129 : tensor<1x128x28x28xf32>
-    %131 = stablehlo.broadcast_in_dim %130, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %132 = stablehlo.broadcast_in_dim %arg115, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %133 = stablehlo.multiply %131, %132 : tensor<1x128x28x28xf32>
-    %134 = stablehlo.convert %arg116 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %135 = stablehlo.broadcast_in_dim %133, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %136 = stablehlo.broadcast_in_dim %134, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %137 = stablehlo.multiply %135, %136 : tensor<1x128x28x28xf32>
-    %138 = stablehlo.convert %arg117 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %139 = stablehlo.broadcast_in_dim %137, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %140 = stablehlo.broadcast_in_dim %138, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %141 = stablehlo.add %139, %140 : tensor<1x128x28x28xf32>
-    %142 = stablehlo.convert %141 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %143 = stablehlo.maximum %142, %cst_3 : tensor<1x128x28x28xbf16>
-    %144 = stablehlo.convolution(%143, %arg9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<256x128x1x1xbf16>) -> tensor<1x256x28x28xbf16>
-    %145 = stablehlo.convert %144 : (tensor<1x256x28x28xbf16>) -> tensor<1x256x28x28xf32>
-    %146 = stablehlo.broadcast_in_dim %145, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %147 = stablehlo.broadcast_in_dim %arg118, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %148 = stablehlo.subtract %146, %147 : tensor<1x256x28x28xf32>
-    %149 = stablehlo.broadcast_in_dim %148, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %150 = stablehlo.broadcast_in_dim %arg119, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %151 = stablehlo.multiply %149, %150 : tensor<1x256x28x28xf32>
-    %152 = stablehlo.convert %arg120 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %153 = stablehlo.broadcast_in_dim %151, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %154 = stablehlo.broadcast_in_dim %152, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %155 = stablehlo.multiply %153, %154 : tensor<1x256x28x28xf32>
-    %156 = stablehlo.convert %arg121 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %157 = stablehlo.broadcast_in_dim %155, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %158 = stablehlo.broadcast_in_dim %156, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %159 = stablehlo.add %157, %158 : tensor<1x256x28x28xf32>
-    %160 = stablehlo.convert %159 : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xbf16>
-    %161 = stablehlo.maximum %160, %cst_4 : tensor<1x256x28x28xbf16>
-    %162 = stablehlo.convolution(%161, %arg10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x28x28xbf16>, tensor<256x1x3x3xbf16>) -> tensor<1x256x28x28xbf16>
-    %163 = stablehlo.convert %162 : (tensor<1x256x28x28xbf16>) -> tensor<1x256x28x28xf32>
-    %164 = stablehlo.broadcast_in_dim %163, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %165 = stablehlo.broadcast_in_dim %arg122, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %166 = stablehlo.subtract %164, %165 : tensor<1x256x28x28xf32>
-    %167 = stablehlo.broadcast_in_dim %166, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %168 = stablehlo.broadcast_in_dim %arg123, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %169 = stablehlo.multiply %167, %168 : tensor<1x256x28x28xf32>
-    %170 = stablehlo.convert %arg124 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %171 = stablehlo.broadcast_in_dim %169, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %172 = stablehlo.broadcast_in_dim %170, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %173 = stablehlo.multiply %171, %172 : tensor<1x256x28x28xf32>
-    %174 = stablehlo.convert %arg125 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %175 = stablehlo.broadcast_in_dim %173, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %176 = stablehlo.broadcast_in_dim %174, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %177 = stablehlo.add %175, %176 : tensor<1x256x28x28xf32>
-    %178 = stablehlo.convert %177 : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xbf16>
-    %179 = stablehlo.maximum %178, %cst_4 : tensor<1x256x28x28xbf16>
-    %180 = stablehlo.convolution(%179, %arg11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x28x28xbf16>, tensor<256x256x1x1xbf16>) -> tensor<1x256x28x28xbf16>
-    %181 = stablehlo.convert %180 : (tensor<1x256x28x28xbf16>) -> tensor<1x256x28x28xf32>
-    %182 = stablehlo.broadcast_in_dim %181, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %183 = stablehlo.broadcast_in_dim %arg126, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %184 = stablehlo.subtract %182, %183 : tensor<1x256x28x28xf32>
-    %185 = stablehlo.broadcast_in_dim %184, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %186 = stablehlo.broadcast_in_dim %arg127, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %187 = stablehlo.multiply %185, %186 : tensor<1x256x28x28xf32>
-    %188 = stablehlo.convert %arg128 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %189 = stablehlo.broadcast_in_dim %187, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %190 = stablehlo.broadcast_in_dim %188, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %191 = stablehlo.multiply %189, %190 : tensor<1x256x28x28xf32>
-    %192 = stablehlo.convert %arg129 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %193 = stablehlo.broadcast_in_dim %191, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %194 = stablehlo.broadcast_in_dim %192, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %195 = stablehlo.add %193, %194 : tensor<1x256x28x28xf32>
-    %196 = stablehlo.convert %195 : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xbf16>
-    %197 = stablehlo.maximum %196, %cst_4 : tensor<1x256x28x28xbf16>
-    %198 = stablehlo.convolution(%197, %arg12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x28x28xbf16>, tensor<256x1x3x3xbf16>) -> tensor<1x256x28x28xbf16>
-    %199 = stablehlo.convert %198 : (tensor<1x256x28x28xbf16>) -> tensor<1x256x28x28xf32>
-    %200 = stablehlo.broadcast_in_dim %199, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %201 = stablehlo.broadcast_in_dim %arg130, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %202 = stablehlo.subtract %200, %201 : tensor<1x256x28x28xf32>
-    %203 = stablehlo.broadcast_in_dim %202, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %204 = stablehlo.broadcast_in_dim %arg131, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %205 = stablehlo.multiply %203, %204 : tensor<1x256x28x28xf32>
-    %206 = stablehlo.convert %arg132 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %207 = stablehlo.broadcast_in_dim %205, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %208 = stablehlo.broadcast_in_dim %206, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %209 = stablehlo.multiply %207, %208 : tensor<1x256x28x28xf32>
-    %210 = stablehlo.convert %arg133 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %211 = stablehlo.broadcast_in_dim %209, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %212 = stablehlo.broadcast_in_dim %210, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %213 = stablehlo.add %211, %212 : tensor<1x256x28x28xf32>
-    %214 = stablehlo.convert %213 : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xbf16>
-    %215 = stablehlo.maximum %214, %cst_4 : tensor<1x256x28x28xbf16>
-    %216 = stablehlo.convolution(%215, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x28x28xbf16>, tensor<512x256x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %217 = stablehlo.convert %216 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %218 = stablehlo.broadcast_in_dim %217, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %219 = stablehlo.broadcast_in_dim %arg134, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %220 = stablehlo.subtract %218, %219 : tensor<1x512x28x28xf32>
-    %221 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %222 = stablehlo.broadcast_in_dim %arg135, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %223 = stablehlo.multiply %221, %222 : tensor<1x512x28x28xf32>
-    %224 = stablehlo.convert %arg136 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %225 = stablehlo.broadcast_in_dim %223, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %226 = stablehlo.broadcast_in_dim %224, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %227 = stablehlo.multiply %225, %226 : tensor<1x512x28x28xf32>
-    %228 = stablehlo.convert %arg137 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %229 = stablehlo.broadcast_in_dim %227, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %230 = stablehlo.broadcast_in_dim %228, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %231 = stablehlo.add %229, %230 : tensor<1x512x28x28xf32>
-    %232 = stablehlo.convert %231 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %233 = stablehlo.maximum %232, %cst_5 : tensor<1x512x28x28xbf16>
-    %234 = stablehlo.convolution(%233, %arg14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x28x28xbf16>
-    %235 = stablehlo.convert %234 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %236 = stablehlo.broadcast_in_dim %235, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %237 = stablehlo.broadcast_in_dim %arg138, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %238 = stablehlo.subtract %236, %237 : tensor<1x512x28x28xf32>
-    %239 = stablehlo.broadcast_in_dim %238, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %240 = stablehlo.broadcast_in_dim %arg139, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %241 = stablehlo.multiply %239, %240 : tensor<1x512x28x28xf32>
-    %242 = stablehlo.convert %arg140 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %243 = stablehlo.broadcast_in_dim %241, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %244 = stablehlo.broadcast_in_dim %242, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %245 = stablehlo.multiply %243, %244 : tensor<1x512x28x28xf32>
-    %246 = stablehlo.convert %arg141 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %247 = stablehlo.broadcast_in_dim %245, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %248 = stablehlo.broadcast_in_dim %246, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %249 = stablehlo.add %247, %248 : tensor<1x512x28x28xf32>
-    %250 = stablehlo.convert %249 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %251 = stablehlo.maximum %250, %cst_5 : tensor<1x512x28x28xbf16>
-    %252 = stablehlo.convolution(%251, %arg15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x512x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %253 = stablehlo.convert %252 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %254 = stablehlo.broadcast_in_dim %253, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %255 = stablehlo.broadcast_in_dim %arg142, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %256 = stablehlo.subtract %254, %255 : tensor<1x512x28x28xf32>
-    %257 = stablehlo.broadcast_in_dim %256, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %258 = stablehlo.broadcast_in_dim %arg143, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %259 = stablehlo.multiply %257, %258 : tensor<1x512x28x28xf32>
-    %260 = stablehlo.convert %arg144 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %261 = stablehlo.broadcast_in_dim %259, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %262 = stablehlo.broadcast_in_dim %260, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %263 = stablehlo.multiply %261, %262 : tensor<1x512x28x28xf32>
-    %264 = stablehlo.convert %arg145 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %265 = stablehlo.broadcast_in_dim %263, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %266 = stablehlo.broadcast_in_dim %264, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %267 = stablehlo.add %265, %266 : tensor<1x512x28x28xf32>
-    %268 = stablehlo.convert %267 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %269 = stablehlo.maximum %268, %cst_5 : tensor<1x512x28x28xbf16>
-    %270 = stablehlo.convolution(%269, %arg16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x28x28xbf16>
-    %271 = stablehlo.convert %270 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %272 = stablehlo.broadcast_in_dim %271, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %273 = stablehlo.broadcast_in_dim %arg146, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %274 = stablehlo.subtract %272, %273 : tensor<1x512x28x28xf32>
-    %275 = stablehlo.broadcast_in_dim %274, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %276 = stablehlo.broadcast_in_dim %arg147, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %277 = stablehlo.multiply %275, %276 : tensor<1x512x28x28xf32>
-    %278 = stablehlo.convert %arg148 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %279 = stablehlo.broadcast_in_dim %277, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %280 = stablehlo.broadcast_in_dim %278, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %281 = stablehlo.multiply %279, %280 : tensor<1x512x28x28xf32>
-    %282 = stablehlo.convert %arg149 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %283 = stablehlo.broadcast_in_dim %281, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %284 = stablehlo.broadcast_in_dim %282, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %285 = stablehlo.add %283, %284 : tensor<1x512x28x28xf32>
-    %286 = stablehlo.convert %285 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %287 = stablehlo.maximum %286, %cst_5 : tensor<1x512x28x28xbf16>
-    %288 = stablehlo.convolution(%287, %arg17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x512x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %289 = stablehlo.convert %288 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %290 = stablehlo.broadcast_in_dim %289, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %291 = stablehlo.broadcast_in_dim %arg150, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %292 = stablehlo.subtract %290, %291 : tensor<1x512x28x28xf32>
-    %293 = stablehlo.broadcast_in_dim %292, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %294 = stablehlo.broadcast_in_dim %arg151, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %295 = stablehlo.multiply %293, %294 : tensor<1x512x28x28xf32>
-    %296 = stablehlo.convert %arg152 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %297 = stablehlo.broadcast_in_dim %295, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %298 = stablehlo.broadcast_in_dim %296, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %299 = stablehlo.multiply %297, %298 : tensor<1x512x28x28xf32>
-    %300 = stablehlo.convert %arg153 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %301 = stablehlo.broadcast_in_dim %299, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %302 = stablehlo.broadcast_in_dim %300, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %303 = stablehlo.add %301, %302 : tensor<1x512x28x28xf32>
-    %304 = stablehlo.convert %303 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %305 = stablehlo.maximum %304, %cst_5 : tensor<1x512x28x28xbf16>
-    %306 = stablehlo.convolution(%305, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x28x28xbf16>
-    %307 = stablehlo.convert %306 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %308 = stablehlo.broadcast_in_dim %307, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %309 = stablehlo.broadcast_in_dim %arg154, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %310 = stablehlo.subtract %308, %309 : tensor<1x512x28x28xf32>
-    %311 = stablehlo.broadcast_in_dim %310, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %312 = stablehlo.broadcast_in_dim %arg155, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %313 = stablehlo.multiply %311, %312 : tensor<1x512x28x28xf32>
-    %314 = stablehlo.convert %arg156 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %315 = stablehlo.broadcast_in_dim %313, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %316 = stablehlo.broadcast_in_dim %314, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %317 = stablehlo.multiply %315, %316 : tensor<1x512x28x28xf32>
-    %318 = stablehlo.convert %arg157 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %319 = stablehlo.broadcast_in_dim %317, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %320 = stablehlo.broadcast_in_dim %318, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %321 = stablehlo.add %319, %320 : tensor<1x512x28x28xf32>
-    %322 = stablehlo.convert %321 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %323 = stablehlo.maximum %322, %cst_5 : tensor<1x512x28x28xbf16>
-    %324 = stablehlo.convolution(%323, %arg19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x512x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %325 = stablehlo.convert %324 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %326 = stablehlo.broadcast_in_dim %325, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %327 = stablehlo.broadcast_in_dim %arg158, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %328 = stablehlo.subtract %326, %327 : tensor<1x512x28x28xf32>
-    %329 = stablehlo.broadcast_in_dim %328, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %330 = stablehlo.broadcast_in_dim %arg159, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %331 = stablehlo.multiply %329, %330 : tensor<1x512x28x28xf32>
-    %332 = stablehlo.convert %arg160 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %333 = stablehlo.broadcast_in_dim %331, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %334 = stablehlo.broadcast_in_dim %332, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %335 = stablehlo.multiply %333, %334 : tensor<1x512x28x28xf32>
-    %336 = stablehlo.convert %arg161 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %337 = stablehlo.broadcast_in_dim %335, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %338 = stablehlo.broadcast_in_dim %336, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %339 = stablehlo.add %337, %338 : tensor<1x512x28x28xf32>
-    %340 = stablehlo.convert %339 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %341 = stablehlo.maximum %340, %cst_5 : tensor<1x512x28x28xbf16>
-    %342 = stablehlo.convolution(%341, %arg20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x28x28xbf16>
-    %343 = stablehlo.convert %342 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %344 = stablehlo.broadcast_in_dim %343, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %345 = stablehlo.broadcast_in_dim %arg162, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %346 = stablehlo.subtract %344, %345 : tensor<1x512x28x28xf32>
-    %347 = stablehlo.broadcast_in_dim %346, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %348 = stablehlo.broadcast_in_dim %arg163, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %349 = stablehlo.multiply %347, %348 : tensor<1x512x28x28xf32>
-    %350 = stablehlo.convert %arg164 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %351 = stablehlo.broadcast_in_dim %349, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %352 = stablehlo.broadcast_in_dim %350, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %353 = stablehlo.multiply %351, %352 : tensor<1x512x28x28xf32>
-    %354 = stablehlo.convert %arg165 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %355 = stablehlo.broadcast_in_dim %353, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %356 = stablehlo.broadcast_in_dim %354, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %357 = stablehlo.add %355, %356 : tensor<1x512x28x28xf32>
-    %358 = stablehlo.convert %357 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %359 = stablehlo.maximum %358, %cst_5 : tensor<1x512x28x28xbf16>
-    %360 = stablehlo.convolution(%359, %arg21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x512x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %361 = stablehlo.convert %360 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %362 = stablehlo.broadcast_in_dim %361, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %363 = stablehlo.broadcast_in_dim %arg166, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %364 = stablehlo.subtract %362, %363 : tensor<1x512x28x28xf32>
-    %365 = stablehlo.broadcast_in_dim %364, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %366 = stablehlo.broadcast_in_dim %arg167, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %367 = stablehlo.multiply %365, %366 : tensor<1x512x28x28xf32>
-    %368 = stablehlo.convert %arg168 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %369 = stablehlo.broadcast_in_dim %367, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %370 = stablehlo.broadcast_in_dim %368, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %371 = stablehlo.multiply %369, %370 : tensor<1x512x28x28xf32>
-    %372 = stablehlo.convert %arg169 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %373 = stablehlo.broadcast_in_dim %371, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %374 = stablehlo.broadcast_in_dim %372, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %375 = stablehlo.add %373, %374 : tensor<1x512x28x28xf32>
-    %376 = stablehlo.convert %375 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %377 = stablehlo.maximum %376, %cst_5 : tensor<1x512x28x28xbf16>
-    %378 = stablehlo.convolution(%377, %arg22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 512 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x1x3x3xbf16>) -> tensor<1x512x28x28xbf16>
-    %379 = stablehlo.convert %378 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %380 = stablehlo.broadcast_in_dim %379, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %381 = stablehlo.broadcast_in_dim %arg170, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %382 = stablehlo.subtract %380, %381 : tensor<1x512x28x28xf32>
-    %383 = stablehlo.broadcast_in_dim %382, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %384 = stablehlo.broadcast_in_dim %arg171, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %385 = stablehlo.multiply %383, %384 : tensor<1x512x28x28xf32>
-    %386 = stablehlo.convert %arg172 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %387 = stablehlo.broadcast_in_dim %385, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %388 = stablehlo.broadcast_in_dim %386, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %389 = stablehlo.multiply %387, %388 : tensor<1x512x28x28xf32>
-    %390 = stablehlo.convert %arg173 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %391 = stablehlo.broadcast_in_dim %389, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %392 = stablehlo.broadcast_in_dim %390, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %393 = stablehlo.add %391, %392 : tensor<1x512x28x28xf32>
-    %394 = stablehlo.convert %393 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %395 = stablehlo.maximum %394, %cst_5 : tensor<1x512x28x28xbf16>
-    %396 = stablehlo.convolution(%395, %arg23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<512x512x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %397 = stablehlo.convert %396 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %398 = stablehlo.broadcast_in_dim %397, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %399 = stablehlo.broadcast_in_dim %arg174, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %400 = stablehlo.subtract %398, %399 : tensor<1x512x28x28xf32>
-    %401 = stablehlo.broadcast_in_dim %400, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %402 = stablehlo.broadcast_in_dim %arg175, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %403 = stablehlo.multiply %401, %402 : tensor<1x512x28x28xf32>
-    %404 = stablehlo.convert %arg176 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %405 = stablehlo.broadcast_in_dim %403, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %406 = stablehlo.broadcast_in_dim %404, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %407 = stablehlo.multiply %405, %406 : tensor<1x512x28x28xf32>
-    %408 = stablehlo.convert %arg177 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %409 = stablehlo.broadcast_in_dim %407, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %410 = stablehlo.broadcast_in_dim %408, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %411 = stablehlo.add %409, %410 : tensor<1x512x28x28xf32>
-    %412 = stablehlo.convert %411 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %413 = stablehlo.maximum %412, %cst_5 : tensor<1x512x28x28xbf16>
-    %414 = stablehlo.convolution(%413, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<128x512x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %415 = stablehlo.reshape %arg25 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %416 = stablehlo.broadcast_in_dim %414, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %417 = stablehlo.broadcast_in_dim %415, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %418 = stablehlo.add %416, %417 : tensor<1x128x28x28xbf16>
-    %419 = stablehlo.maximum %418, %cst_3 : tensor<1x128x28x28xbf16>
-    %420 = stablehlo.convolution(%419, %arg26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x1x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %421 = stablehlo.convert %c : (tensor<i64>) -> tensor<bf16>
-    %422 = stablehlo.broadcast_in_dim %421, dims = [] : (tensor<bf16>) -> tensor<1x128x28x28xbf16>
-    %423 = stablehlo.broadcast_in_dim %420, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %424 = stablehlo.maximum %422, %423 : tensor<1x128x28x28xbf16>
-    %425 = stablehlo.convert %cst_6 : (tensor<1xi64>) -> tensor<1xbf16>
-    %426 = stablehlo.reshape %425 : (tensor<1xbf16>) -> tensor<bf16>
-    %427 = stablehlo.broadcast_in_dim %424, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %428 = stablehlo.broadcast_in_dim %426, dims = [] : (tensor<bf16>) -> tensor<1x128x28x28xbf16>
-    %429 = stablehlo.multiply %427, %428 : tensor<1x128x28x28xbf16>
-    %430 = stablehlo.minimum %422, %423 : tensor<1x128x28x28xbf16>
-    %431 = stablehlo.broadcast_in_dim %430, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %432 = stablehlo.multiply %431, %428 : tensor<1x128x28x28xbf16>
-    %433 = stablehlo.exponential %432 : tensor<1x128x28x28xbf16>
-    %434 = stablehlo.convert %cst_7 : (tensor<1xf64>) -> tensor<1xbf16>
-    %435 = stablehlo.reshape %434 : (tensor<1xbf16>) -> tensor<bf16>
-    %436 = stablehlo.broadcast_in_dim %433, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %437 = stablehlo.broadcast_in_dim %435, dims = [] : (tensor<bf16>) -> tensor<1x128x28x28xbf16>
-    %438 = stablehlo.subtract %436, %437 : tensor<1x128x28x28xbf16>
-    %439 = stablehlo.broadcast_in_dim %438, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %440 = stablehlo.multiply %439, %428 : tensor<1x128x28x28xbf16>
-    %441 = stablehlo.broadcast_in_dim %440, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %442 = stablehlo.multiply %441, %437 : tensor<1x128x28x28xbf16>
-    %443 = stablehlo.add %429, %442 : tensor<1x128x28x28xbf16>
-    %444 = stablehlo.convolution(%443, %arg27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %445 = stablehlo.broadcast_in_dim %444, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %446 = stablehlo.maximum %422, %445 : tensor<1x128x28x28xbf16>
-    %447 = stablehlo.broadcast_in_dim %446, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %448 = stablehlo.multiply %447, %428 : tensor<1x128x28x28xbf16>
-    %449 = stablehlo.minimum %422, %445 : tensor<1x128x28x28xbf16>
-    %450 = stablehlo.broadcast_in_dim %449, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %451 = stablehlo.multiply %450, %428 : tensor<1x128x28x28xbf16>
-    %452 = stablehlo.exponential %451 : tensor<1x128x28x28xbf16>
-    %453 = stablehlo.broadcast_in_dim %452, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %454 = stablehlo.subtract %453, %437 : tensor<1x128x28x28xbf16>
-    %455 = stablehlo.broadcast_in_dim %454, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %456 = stablehlo.multiply %455, %428 : tensor<1x128x28x28xbf16>
-    %457 = stablehlo.broadcast_in_dim %456, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %458 = stablehlo.multiply %457, %437 : tensor<1x128x28x28xbf16>
-    %459 = stablehlo.add %448, %458 : tensor<1x128x28x28xbf16>
-    %460 = stablehlo.convolution(%459, %arg28) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x1x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %461 = stablehlo.broadcast_in_dim %460, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %462 = stablehlo.maximum %422, %461 : tensor<1x128x28x28xbf16>
-    %463 = stablehlo.broadcast_in_dim %462, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %464 = stablehlo.multiply %463, %428 : tensor<1x128x28x28xbf16>
-    %465 = stablehlo.minimum %422, %461 : tensor<1x128x28x28xbf16>
-    %466 = stablehlo.broadcast_in_dim %465, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %467 = stablehlo.multiply %466, %428 : tensor<1x128x28x28xbf16>
-    %468 = stablehlo.exponential %467 : tensor<1x128x28x28xbf16>
-    %469 = stablehlo.broadcast_in_dim %468, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %470 = stablehlo.subtract %469, %437 : tensor<1x128x28x28xbf16>
-    %471 = stablehlo.broadcast_in_dim %470, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %472 = stablehlo.multiply %471, %428 : tensor<1x128x28x28xbf16>
-    %473 = stablehlo.broadcast_in_dim %472, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %474 = stablehlo.multiply %473, %437 : tensor<1x128x28x28xbf16>
-    %475 = stablehlo.add %464, %474 : tensor<1x128x28x28xbf16>
-    %476 = stablehlo.convolution(%475, %arg29) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %477 = stablehlo.broadcast_in_dim %476, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %478 = stablehlo.maximum %422, %477 : tensor<1x128x28x28xbf16>
-    %479 = stablehlo.broadcast_in_dim %478, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %480 = stablehlo.multiply %479, %428 : tensor<1x128x28x28xbf16>
-    %481 = stablehlo.minimum %422, %477 : tensor<1x128x28x28xbf16>
-    %482 = stablehlo.broadcast_in_dim %481, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %483 = stablehlo.multiply %482, %428 : tensor<1x128x28x28xbf16>
-    %484 = stablehlo.exponential %483 : tensor<1x128x28x28xbf16>
-    %485 = stablehlo.broadcast_in_dim %484, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %486 = stablehlo.subtract %485, %437 : tensor<1x128x28x28xbf16>
-    %487 = stablehlo.broadcast_in_dim %486, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %488 = stablehlo.multiply %487, %428 : tensor<1x128x28x28xbf16>
-    %489 = stablehlo.broadcast_in_dim %488, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %490 = stablehlo.multiply %489, %437 : tensor<1x128x28x28xbf16>
-    %491 = stablehlo.add %480, %490 : tensor<1x128x28x28xbf16>
-    %492 = stablehlo.convolution(%491, %arg30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x1x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %493 = stablehlo.broadcast_in_dim %492, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %494 = stablehlo.maximum %422, %493 : tensor<1x128x28x28xbf16>
-    %495 = stablehlo.broadcast_in_dim %494, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %496 = stablehlo.multiply %495, %428 : tensor<1x128x28x28xbf16>
-    %497 = stablehlo.minimum %422, %493 : tensor<1x128x28x28xbf16>
-    %498 = stablehlo.broadcast_in_dim %497, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %499 = stablehlo.multiply %498, %428 : tensor<1x128x28x28xbf16>
-    %500 = stablehlo.exponential %499 : tensor<1x128x28x28xbf16>
-    %501 = stablehlo.broadcast_in_dim %500, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %502 = stablehlo.subtract %501, %437 : tensor<1x128x28x28xbf16>
-    %503 = stablehlo.broadcast_in_dim %502, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %504 = stablehlo.multiply %503, %428 : tensor<1x128x28x28xbf16>
-    %505 = stablehlo.broadcast_in_dim %504, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %506 = stablehlo.multiply %505, %437 : tensor<1x128x28x28xbf16>
-    %507 = stablehlo.add %496, %506 : tensor<1x128x28x28xbf16>
-    %508 = stablehlo.convolution(%507, %arg31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %509 = stablehlo.broadcast_in_dim %508, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %510 = stablehlo.maximum %422, %509 : tensor<1x128x28x28xbf16>
-    %511 = stablehlo.broadcast_in_dim %510, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %512 = stablehlo.multiply %511, %428 : tensor<1x128x28x28xbf16>
-    %513 = stablehlo.minimum %422, %509 : tensor<1x128x28x28xbf16>
-    %514 = stablehlo.broadcast_in_dim %513, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %515 = stablehlo.multiply %514, %428 : tensor<1x128x28x28xbf16>
-    %516 = stablehlo.exponential %515 : tensor<1x128x28x28xbf16>
-    %517 = stablehlo.broadcast_in_dim %516, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %518 = stablehlo.subtract %517, %437 : tensor<1x128x28x28xbf16>
-    %519 = stablehlo.broadcast_in_dim %518, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %520 = stablehlo.multiply %519, %428 : tensor<1x128x28x28xbf16>
-    %521 = stablehlo.broadcast_in_dim %520, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %522 = stablehlo.multiply %521, %437 : tensor<1x128x28x28xbf16>
-    %523 = stablehlo.add %512, %522 : tensor<1x128x28x28xbf16>
-    %524 = stablehlo.add %419, %523 : tensor<1x128x28x28xbf16>
-    %525 = stablehlo.convolution(%524, %arg32) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %526 = stablehlo.reshape %arg33 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %527 = stablehlo.broadcast_in_dim %525, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %528 = stablehlo.broadcast_in_dim %526, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %529 = stablehlo.add %527, %528 : tensor<1x128x28x28xbf16>
-    %530 = stablehlo.maximum %529, %cst_3 : tensor<1x128x28x28xbf16>
-    %531 = stablehlo.convolution(%530, %arg34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %532 = stablehlo.reshape %arg35 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %533 = stablehlo.broadcast_in_dim %531, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %534 = stablehlo.broadcast_in_dim %532, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %535 = stablehlo.add %533, %534 : tensor<1x128x28x28xbf16>
-    %536 = stablehlo.maximum %535, %cst_3 : tensor<1x128x28x28xbf16>
-    %537 = stablehlo.convolution(%536, %arg36) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %538 = stablehlo.reshape %arg37 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %539 = stablehlo.broadcast_in_dim %537, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %540 = stablehlo.broadcast_in_dim %538, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %541 = stablehlo.add %539, %540 : tensor<1x128x28x28xbf16>
-    %542 = stablehlo.maximum %541, %cst_3 : tensor<1x128x28x28xbf16>
-    %543 = stablehlo.convolution(%542, %arg38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %544 = stablehlo.reshape %arg39 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %545 = stablehlo.broadcast_in_dim %543, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %546 = stablehlo.broadcast_in_dim %544, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %547 = stablehlo.add %545, %546 : tensor<1x128x28x28xbf16>
-    %548 = stablehlo.maximum %547, %cst_3 : tensor<1x128x28x28xbf16>
-    %549 = stablehlo.convolution(%548, %arg40) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %550 = stablehlo.reshape %arg41 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %551 = stablehlo.broadcast_in_dim %549, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xbf16>
-    %552 = stablehlo.broadcast_in_dim %550, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %553 = stablehlo.add %551, %552 : tensor<1x512x28x28xbf16>
-    %554 = stablehlo.maximum %553, %cst_5 : tensor<1x512x28x28xbf16>
-    %555 = stablehlo.convolution(%554, %arg42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<19x512x1x1xbf16>) -> tensor<1x19x28x28xbf16>
-    %556 = stablehlo.reshape %arg43 : (tensor<19xbf16>) -> tensor<19x1x1xbf16>
-    %557 = stablehlo.broadcast_in_dim %555, dims = [0, 1, 2, 3] : (tensor<1x19x28x28xbf16>) -> tensor<1x19x28x28xbf16>
-    %558 = stablehlo.broadcast_in_dim %556, dims = [1, 2, 3] : (tensor<19x1x1xbf16>) -> tensor<1x19x28x28xbf16>
-    %559 = stablehlo.add %557, %558 : tensor<1x19x28x28xbf16>
-    %560 = stablehlo.convolution(%548, %arg44) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %561 = stablehlo.reshape %arg45 : (tensor<512xbf16>) -> tensor<512x1x1xbf16>
-    %562 = stablehlo.broadcast_in_dim %560, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xbf16>
-    %563 = stablehlo.broadcast_in_dim %561, dims = [1, 2, 3] : (tensor<512x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %564 = stablehlo.add %562, %563 : tensor<1x512x28x28xbf16>
-    %565 = stablehlo.maximum %564, %cst_5 : tensor<1x512x28x28xbf16>
-    %566 = stablehlo.convolution(%565, %arg46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<38x512x1x1xbf16>) -> tensor<1x38x28x28xbf16>
-    %567 = stablehlo.reshape %arg47 : (tensor<38xbf16>) -> tensor<38x1x1xbf16>
-    %568 = stablehlo.broadcast_in_dim %566, dims = [0, 1, 2, 3] : (tensor<1x38x28x28xbf16>) -> tensor<1x38x28x28xbf16>
-    %569 = stablehlo.broadcast_in_dim %567, dims = [1, 2, 3] : (tensor<38x1x1xbf16>) -> tensor<1x38x28x28xbf16>
-    %570 = stablehlo.add %568, %569 : tensor<1x38x28x28xbf16>
-    %571 = stablehlo.concatenate %530, %559, %570, dim = 1 : (tensor<1x128x28x28xbf16>, tensor<1x19x28x28xbf16>, tensor<1x38x28x28xbf16>) -> tensor<1x185x28x28xbf16>
-    %572 = stablehlo.slice %571 [0:1, 0:128, 0:28, 0:28] : (tensor<1x185x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %573 = stablehlo.convolution(%571, %arg48) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x185x28x28xbf16>, tensor<128x185x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %574 = stablehlo.reshape %arg49 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %575 = stablehlo.broadcast_in_dim %573, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %576 = stablehlo.broadcast_in_dim %574, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %577 = stablehlo.add %575, %576 : tensor<1x128x28x28xbf16>
-    %578 = stablehlo.maximum %577, %cst_3 : tensor<1x128x28x28xbf16>
-    %579 = stablehlo.convolution(%578, %arg50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %580 = stablehlo.reshape %arg51 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %581 = stablehlo.broadcast_in_dim %579, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %582 = stablehlo.broadcast_in_dim %580, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %583 = stablehlo.add %581, %582 : tensor<1x128x28x28xbf16>
-    %584 = stablehlo.convert %583 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %585 = stablehlo.broadcast_in_dim %584, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %586 = stablehlo.broadcast_in_dim %arg178, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %587 = stablehlo.subtract %585, %586 : tensor<1x128x28x28xf32>
-    %588 = stablehlo.broadcast_in_dim %587, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %589 = stablehlo.broadcast_in_dim %arg179, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %590 = stablehlo.multiply %588, %589 : tensor<1x128x28x28xf32>
-    %591 = stablehlo.convert %arg180 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %592 = stablehlo.broadcast_in_dim %590, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %593 = stablehlo.broadcast_in_dim %591, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %594 = stablehlo.multiply %592, %593 : tensor<1x128x28x28xf32>
-    %595 = stablehlo.convert %arg181 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %596 = stablehlo.broadcast_in_dim %594, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %597 = stablehlo.broadcast_in_dim %595, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %598 = stablehlo.add %596, %597 : tensor<1x128x28x28xf32>
-    %599 = stablehlo.convert %598 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %600 = stablehlo.maximum %599, %cst_3 : tensor<1x128x28x28xbf16>
-    %601 = stablehlo.convolution(%600, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %602 = stablehlo.reshape %arg53 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %603 = stablehlo.broadcast_in_dim %601, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %604 = stablehlo.broadcast_in_dim %602, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %605 = stablehlo.add %603, %604 : tensor<1x128x28x28xbf16>
-    %606 = stablehlo.convert %605 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %607 = stablehlo.broadcast_in_dim %606, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %608 = stablehlo.broadcast_in_dim %arg182, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %609 = stablehlo.subtract %607, %608 : tensor<1x128x28x28xf32>
-    %610 = stablehlo.broadcast_in_dim %609, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %611 = stablehlo.broadcast_in_dim %arg183, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %612 = stablehlo.multiply %610, %611 : tensor<1x128x28x28xf32>
-    %613 = stablehlo.convert %arg184 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %614 = stablehlo.broadcast_in_dim %612, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %615 = stablehlo.broadcast_in_dim %613, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %616 = stablehlo.multiply %614, %615 : tensor<1x128x28x28xf32>
-    %617 = stablehlo.convert %arg185 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %618 = stablehlo.broadcast_in_dim %616, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %619 = stablehlo.broadcast_in_dim %617, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %620 = stablehlo.add %618, %619 : tensor<1x128x28x28xf32>
-    %621 = stablehlo.convert %620 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %622 = stablehlo.maximum %621, %cst_3 : tensor<1x128x28x28xbf16>
-    %623 = stablehlo.add %578, %622 : tensor<1x128x28x28xbf16>
-    %624 = stablehlo.convolution(%623, %arg54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %625 = stablehlo.reshape %arg55 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %626 = stablehlo.broadcast_in_dim %624, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %627 = stablehlo.broadcast_in_dim %625, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %628 = stablehlo.add %626, %627 : tensor<1x128x28x28xbf16>
-    %629 = stablehlo.maximum %628, %cst_3 : tensor<1x128x28x28xbf16>
-    %630 = stablehlo.convolution(%629, %arg56) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %631 = stablehlo.reshape %arg57 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %632 = stablehlo.broadcast_in_dim %630, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %633 = stablehlo.broadcast_in_dim %631, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %634 = stablehlo.add %632, %633 : tensor<1x128x28x28xbf16>
-    %635 = stablehlo.convert %634 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %636 = stablehlo.broadcast_in_dim %635, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %637 = stablehlo.broadcast_in_dim %arg186, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %638 = stablehlo.subtract %636, %637 : tensor<1x128x28x28xf32>
-    %639 = stablehlo.broadcast_in_dim %638, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %640 = stablehlo.broadcast_in_dim %arg187, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %641 = stablehlo.multiply %639, %640 : tensor<1x128x28x28xf32>
-    %642 = stablehlo.convert %arg188 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %643 = stablehlo.broadcast_in_dim %641, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %644 = stablehlo.broadcast_in_dim %642, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %645 = stablehlo.multiply %643, %644 : tensor<1x128x28x28xf32>
-    %646 = stablehlo.convert %arg189 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %647 = stablehlo.broadcast_in_dim %645, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %648 = stablehlo.broadcast_in_dim %646, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %649 = stablehlo.add %647, %648 : tensor<1x128x28x28xf32>
-    %650 = stablehlo.convert %649 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %651 = stablehlo.maximum %650, %cst_3 : tensor<1x128x28x28xbf16>
-    %652 = stablehlo.convolution(%651, %arg58) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %653 = stablehlo.reshape %arg59 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %654 = stablehlo.broadcast_in_dim %652, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %655 = stablehlo.broadcast_in_dim %653, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %656 = stablehlo.add %654, %655 : tensor<1x128x28x28xbf16>
-    %657 = stablehlo.convert %656 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %658 = stablehlo.broadcast_in_dim %657, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %659 = stablehlo.broadcast_in_dim %arg190, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %660 = stablehlo.subtract %658, %659 : tensor<1x128x28x28xf32>
-    %661 = stablehlo.broadcast_in_dim %660, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %662 = stablehlo.broadcast_in_dim %arg191, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %663 = stablehlo.multiply %661, %662 : tensor<1x128x28x28xf32>
-    %664 = stablehlo.convert %arg192 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %665 = stablehlo.broadcast_in_dim %663, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %666 = stablehlo.broadcast_in_dim %664, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %667 = stablehlo.multiply %665, %666 : tensor<1x128x28x28xf32>
-    %668 = stablehlo.convert %arg193 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %669 = stablehlo.broadcast_in_dim %667, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %670 = stablehlo.broadcast_in_dim %668, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %671 = stablehlo.add %669, %670 : tensor<1x128x28x28xf32>
-    %672 = stablehlo.convert %671 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %673 = stablehlo.maximum %672, %cst_3 : tensor<1x128x28x28xbf16>
-    %674 = stablehlo.add %629, %673 : tensor<1x128x28x28xbf16>
-    %675 = stablehlo.convolution(%674, %arg60) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %676 = stablehlo.reshape %arg61 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %677 = stablehlo.broadcast_in_dim %675, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %678 = stablehlo.broadcast_in_dim %676, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %679 = stablehlo.add %677, %678 : tensor<1x128x28x28xbf16>
-    %680 = stablehlo.maximum %679, %cst_3 : tensor<1x128x28x28xbf16>
-    %681 = stablehlo.convolution(%680, %arg62) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %682 = stablehlo.reshape %arg63 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %683 = stablehlo.broadcast_in_dim %681, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %684 = stablehlo.broadcast_in_dim %682, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %685 = stablehlo.add %683, %684 : tensor<1x128x28x28xbf16>
-    %686 = stablehlo.convert %685 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %687 = stablehlo.broadcast_in_dim %686, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %688 = stablehlo.broadcast_in_dim %arg194, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %689 = stablehlo.subtract %687, %688 : tensor<1x128x28x28xf32>
-    %690 = stablehlo.broadcast_in_dim %689, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %691 = stablehlo.broadcast_in_dim %arg195, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %692 = stablehlo.multiply %690, %691 : tensor<1x128x28x28xf32>
-    %693 = stablehlo.convert %arg196 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %694 = stablehlo.broadcast_in_dim %692, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %695 = stablehlo.broadcast_in_dim %693, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %696 = stablehlo.multiply %694, %695 : tensor<1x128x28x28xf32>
-    %697 = stablehlo.convert %arg197 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %698 = stablehlo.broadcast_in_dim %696, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %699 = stablehlo.broadcast_in_dim %697, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %700 = stablehlo.add %698, %699 : tensor<1x128x28x28xf32>
-    %701 = stablehlo.convert %700 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %702 = stablehlo.maximum %701, %cst_3 : tensor<1x128x28x28xbf16>
-    %703 = stablehlo.convolution(%702, %arg64) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %704 = stablehlo.reshape %arg65 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %705 = stablehlo.broadcast_in_dim %703, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %706 = stablehlo.broadcast_in_dim %704, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %707 = stablehlo.add %705, %706 : tensor<1x128x28x28xbf16>
-    %708 = stablehlo.convert %707 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %709 = stablehlo.broadcast_in_dim %708, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %710 = stablehlo.broadcast_in_dim %arg198, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %711 = stablehlo.subtract %709, %710 : tensor<1x128x28x28xf32>
-    %712 = stablehlo.broadcast_in_dim %711, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %713 = stablehlo.broadcast_in_dim %arg199, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %714 = stablehlo.multiply %712, %713 : tensor<1x128x28x28xf32>
-    %715 = stablehlo.convert %arg200 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %716 = stablehlo.broadcast_in_dim %714, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %717 = stablehlo.broadcast_in_dim %715, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %718 = stablehlo.multiply %716, %717 : tensor<1x128x28x28xf32>
-    %719 = stablehlo.convert %arg201 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %720 = stablehlo.broadcast_in_dim %718, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %721 = stablehlo.broadcast_in_dim %719, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %722 = stablehlo.add %720, %721 : tensor<1x128x28x28xf32>
-    %723 = stablehlo.convert %722 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %724 = stablehlo.maximum %723, %cst_3 : tensor<1x128x28x28xbf16>
-    %725 = stablehlo.add %680, %724 : tensor<1x128x28x28xbf16>
-    %726 = stablehlo.convolution(%725, %arg66) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %727 = stablehlo.reshape %arg67 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %728 = stablehlo.broadcast_in_dim %726, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %729 = stablehlo.broadcast_in_dim %727, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %730 = stablehlo.add %728, %729 : tensor<1x128x28x28xbf16>
-    %731 = stablehlo.maximum %730, %cst_3 : tensor<1x128x28x28xbf16>
-    %732 = stablehlo.convolution(%731, %arg68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %733 = stablehlo.reshape %arg69 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %734 = stablehlo.broadcast_in_dim %732, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %735 = stablehlo.broadcast_in_dim %733, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %736 = stablehlo.add %734, %735 : tensor<1x128x28x28xbf16>
-    %737 = stablehlo.convert %736 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %738 = stablehlo.broadcast_in_dim %737, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %739 = stablehlo.broadcast_in_dim %arg202, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %740 = stablehlo.subtract %738, %739 : tensor<1x128x28x28xf32>
-    %741 = stablehlo.broadcast_in_dim %740, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %742 = stablehlo.broadcast_in_dim %arg203, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %743 = stablehlo.multiply %741, %742 : tensor<1x128x28x28xf32>
-    %744 = stablehlo.convert %arg204 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %745 = stablehlo.broadcast_in_dim %743, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %746 = stablehlo.broadcast_in_dim %744, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %747 = stablehlo.multiply %745, %746 : tensor<1x128x28x28xf32>
-    %748 = stablehlo.convert %arg205 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %749 = stablehlo.broadcast_in_dim %747, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %750 = stablehlo.broadcast_in_dim %748, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %751 = stablehlo.add %749, %750 : tensor<1x128x28x28xf32>
-    %752 = stablehlo.convert %751 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %753 = stablehlo.maximum %752, %cst_3 : tensor<1x128x28x28xbf16>
-    %754 = stablehlo.convolution(%753, %arg70) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %755 = stablehlo.reshape %arg71 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %756 = stablehlo.broadcast_in_dim %754, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %757 = stablehlo.broadcast_in_dim %755, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %758 = stablehlo.add %756, %757 : tensor<1x128x28x28xbf16>
-    %759 = stablehlo.convert %758 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %760 = stablehlo.broadcast_in_dim %759, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %761 = stablehlo.broadcast_in_dim %arg206, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %762 = stablehlo.subtract %760, %761 : tensor<1x128x28x28xf32>
-    %763 = stablehlo.broadcast_in_dim %762, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %764 = stablehlo.broadcast_in_dim %arg207, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %765 = stablehlo.multiply %763, %764 : tensor<1x128x28x28xf32>
-    %766 = stablehlo.convert %arg208 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %767 = stablehlo.broadcast_in_dim %765, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %768 = stablehlo.broadcast_in_dim %766, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %769 = stablehlo.multiply %767, %768 : tensor<1x128x28x28xf32>
-    %770 = stablehlo.convert %arg209 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %771 = stablehlo.broadcast_in_dim %769, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %772 = stablehlo.broadcast_in_dim %770, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %773 = stablehlo.add %771, %772 : tensor<1x128x28x28xf32>
-    %774 = stablehlo.convert %773 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %775 = stablehlo.maximum %774, %cst_3 : tensor<1x128x28x28xbf16>
-    %776 = stablehlo.add %731, %775 : tensor<1x128x28x28xbf16>
-    %777 = stablehlo.convolution(%776, %arg72) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %778 = stablehlo.reshape %arg73 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %779 = stablehlo.broadcast_in_dim %777, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %780 = stablehlo.broadcast_in_dim %778, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %781 = stablehlo.add %779, %780 : tensor<1x128x28x28xbf16>
-    %782 = stablehlo.maximum %781, %cst_3 : tensor<1x128x28x28xbf16>
-    %783 = stablehlo.convolution(%782, %arg74) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %784 = stablehlo.reshape %arg75 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %785 = stablehlo.broadcast_in_dim %783, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %786 = stablehlo.broadcast_in_dim %784, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %787 = stablehlo.add %785, %786 : tensor<1x128x28x28xbf16>
-    %788 = stablehlo.convert %787 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %789 = stablehlo.broadcast_in_dim %788, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %790 = stablehlo.broadcast_in_dim %arg210, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %791 = stablehlo.subtract %789, %790 : tensor<1x128x28x28xf32>
-    %792 = stablehlo.broadcast_in_dim %791, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %793 = stablehlo.broadcast_in_dim %arg211, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %794 = stablehlo.multiply %792, %793 : tensor<1x128x28x28xf32>
-    %795 = stablehlo.convert %arg212 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %796 = stablehlo.broadcast_in_dim %794, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %797 = stablehlo.broadcast_in_dim %795, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %798 = stablehlo.multiply %796, %797 : tensor<1x128x28x28xf32>
-    %799 = stablehlo.convert %arg213 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %800 = stablehlo.broadcast_in_dim %798, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %801 = stablehlo.broadcast_in_dim %799, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %802 = stablehlo.add %800, %801 : tensor<1x128x28x28xf32>
-    %803 = stablehlo.convert %802 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %804 = stablehlo.maximum %803, %cst_3 : tensor<1x128x28x28xbf16>
-    %805 = stablehlo.convolution(%804, %arg76) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[2, 2], [2, 2]], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %806 = stablehlo.reshape %arg77 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %807 = stablehlo.broadcast_in_dim %805, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %808 = stablehlo.broadcast_in_dim %806, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %809 = stablehlo.add %807, %808 : tensor<1x128x28x28xbf16>
-    %810 = stablehlo.convert %809 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %811 = stablehlo.broadcast_in_dim %810, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %812 = stablehlo.broadcast_in_dim %arg214, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %813 = stablehlo.subtract %811, %812 : tensor<1x128x28x28xf32>
-    %814 = stablehlo.broadcast_in_dim %813, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %815 = stablehlo.broadcast_in_dim %arg215, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %816 = stablehlo.multiply %814, %815 : tensor<1x128x28x28xf32>
-    %817 = stablehlo.convert %arg216 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %818 = stablehlo.broadcast_in_dim %816, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %819 = stablehlo.broadcast_in_dim %817, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %820 = stablehlo.multiply %818, %819 : tensor<1x128x28x28xf32>
-    %821 = stablehlo.convert %arg217 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %822 = stablehlo.broadcast_in_dim %820, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %823 = stablehlo.broadcast_in_dim %821, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %824 = stablehlo.add %822, %823 : tensor<1x128x28x28xf32>
-    %825 = stablehlo.convert %824 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %826 = stablehlo.maximum %825, %cst_3 : tensor<1x128x28x28xbf16>
-    %827 = stablehlo.add %782, %826 : tensor<1x128x28x28xbf16>
-    %828 = stablehlo.convolution(%827, %arg78) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %829 = stablehlo.reshape %arg79 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %830 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %831 = stablehlo.broadcast_in_dim %829, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %832 = stablehlo.add %830, %831 : tensor<1x128x28x28xbf16>
-    %833 = stablehlo.maximum %832, %cst_3 : tensor<1x128x28x28xbf16>
-    %834 = stablehlo.convolution(%833, %arg80) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<19x128x1x1xbf16>) -> tensor<1x19x28x28xbf16>
-    %835 = stablehlo.reshape %arg81 : (tensor<19xbf16>) -> tensor<19x1x1xbf16>
-    %836 = stablehlo.broadcast_in_dim %834, dims = [0, 1, 2, 3] : (tensor<1x19x28x28xbf16>) -> tensor<1x19x28x28xbf16>
-    %837 = stablehlo.broadcast_in_dim %835, dims = [1, 2, 3] : (tensor<19x1x1xbf16>) -> tensor<1x19x28x28xbf16>
-    %838 = stablehlo.add %836, %837 : tensor<1x19x28x28xbf16>
-    %839 = stablehlo.convolution(%827, %arg82) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %840 = stablehlo.reshape %arg83 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %841 = stablehlo.broadcast_in_dim %839, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xbf16>
-    %842 = stablehlo.broadcast_in_dim %840, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %843 = stablehlo.add %841, %842 : tensor<1x128x28x28xbf16>
-    %844 = stablehlo.maximum %843, %cst_3 : tensor<1x128x28x28xbf16>
-    %845 = stablehlo.convolution(%844, %arg84) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<38x128x1x1xbf16>) -> tensor<1x38x28x28xbf16>
-    %846 = stablehlo.reshape %arg85 : (tensor<38xbf16>) -> tensor<38x1x1xbf16>
-    %847 = stablehlo.broadcast_in_dim %845, dims = [0, 1, 2, 3] : (tensor<1x38x28x28xbf16>) -> tensor<1x38x28x28xbf16>
-    %848 = stablehlo.broadcast_in_dim %846, dims = [1, 2, 3] : (tensor<38x1x1xbf16>) -> tensor<1x38x28x28xbf16>
-    %849 = stablehlo.add %847, %848 : tensor<1x38x28x28xbf16>
-    %850 = stablehlo.concatenate %572, %838, %849, dim = 1 : (tensor<1x128x28x28xbf16>, tensor<1x19x28x28xbf16>, tensor<1x38x28x28xbf16>) -> tensor<1x185x28x28xbf16>
-    %851 = stablehlo.slice %850 [0:1, 128:185, 0:28, 0:28] : (tensor<1x185x28x28xbf16>) -> tensor<1x57x28x28xbf16>
-    return %851 : tensor<1x57x28x28xbf16>
-  }
-}
diff --git a/mlir_tests/Perceiver IO.mlir b/mlir_tests/Perceiver IO.mlir
deleted file mode 100644
index f9c8b21b..00000000
--- a/mlir_tests/Perceiver IO.mlir	
+++ /dev/null
@@ -1,5604 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x2048xi64>, %arg1: tensor<1x2048xi64>, %arg2: tensor<262x768xbf16>, %arg3: tensor<768xbf16>, %arg4: tensor<768xbf16>, %arg5: tensor<1280xbf16>, %arg6: tensor<1280xbf16>, %arg7: tensor<1280xbf16>, %arg8: tensor<1280xbf16>, %arg9: tensor<1280xbf16>, %arg10: tensor<1280xbf16>, %arg11: tensor<1280xbf16>, %arg12: tensor<1280xbf16>, %arg13: tensor<1280xbf16>, %arg14: tensor<1280xbf16>, %arg15: tensor<1280xbf16>, %arg16: tensor<1280xbf16>, %arg17: tensor<1280xbf16>, %arg18: tensor<1280xbf16>, %arg19: tensor<1280xbf16>, %arg20: tensor<1280xbf16>, %arg21: tensor<1280xbf16>, %arg22: tensor<1280xbf16>, %arg23: tensor<1280xbf16>, %arg24: tensor<1280xbf16>, %arg25: tensor<1280xbf16>, %arg26: tensor<1280xbf16>, %arg27: tensor<1280xbf16>, %arg28: tensor<1280xbf16>, %arg29: tensor<1280xbf16>, %arg30: tensor<1280xbf16>, %arg31: tensor<1280xbf16>, %arg32: tensor<1280xbf16>, %arg33: tensor<1280xbf16>, %arg34: tensor<1280xbf16>, %arg35: tensor<1280xbf16>, %arg36: tensor<1280xbf16>, %arg37: tensor<1280xbf16>, %arg38: tensor<1280xbf16>, %arg39: tensor<1280xbf16>, %arg40: tensor<1280xbf16>, %arg41: tensor<1280xbf16>, %arg42: tensor<1280xbf16>, %arg43: tensor<1280xbf16>, %arg44: tensor<1280xbf16>, %arg45: tensor<1280xbf16>, %arg46: tensor<1280xbf16>, %arg47: tensor<1280xbf16>, %arg48: tensor<1280xbf16>, %arg49: tensor<1280xbf16>, %arg50: tensor<1280xbf16>, %arg51: tensor<1280xbf16>, %arg52: tensor<1280xbf16>, %arg53: tensor<1280xbf16>, %arg54: tensor<1280xbf16>, %arg55: tensor<1280xbf16>, %arg56: tensor<1280xbf16>, %arg57: tensor<1280xbf16>, %arg58: tensor<1280xbf16>, %arg59: tensor<1280xbf16>, %arg60: tensor<1280xbf16>, %arg61: tensor<1280xbf16>, %arg62: tensor<1280xbf16>, %arg63: tensor<1280xbf16>, %arg64: tensor<1280xbf16>, %arg65: tensor<1280xbf16>, %arg66: tensor<1280xbf16>, %arg67: tensor<1280xbf16>, %arg68: tensor<1280xbf16>, %arg69: tensor<1280xbf16>, %arg70: tensor<1280xbf16>, %arg71: tensor<1280xbf16>, %arg72: tensor<1280xbf16>, %arg73: tensor<1280xbf16>, %arg74: tensor<1280xbf16>, %arg75: tensor<1280xbf16>, %arg76: tensor<1280xbf16>, %arg77: tensor<1280xbf16>, %arg78: tensor<1280xbf16>, %arg79: tensor<1280xbf16>, %arg80: tensor<1280xbf16>, %arg81: tensor<1280xbf16>, %arg82: tensor<1280xbf16>, %arg83: tensor<1280xbf16>, %arg84: tensor<1280xbf16>, %arg85: tensor<1280xbf16>, %arg86: tensor<1280xbf16>, %arg87: tensor<1280xbf16>, %arg88: tensor<1280xbf16>, %arg89: tensor<1280xbf16>, %arg90: tensor<1280xbf16>, %arg91: tensor<1280xbf16>, %arg92: tensor<1280xbf16>, %arg93: tensor<1280xbf16>, %arg94: tensor<1280xbf16>, %arg95: tensor<1280xbf16>, %arg96: tensor<1280xbf16>, %arg97: tensor<1280xbf16>, %arg98: tensor<1280xbf16>, %arg99: tensor<1280xbf16>, %arg100: tensor<1280xbf16>, %arg101: tensor<1280xbf16>, %arg102: tensor<1280xbf16>, %arg103: tensor<1280xbf16>, %arg104: tensor<1280xbf16>, %arg105: tensor<1280xbf16>, %arg106: tensor<1280xbf16>, %arg107: tensor<1280xbf16>, %arg108: tensor<1280xbf16>, %arg109: tensor<1280xbf16>, %arg110: tensor<1280xbf16>, %arg111: tensor<1280xbf16>, %arg112: tensor<1280xbf16>, %arg113: tensor<768xbf16>, %arg114: tensor<768xbf16>, %arg115: tensor<262xbf16>, %arg116: tensor<2048x768xbf16>, %arg117: tensor<768x256xf32>, %arg118: tensor<256xf32>, %arg119: tensor<768x1280xf32>, %arg120: tensor<1280xf32>, %arg121: tensor<8x256x32xbf16>, %arg122: tensor<1280x1280xf32>, %arg123: tensor<1280xf32>, %arg124: tensor<1x256x1280xbf16>, %arg125: tensor<1280x1280xf32>, %arg126: tensor<1280xf32>, %arg127: tensor<1280x1280xf32>, %arg128: tensor<1280xf32>, %arg129: tensor<1280x256xf32>, %arg130: tensor<256xf32>, %arg131: tensor<1280x256xf32>, %arg132: tensor<256xf32>, %arg133: tensor<1280x1280xf32>, %arg134: tensor<1280xf32>, %arg135: tensor<1280x1280xf32>, %arg136: tensor<1280xf32>, %arg137: tensor<1280x1280xf32>, %arg138: tensor<1280xf32>, %arg139: tensor<1280x1280xf32>, %arg140: tensor<1280xf32>, %arg141: tensor<1280x256xf32>, %arg142: tensor<256xf32>, %arg143: tensor<1280x256xf32>, %arg144: tensor<256xf32>, %arg145: tensor<1280x1280xf32>, %arg146: tensor<1280xf32>, %arg147: tensor<1280x1280xf32>, %arg148: tensor<1280xf32>, %arg149: tensor<1280x1280xf32>, %arg150: tensor<1280xf32>, %arg151: tensor<1280x1280xf32>, %arg152: tensor<1280xf32>, %arg153: tensor<1280x256xf32>, %arg154: tensor<256xf32>, %arg155: tensor<1280x256xf32>, %arg156: tensor<256xf32>, %arg157: tensor<1280x1280xf32>, %arg158: tensor<1280xf32>, %arg159: tensor<1280x1280xf32>, %arg160: tensor<1280xf32>, %arg161: tensor<1280x1280xf32>, %arg162: tensor<1280xf32>, %arg163: tensor<1280x1280xf32>, %arg164: tensor<1280xf32>, %arg165: tensor<1280x256xf32>, %arg166: tensor<256xf32>, %arg167: tensor<1280x256xf32>, %arg168: tensor<256xf32>, %arg169: tensor<1280x1280xf32>, %arg170: tensor<1280xf32>, %arg171: tensor<1280x1280xf32>, %arg172: tensor<1280xf32>, %arg173: tensor<1280x1280xf32>, %arg174: tensor<1280xf32>, %arg175: tensor<1280x1280xf32>, %arg176: tensor<1280xf32>, %arg177: tensor<1280x256xf32>, %arg178: tensor<256xf32>, %arg179: tensor<1280x256xf32>, %arg180: tensor<256xf32>, %arg181: tensor<1280x1280xf32>, %arg182: tensor<1280xf32>, %arg183: tensor<1280x1280xf32>, %arg184: tensor<1280xf32>, %arg185: tensor<1280x1280xf32>, %arg186: tensor<1280xf32>, %arg187: tensor<1280x1280xf32>, %arg188: tensor<1280xf32>, %arg189: tensor<1280x256xf32>, %arg190: tensor<256xf32>, %arg191: tensor<1280x256xf32>, %arg192: tensor<256xf32>, %arg193: tensor<1280x1280xf32>, %arg194: tensor<1280xf32>, %arg195: tensor<1280x1280xf32>, %arg196: tensor<1280xf32>, %arg197: tensor<1280x1280xf32>, %arg198: tensor<1280xf32>, %arg199: tensor<1280x1280xf32>, %arg200: tensor<1280xf32>, %arg201: tensor<1280x256xf32>, %arg202: tensor<256xf32>, %arg203: tensor<1280x256xf32>, %arg204: tensor<256xf32>, %arg205: tensor<1280x1280xf32>, %arg206: tensor<1280xf32>, %arg207: tensor<1280x1280xf32>, %arg208: tensor<1280xf32>, %arg209: tensor<1280x1280xf32>, %arg210: tensor<1280xf32>, %arg211: tensor<1280x1280xf32>, %arg212: tensor<1280xf32>, %arg213: tensor<1280x256xf32>, %arg214: tensor<256xf32>, %arg215: tensor<1280x256xf32>, %arg216: tensor<256xf32>, %arg217: tensor<1280x1280xf32>, %arg218: tensor<1280xf32>, %arg219: tensor<1280x1280xf32>, %arg220: tensor<1280xf32>, %arg221: tensor<1280x1280xf32>, %arg222: tensor<1280xf32>, %arg223: tensor<1280x1280xf32>, %arg224: tensor<1280xf32>, %arg225: tensor<1280x256xf32>, %arg226: tensor<256xf32>, %arg227: tensor<1280x256xf32>, %arg228: tensor<256xf32>, %arg229: tensor<1280x1280xf32>, %arg230: tensor<1280xf32>, %arg231: tensor<1280x1280xf32>, %arg232: tensor<1280xf32>, %arg233: tensor<1280x1280xf32>, %arg234: tensor<1280xf32>, %arg235: tensor<1280x1280xf32>, %arg236: tensor<1280xf32>, %arg237: tensor<1280x256xf32>, %arg238: tensor<256xf32>, %arg239: tensor<1280x256xf32>, %arg240: tensor<256xf32>, %arg241: tensor<1280x1280xf32>, %arg242: tensor<1280xf32>, %arg243: tensor<1280x1280xf32>, %arg244: tensor<1280xf32>, %arg245: tensor<1280x1280xf32>, %arg246: tensor<1280xf32>, %arg247: tensor<1280x1280xf32>, %arg248: tensor<1280xf32>, %arg249: tensor<1280x256xf32>, %arg250: tensor<256xf32>, %arg251: tensor<1280x256xf32>, %arg252: tensor<256xf32>, %arg253: tensor<1280x1280xf32>, %arg254: tensor<1280xf32>, %arg255: tensor<1280x1280xf32>, %arg256: tensor<1280xf32>, %arg257: tensor<1280x1280xf32>, %arg258: tensor<1280xf32>, %arg259: tensor<1280x1280xf32>, %arg260: tensor<1280xf32>, %arg261: tensor<1280x256xf32>, %arg262: tensor<256xf32>, %arg263: tensor<1280x256xf32>, %arg264: tensor<256xf32>, %arg265: tensor<1280x1280xf32>, %arg266: tensor<1280xf32>, %arg267: tensor<1280x1280xf32>, %arg268: tensor<1280xf32>, %arg269: tensor<1280x1280xf32>, %arg270: tensor<1280xf32>, %arg271: tensor<1280x1280xf32>, %arg272: tensor<1280xf32>, %arg273: tensor<1280x256xf32>, %arg274: tensor<256xf32>, %arg275: tensor<1280x256xf32>, %arg276: tensor<256xf32>, %arg277: tensor<1280x1280xf32>, %arg278: tensor<1280xf32>, %arg279: tensor<1280x1280xf32>, %arg280: tensor<1280xf32>, %arg281: tensor<1280x1280xf32>, %arg282: tensor<1280xf32>, %arg283: tensor<1280x1280xf32>, %arg284: tensor<1280xf32>, %arg285: tensor<1280x256xf32>, %arg286: tensor<256xf32>, %arg287: tensor<1280x256xf32>, %arg288: tensor<256xf32>, %arg289: tensor<1280x1280xf32>, %arg290: tensor<1280xf32>, %arg291: tensor<1280x1280xf32>, %arg292: tensor<1280xf32>, %arg293: tensor<1280x1280xf32>, %arg294: tensor<1280xf32>, %arg295: tensor<1280x1280xf32>, %arg296: tensor<1280xf32>, %arg297: tensor<1280x256xf32>, %arg298: tensor<256xf32>, %arg299: tensor<1280x256xf32>, %arg300: tensor<256xf32>, %arg301: tensor<1280x1280xf32>, %arg302: tensor<1280xf32>, %arg303: tensor<1280x1280xf32>, %arg304: tensor<1280xf32>, %arg305: tensor<1280x1280xf32>, %arg306: tensor<1280xf32>, %arg307: tensor<1280x1280xf32>, %arg308: tensor<1280xf32>, %arg309: tensor<1280x256xf32>, %arg310: tensor<256xf32>, %arg311: tensor<1280x256xf32>, %arg312: tensor<256xf32>, %arg313: tensor<1280x1280xf32>, %arg314: tensor<1280xf32>, %arg315: tensor<1280x1280xf32>, %arg316: tensor<1280xf32>, %arg317: tensor<1280x1280xf32>, %arg318: tensor<1280xf32>, %arg319: tensor<1280x1280xf32>, %arg320: tensor<1280xf32>, %arg321: tensor<1280x256xf32>, %arg322: tensor<256xf32>, %arg323: tensor<1280x256xf32>, %arg324: tensor<256xf32>, %arg325: tensor<1280x1280xf32>, %arg326: tensor<1280xf32>, %arg327: tensor<1280x1280xf32>, %arg328: tensor<1280xf32>, %arg329: tensor<1280x1280xf32>, %arg330: tensor<1280xf32>, %arg331: tensor<1280x1280xf32>, %arg332: tensor<1280xf32>, %arg333: tensor<1280x256xf32>, %arg334: tensor<256xf32>, %arg335: tensor<1280x256xf32>, %arg336: tensor<256xf32>, %arg337: tensor<1280x1280xf32>, %arg338: tensor<1280xf32>, %arg339: tensor<1280x1280xf32>, %arg340: tensor<1280xf32>, %arg341: tensor<1280x1280xf32>, %arg342: tensor<1280xf32>, %arg343: tensor<1280x1280xf32>, %arg344: tensor<1280xf32>, %arg345: tensor<1280x256xf32>, %arg346: tensor<256xf32>, %arg347: tensor<1280x256xf32>, %arg348: tensor<256xf32>, %arg349: tensor<1280x1280xf32>, %arg350: tensor<1280xf32>, %arg351: tensor<1280x1280xf32>, %arg352: tensor<1280xf32>, %arg353: tensor<1280x1280xf32>, %arg354: tensor<1280xf32>, %arg355: tensor<1280x1280xf32>, %arg356: tensor<1280xf32>, %arg357: tensor<1280x256xf32>, %arg358: tensor<256xf32>, %arg359: tensor<1280x256xf32>, %arg360: tensor<256xf32>, %arg361: tensor<1280x1280xf32>, %arg362: tensor<1280xf32>, %arg363: tensor<1280x1280xf32>, %arg364: tensor<1280xf32>, %arg365: tensor<1280x1280xf32>, %arg366: tensor<1280xf32>, %arg367: tensor<1280x1280xf32>, %arg368: tensor<1280xf32>, %arg369: tensor<1280x256xf32>, %arg370: tensor<256xf32>, %arg371: tensor<1280x256xf32>, %arg372: tensor<256xf32>, %arg373: tensor<1280x1280xf32>, %arg374: tensor<1280xf32>, %arg375: tensor<1280x1280xf32>, %arg376: tensor<1280xf32>, %arg377: tensor<1280x1280xf32>, %arg378: tensor<1280xf32>, %arg379: tensor<1280x1280xf32>, %arg380: tensor<1280xf32>, %arg381: tensor<1280x256xf32>, %arg382: tensor<256xf32>, %arg383: tensor<1280x256xf32>, %arg384: tensor<256xf32>, %arg385: tensor<1280x1280xf32>, %arg386: tensor<1280xf32>, %arg387: tensor<1280x1280xf32>, %arg388: tensor<1280xf32>, %arg389: tensor<1280x1280xf32>, %arg390: tensor<1280xf32>, %arg391: tensor<1280x1280xf32>, %arg392: tensor<1280xf32>, %arg393: tensor<1280x256xf32>, %arg394: tensor<256xf32>, %arg395: tensor<1280x256xf32>, %arg396: tensor<256xf32>, %arg397: tensor<1280x1280xf32>, %arg398: tensor<1280xf32>, %arg399: tensor<1280x1280xf32>, %arg400: tensor<1280xf32>, %arg401: tensor<1280x1280xf32>, %arg402: tensor<1280xf32>, %arg403: tensor<1280x1280xf32>, %arg404: tensor<1280xf32>, %arg405: tensor<1280x256xf32>, %arg406: tensor<256xf32>, %arg407: tensor<1280x256xf32>, %arg408: tensor<256xf32>, %arg409: tensor<1280x1280xf32>, %arg410: tensor<1280xf32>, %arg411: tensor<1280x1280xf32>, %arg412: tensor<1280xf32>, %arg413: tensor<1280x1280xf32>, %arg414: tensor<1280xf32>, %arg415: tensor<1280x1280xf32>, %arg416: tensor<1280xf32>, %arg417: tensor<1280x256xf32>, %arg418: tensor<256xf32>, %arg419: tensor<1280x256xf32>, %arg420: tensor<256xf32>, %arg421: tensor<1280x1280xf32>, %arg422: tensor<1280xf32>, %arg423: tensor<1280x1280xf32>, %arg424: tensor<1280xf32>, %arg425: tensor<1280x1280xf32>, %arg426: tensor<1280xf32>, %arg427: tensor<1280x1280xf32>, %arg428: tensor<1280xf32>, %arg429: tensor<1280x256xf32>, %arg430: tensor<256xf32>, %arg431: tensor<1280x256xf32>, %arg432: tensor<256xf32>, %arg433: tensor<1280x1280xf32>, %arg434: tensor<1280xf32>, %arg435: tensor<1280x1280xf32>, %arg436: tensor<1280xf32>, %arg437: tensor<1280x1280xf32>, %arg438: tensor<1280xf32>, %arg439: tensor<1280x1280xf32>, %arg440: tensor<1280xf32>, %arg441: tensor<1280x256xf32>, %arg442: tensor<256xf32>, %arg443: tensor<1280x768xf32>, %arg444: tensor<768xf32>, %arg445: tensor<8x2048x32xbf16>, %arg446: tensor<768x768xf32>, %arg447: tensor<768xf32>, %arg448: tensor<768x768xf32>, %arg449: tensor<768xf32>, %arg450: tensor<768x768xf32>, %arg451: tensor<768xf32>, %arg452: tensor<768x262xbf16>) -> tensor<1x2048x262xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_2 = stablehlo.constant dense<1.000000e+00> : tensor<1x256x1280xbf16>
-    %cst_3 = stablehlo.constant dense<2.000000e+00> : tensor<1x256x1280xbf16>
-    %cst_4 = stablehlo.constant dense<5.000000e-01> : tensor<1x256x1280xbf16>
-    %cst_5 = stablehlo.constant dense<-4.000000e+00> : tensor<1x256x1280xf32>
-    %cst_6 = stablehlo.constant dense<4.000000e+00> : tensor<1x256x1280xf32>
-    %cst_7 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x256x1280xf32>
-    %cst_8 = stablehlo.constant dense<2.77068146E-8> : tensor<1x256x1280xf32>
-    %cst_9 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x256x1280xf32>
-    %cst_10 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x256x1280xf32>
-    %cst_11 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x256x1280xf32>
-    %cst_12 = stablehlo.constant dense<-2.954600e-03> : tensor<1x256x1280xf32>
-    %cst_13 = stablehlo.constant dense<-0.0160960332> : tensor<1x256x1280xf32>
-    %cst_14 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x256x1280xf32>
-    %cst_15 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x256x1280xf32>
-    %cst_16 = stablehlo.constant dense<-0.00168282702> : tensor<1x256x1280xf32>
-    %cst_17 = stablehlo.constant dense<-0.00737332925> : tensor<1x256x1280xf32>
-    %cst_18 = stablehlo.constant dense<-0.0142647391> : tensor<1x256x1280xf32>
-    %cst_19 = stablehlo.constant dense<-1.000000e+00> : tensor<1x256x1280xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x256x1280xf32>
-    %cst_21 = stablehlo.constant dense<1.000000e+00> : tensor<1x2048x768xbf16>
-    %cst_22 = stablehlo.constant dense<2.000000e+00> : tensor<1x2048x768xbf16>
-    %cst_23 = stablehlo.constant dense<5.000000e-01> : tensor<1x2048x768xbf16>
-    %cst_24 = stablehlo.constant dense<-4.000000e+00> : tensor<1x2048x768xf32>
-    %cst_25 = stablehlo.constant dense<4.000000e+00> : tensor<1x2048x768xf32>
-    %cst_26 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x2048x768xf32>
-    %cst_27 = stablehlo.constant dense<2.77068146E-8> : tensor<1x2048x768xf32>
-    %cst_28 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x2048x768xf32>
-    %cst_29 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x2048x768xf32>
-    %cst_30 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x2048x768xf32>
-    %cst_31 = stablehlo.constant dense<-2.954600e-03> : tensor<1x2048x768xf32>
-    %cst_32 = stablehlo.constant dense<-0.0160960332> : tensor<1x2048x768xf32>
-    %cst_33 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x2048x768xf32>
-    %cst_34 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x2048x768xf32>
-    %cst_35 = stablehlo.constant dense<-0.00168282702> : tensor<1x2048x768xf32>
-    %cst_36 = stablehlo.constant dense<-0.00737332925> : tensor<1x2048x768xf32>
-    %cst_37 = stablehlo.constant dense<-0.0142647391> : tensor<1x2048x768xf32>
-    %cst_38 = stablehlo.constant dense<-1.000000e+00> : tensor<1x2048x768xf32>
-    %cst_39 = stablehlo.constant dense<1.000000e+00> : tensor<1x2048x768xf32>
-    %cst_40 = arith.constant dense<1.000000e+00> : tensor<1xf64>
-    %cst_41 = arith.constant dense<-3.3895313892515355E+38> : tensor<1xf64>
-    %cst_42 = arith.constant dense<768> : tensor<1xi64>
-    %cst_43 = arith.constant dense<1.000000e-05> : tensor<1xf64>
-    %cst_44 = arith.constant dense<1> : tensor<1xi64>
-    %cst_45 = arith.constant dense<5.6568542494923806> : tensor<1xf64>
-    %cst_46 = arith.constant dense<1280> : tensor<1xi64>
-    %0 = "stablehlo.gather"(%arg2, %arg0) <{dimension_numbers = #stablehlo.gather<offset_dims = [2], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = array<i64: 1, 768>}> : (tensor<262x768xbf16>, tensor<1x2048xi64>) -> tensor<1x2048x768xbf16>
-    %1 = stablehlo.convert %0 : tensor<1x2048x768xbf16>
-    %2 = stablehlo.broadcast_in_dim %1, dims = [0, 1, 2] : (tensor<1x2048x768xbf16>) -> tensor<1x2048x768xbf16>
-    %3 = stablehlo.broadcast_in_dim %arg116, dims = [1, 2] : (tensor<2048x768xbf16>) -> tensor<1x2048x768xbf16>
-    %4 = stablehlo.add %2, %3 : tensor<1x2048x768xbf16>
-    %5 = stablehlo.reshape %arg1 : (tensor<1x2048xi64>) -> tensor<1x1x2048xi64>
-    %6 = stablehlo.reshape %5 : (tensor<1x1x2048xi64>) -> tensor<1x1x1x2048xi64>
-    %7 = stablehlo.convert %6 : (tensor<1x1x1x2048xi64>) -> tensor<1x1x1x2048xbf16>
-    %8 = stablehlo.convert %cst_40 : (tensor<1xf64>) -> tensor<1xbf16>
-    %9 = stablehlo.reshape %8 : (tensor<1xbf16>) -> tensor<bf16>
-    %10 = stablehlo.broadcast_in_dim %9, dims = [] : (tensor<bf16>) -> tensor<1x1x1x2048xbf16>
-    %11 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2, 3] : (tensor<1x1x1x2048xbf16>) -> tensor<1x1x1x2048xbf16>
-    %12 = stablehlo.subtract %10, %11 : tensor<1x1x1x2048xbf16>
-    %13 = stablehlo.convert %cst_41 : (tensor<1xf64>) -> tensor<1xbf16>
-    %14 = stablehlo.reshape %13 : (tensor<1xbf16>) -> tensor<bf16>
-    %15 = stablehlo.broadcast_in_dim %12, dims = [0, 1, 2, 3] : (tensor<1x1x1x2048xbf16>) -> tensor<1x1x1x2048xbf16>
-    %16 = stablehlo.broadcast_in_dim %14, dims = [] : (tensor<bf16>) -> tensor<1x1x1x2048xbf16>
-    %17 = stablehlo.multiply %15, %16 : tensor<1x1x1x2048xbf16>
-    %18 = stablehlo.convert %4 : (tensor<1x2048x768xbf16>) -> tensor<1x2048x768xf32>
-    %19 = stablehlo.convert %18 : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf64>
-    %20 = stablehlo.reduce(%19 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x2048x768xf64>, tensor<f64>) -> tensor<1x2048xf64>
-    %21 = stablehlo.reshape %20 : (tensor<1x2048xf64>) -> tensor<1x2048x1xf64>
-    %22 = stablehlo.convert %cst_42 : (tensor<1xi64>) -> tensor<1xf64>
-    %23 = stablehlo.reshape %22 : (tensor<1xf64>) -> tensor<f64>
-    %24 = stablehlo.broadcast_in_dim %21, dims = [0, 1, 2] : (tensor<1x2048x1xf64>) -> tensor<1x2048x1xf64>
-    %25 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<f64>) -> tensor<1x2048x1xf64>
-    %26 = stablehlo.divide %24, %25 : tensor<1x2048x1xf64>
-    %27 = stablehlo.broadcast_in_dim %19, dims = [0, 1, 2] : (tensor<1x2048x768xf64>) -> tensor<1x2048x768xf64>
-    %28 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2] : (tensor<1x2048x1xf64>) -> tensor<1x2048x768xf64>
-    %29 = stablehlo.subtract %27, %28 : tensor<1x2048x768xf64>
-    %30 = stablehlo.multiply %29, %29 : tensor<1x2048x768xf64>
-    %31 = stablehlo.reduce(%30 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x2048x768xf64>, tensor<f64>) -> tensor<1x2048xf64>
-    %32 = stablehlo.reshape %31 : (tensor<1x2048xf64>) -> tensor<1x2048x1xf64>
-    %33 = stablehlo.broadcast_in_dim %32, dims = [0, 1, 2] : (tensor<1x2048x1xf64>) -> tensor<1x2048x1xf64>
-    %34 = stablehlo.divide %33, %25 : tensor<1x2048x1xf64>
-    %35 = stablehlo.convert %34 : (tensor<1x2048x1xf64>) -> tensor<1x2048x1xf32>
-    %36 = stablehlo.reduce(%18 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x2048x768xf32>, tensor<f32>) -> tensor<1x2048xf32>
-    %37 = stablehlo.reshape %36 : (tensor<1x2048xf32>) -> tensor<1x2048x1xf32>
-    %38 = stablehlo.convert %cst_42 : (tensor<1xi64>) -> tensor<1xf32>
-    %39 = stablehlo.reshape %38 : (tensor<1xf32>) -> tensor<f32>
-    %40 = stablehlo.broadcast_in_dim %37, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x1xf32>
-    %41 = stablehlo.broadcast_in_dim %39, dims = [] : (tensor<f32>) -> tensor<1x2048x1xf32>
-    %42 = stablehlo.divide %40, %41 : tensor<1x2048x1xf32>
-    %43 = stablehlo.convert %cst_43 : (tensor<1xf64>) -> tensor<1xf32>
-    %44 = stablehlo.reshape %43 : (tensor<1xf32>) -> tensor<f32>
-    %45 = stablehlo.broadcast_in_dim %35, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x1xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [] : (tensor<f32>) -> tensor<1x2048x1xf32>
-    %47 = stablehlo.add %45, %46 : tensor<1x2048x1xf32>
-    %48 = stablehlo.rsqrt %47 : tensor<1x2048x1xf32>
-    %49 = stablehlo.broadcast_in_dim %18, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %50 = stablehlo.broadcast_in_dim %42, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x768xf32>
-    %51 = stablehlo.subtract %49, %50 : tensor<1x2048x768xf32>
-    %52 = stablehlo.broadcast_in_dim %51, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %53 = stablehlo.broadcast_in_dim %48, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x768xf32>
-    %54 = stablehlo.multiply %52, %53 : tensor<1x2048x768xf32>
-    %55 = stablehlo.convert %arg3 : (tensor<768xbf16>) -> tensor<768xf32>
-    %56 = stablehlo.broadcast_in_dim %54, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %57 = stablehlo.broadcast_in_dim %55, dims = [2] : (tensor<768xf32>) -> tensor<1x2048x768xf32>
-    %58 = stablehlo.multiply %56, %57 : tensor<1x2048x768xf32>
-    %59 = stablehlo.convert %arg4 : (tensor<768xbf16>) -> tensor<768xf32>
-    %60 = stablehlo.broadcast_in_dim %58, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %61 = stablehlo.broadcast_in_dim %59, dims = [2] : (tensor<768xf32>) -> tensor<1x2048x768xf32>
-    %62 = stablehlo.add %60, %61 : tensor<1x2048x768xf32>
-    %63 = stablehlo.convert %62 : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xbf16>
-    %64 = stablehlo.reshape %63 : (tensor<1x2048x768xbf16>) -> tensor<2048x768xbf16>
-    %65 = stablehlo.convert %64 : (tensor<2048x768xbf16>) -> tensor<2048x768xf32>
-    %66 = stablehlo.dot_general %65, %arg117, contracting_dims = [1] x [0] : (tensor<2048x768xf32>, tensor<768x256xf32>) -> tensor<2048x256xf32>
-    %67 = stablehlo.convert %cst_44 : (tensor<1xi64>) -> tensor<1xf32>
-    %68 = stablehlo.reshape %67 : (tensor<1xf32>) -> tensor<f32>
-    %69 = stablehlo.broadcast_in_dim %66, dims = [0, 1] : (tensor<2048x256xf32>) -> tensor<2048x256xf32>
-    %70 = stablehlo.broadcast_in_dim %68, dims = [] : (tensor<f32>) -> tensor<2048x256xf32>
-    %71 = stablehlo.multiply %69, %70 : tensor<2048x256xf32>
-    %72 = stablehlo.broadcast_in_dim %71, dims = [0, 1] : (tensor<2048x256xf32>) -> tensor<2048x256xf32>
-    %73 = stablehlo.broadcast_in_dim %arg118, dims = [1] : (tensor<256xf32>) -> tensor<2048x256xf32>
-    %74 = stablehlo.add %72, %73 : tensor<2048x256xf32>
-    %75 = stablehlo.convert %74 : (tensor<2048x256xf32>) -> tensor<2048x256xbf16>
-    %76 = stablehlo.reshape %75 : (tensor<2048x256xbf16>) -> tensor<1x2048x256xbf16>
-    %77 = stablehlo.dot_general %65, %arg119, contracting_dims = [1] x [0] : (tensor<2048x768xf32>, tensor<768x1280xf32>) -> tensor<2048x1280xf32>
-    %78 = stablehlo.broadcast_in_dim %77, dims = [0, 1] : (tensor<2048x1280xf32>) -> tensor<2048x1280xf32>
-    %79 = stablehlo.broadcast_in_dim %68, dims = [] : (tensor<f32>) -> tensor<2048x1280xf32>
-    %80 = stablehlo.multiply %78, %79 : tensor<2048x1280xf32>
-    %81 = stablehlo.broadcast_in_dim %80, dims = [0, 1] : (tensor<2048x1280xf32>) -> tensor<2048x1280xf32>
-    %82 = stablehlo.broadcast_in_dim %arg120, dims = [1] : (tensor<1280xf32>) -> tensor<2048x1280xf32>
-    %83 = stablehlo.add %81, %82 : tensor<2048x1280xf32>
-    %84 = stablehlo.convert %83 : (tensor<2048x1280xf32>) -> tensor<2048x1280xbf16>
-    %85 = stablehlo.reshape %84 : (tensor<2048x1280xbf16>) -> tensor<1x2048x1280xbf16>
-    %86 = stablehlo.reshape %76 : (tensor<1x2048x256xbf16>) -> tensor<1x2048x8x32xbf16>
-    %87 = stablehlo.transpose %86, dims = [0, 2, 1, 3] : (tensor<1x2048x8x32xbf16>) -> tensor<1x8x2048x32xbf16>
-    %88 = stablehlo.reshape %85 : (tensor<1x2048x1280xbf16>) -> tensor<1x2048x8x160xbf16>
-    %89 = stablehlo.transpose %88, dims = [0, 2, 1, 3] : (tensor<1x2048x8x160xbf16>) -> tensor<1x8x2048x160xbf16>
-    %90 = stablehlo.transpose %87, dims = [0, 1, 3, 2] : (tensor<1x8x2048x32xbf16>) -> tensor<1x8x32x2048xbf16>
-    %91 = stablehlo.reshape %90 : (tensor<1x8x32x2048xbf16>) -> tensor<8x32x2048xbf16>
-    %92 = stablehlo.broadcast_in_dim %91, dims = [0, 1, 2] : (tensor<8x32x2048xbf16>) -> tensor<8x32x2048xbf16>
-    %93 = stablehlo.dot_general %arg121, %92, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x2048xbf16>) -> tensor<8x256x2048xbf16>
-    %94 = stablehlo.reshape %93 : (tensor<8x256x2048xbf16>) -> tensor<1x8x256x2048xbf16>
-    %95 = stablehlo.convert %cst_45 : (tensor<1xf64>) -> tensor<1xbf16>
-    %96 = stablehlo.reshape %95 : (tensor<1xbf16>) -> tensor<bf16>
-    %97 = stablehlo.broadcast_in_dim %94, dims = [0, 1, 2, 3] : (tensor<1x8x256x2048xbf16>) -> tensor<1x8x256x2048xbf16>
-    %98 = stablehlo.broadcast_in_dim %96, dims = [] : (tensor<bf16>) -> tensor<1x8x256x2048xbf16>
-    %99 = stablehlo.divide %97, %98 : tensor<1x8x256x2048xbf16>
-    %100 = stablehlo.broadcast_in_dim %99, dims = [0, 1, 2, 3] : (tensor<1x8x256x2048xbf16>) -> tensor<1x8x256x2048xbf16>
-    %101 = stablehlo.broadcast_in_dim %17, dims = [0, 1, 2, 3] : (tensor<1x1x1x2048xbf16>) -> tensor<1x8x256x2048xbf16>
-    %102 = stablehlo.add %100, %101 : tensor<1x8x256x2048xbf16>
-    %103 = stablehlo.convert %102 : (tensor<1x8x256x2048xbf16>) -> tensor<1x8x256x2048xf32>
-    %104 = stablehlo.reduce(%103 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x2048xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %105 = stablehlo.reshape %104 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %106 = stablehlo.broadcast_in_dim %103, dims = [0, 1, 2, 3] : (tensor<1x8x256x2048xf32>) -> tensor<1x8x256x2048xf32>
-    %107 = stablehlo.broadcast_in_dim %105, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x2048xf32>
-    %108 = stablehlo.subtract %106, %107 : tensor<1x8x256x2048xf32>
-    %109 = stablehlo.exponential %108 : tensor<1x8x256x2048xf32>
-    %110 = stablehlo.reduce(%109 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x2048xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %111 = stablehlo.reshape %110 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %112 = stablehlo.broadcast_in_dim %109, dims = [0, 1, 2, 3] : (tensor<1x8x256x2048xf32>) -> tensor<1x8x256x2048xf32>
-    %113 = stablehlo.broadcast_in_dim %111, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x2048xf32>
-    %114 = stablehlo.divide %112, %113 : tensor<1x8x256x2048xf32>
-    %115 = stablehlo.convert %114 : (tensor<1x8x256x2048xf32>) -> tensor<1x8x256x2048xbf16>
-    %116 = stablehlo.reshape %115 : (tensor<1x8x256x2048xbf16>) -> tensor<8x256x2048xbf16>
-    %117 = stablehlo.reshape %89 : (tensor<1x8x2048x160xbf16>) -> tensor<8x2048x160xbf16>
-    %118 = stablehlo.broadcast_in_dim %117, dims = [0, 1, 2] : (tensor<8x2048x160xbf16>) -> tensor<8x2048x160xbf16>
-    %119 = stablehlo.dot_general %116, %118, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x2048xbf16>, tensor<8x2048x160xbf16>) -> tensor<8x256x160xbf16>
-    %120 = stablehlo.reshape %119 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %121 = stablehlo.transpose %120, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %122 = stablehlo.reshape %121 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %123 = stablehlo.reshape %122 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %124 = stablehlo.convert %123 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %125 = stablehlo.dot_general %124, %arg122, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %126 = stablehlo.broadcast_in_dim %125, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %127 = stablehlo.broadcast_in_dim %68, dims = [] : (tensor<f32>) -> tensor<256x1280xf32>
-    %128 = stablehlo.multiply %126, %127 : tensor<256x1280xf32>
-    %129 = stablehlo.broadcast_in_dim %128, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %130 = stablehlo.broadcast_in_dim %arg123, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %131 = stablehlo.add %129, %130 : tensor<256x1280xf32>
-    %132 = stablehlo.convert %131 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %133 = stablehlo.reshape %132 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %134 = stablehlo.add %133, %arg124 : tensor<1x256x1280xbf16>
-    %135 = stablehlo.convert %134 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %136 = stablehlo.convert %135 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %137 = stablehlo.reduce(%136 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %138 = stablehlo.reshape %137 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %139 = stablehlo.convert %cst_46 : (tensor<1xi64>) -> tensor<1xf64>
-    %140 = stablehlo.reshape %139 : (tensor<1xf64>) -> tensor<f64>
-    %141 = stablehlo.broadcast_in_dim %138, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %142 = stablehlo.broadcast_in_dim %140, dims = [] : (tensor<f64>) -> tensor<1x256x1xf64>
-    %143 = stablehlo.divide %141, %142 : tensor<1x256x1xf64>
-    %144 = stablehlo.broadcast_in_dim %136, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %145 = stablehlo.broadcast_in_dim %143, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %146 = stablehlo.subtract %144, %145 : tensor<1x256x1280xf64>
-    %147 = stablehlo.multiply %146, %146 : tensor<1x256x1280xf64>
-    %148 = stablehlo.reduce(%147 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %149 = stablehlo.reshape %148 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %150 = stablehlo.broadcast_in_dim %149, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %151 = stablehlo.divide %150, %142 : tensor<1x256x1xf64>
-    %152 = stablehlo.convert %151 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %153 = stablehlo.reduce(%135 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %154 = stablehlo.reshape %153 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %155 = stablehlo.convert %cst_46 : (tensor<1xi64>) -> tensor<1xf32>
-    %156 = stablehlo.reshape %155 : (tensor<1xf32>) -> tensor<f32>
-    %157 = stablehlo.broadcast_in_dim %154, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %158 = stablehlo.broadcast_in_dim %156, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %159 = stablehlo.divide %157, %158 : tensor<1x256x1xf32>
-    %160 = stablehlo.broadcast_in_dim %152, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %161 = stablehlo.broadcast_in_dim %44, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %162 = stablehlo.add %160, %161 : tensor<1x256x1xf32>
-    %163 = stablehlo.rsqrt %162 : tensor<1x256x1xf32>
-    %164 = stablehlo.broadcast_in_dim %135, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %165 = stablehlo.broadcast_in_dim %159, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %166 = stablehlo.subtract %164, %165 : tensor<1x256x1280xf32>
-    %167 = stablehlo.broadcast_in_dim %166, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %168 = stablehlo.broadcast_in_dim %163, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %169 = stablehlo.multiply %167, %168 : tensor<1x256x1280xf32>
-    %170 = stablehlo.convert %arg5 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %171 = stablehlo.broadcast_in_dim %169, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %172 = stablehlo.broadcast_in_dim %170, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %173 = stablehlo.multiply %171, %172 : tensor<1x256x1280xf32>
-    %174 = stablehlo.convert %arg6 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %175 = stablehlo.broadcast_in_dim %173, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %176 = stablehlo.broadcast_in_dim %174, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %177 = stablehlo.add %175, %176 : tensor<1x256x1280xf32>
-    %178 = stablehlo.convert %177 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %179 = stablehlo.reshape %178 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %180 = stablehlo.convert %179 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %181 = stablehlo.dot_general %180, %arg125, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %182 = stablehlo.broadcast_in_dim %181, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %183 = stablehlo.multiply %182, %127 : tensor<256x1280xf32>
-    %184 = stablehlo.broadcast_in_dim %183, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %185 = stablehlo.broadcast_in_dim %arg126, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %186 = stablehlo.add %184, %185 : tensor<256x1280xf32>
-    %187 = stablehlo.convert %186 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %188 = stablehlo.reshape %187 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %189 = stablehlo.multiply %188, %cst_4 : tensor<1x256x1280xbf16>
-    %190 = stablehlo.rsqrt %cst_3 : tensor<1x256x1280xbf16>
-    %191 = stablehlo.multiply %188, %190 : tensor<1x256x1280xbf16>
-    %192 = stablehlo.convert %191 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %193 = stablehlo.clamp %cst_5, %192, %cst_6 : tensor<1x256x1280xf32>
-    %194 = stablehlo.multiply %193, %193 : tensor<1x256x1280xf32>
-    %195 = stablehlo.multiply %cst_7, %194 : tensor<1x256x1280xf32>
-    %196 = stablehlo.add %195, %cst_8 : tensor<1x256x1280xf32>
-    %197 = stablehlo.multiply %196, %194 : tensor<1x256x1280xf32>
-    %198 = stablehlo.add %197, %cst_9 : tensor<1x256x1280xf32>
-    %199 = stablehlo.multiply %198, %194 : tensor<1x256x1280xf32>
-    %200 = stablehlo.add %199, %cst_10 : tensor<1x256x1280xf32>
-    %201 = stablehlo.multiply %200, %194 : tensor<1x256x1280xf32>
-    %202 = stablehlo.add %201, %cst_11 : tensor<1x256x1280xf32>
-    %203 = stablehlo.multiply %202, %194 : tensor<1x256x1280xf32>
-    %204 = stablehlo.add %203, %cst_12 : tensor<1x256x1280xf32>
-    %205 = stablehlo.multiply %204, %194 : tensor<1x256x1280xf32>
-    %206 = stablehlo.add %205, %cst_13 : tensor<1x256x1280xf32>
-    %207 = stablehlo.multiply %cst_14, %194 : tensor<1x256x1280xf32>
-    %208 = stablehlo.add %207, %cst_15 : tensor<1x256x1280xf32>
-    %209 = stablehlo.multiply %208, %194 : tensor<1x256x1280xf32>
-    %210 = stablehlo.add %209, %cst_16 : tensor<1x256x1280xf32>
-    %211 = stablehlo.multiply %210, %194 : tensor<1x256x1280xf32>
-    %212 = stablehlo.add %211, %cst_17 : tensor<1x256x1280xf32>
-    %213 = stablehlo.multiply %212, %194 : tensor<1x256x1280xf32>
-    %214 = stablehlo.add %213, %cst_18 : tensor<1x256x1280xf32>
-    %215 = stablehlo.multiply %193, %206 : tensor<1x256x1280xf32>
-    %216 = stablehlo.divide %215, %214 : tensor<1x256x1280xf32>
-    %217 = stablehlo.clamp %cst_19, %216, %cst_20 : tensor<1x256x1280xf32>
-    %218 = stablehlo.convert %217 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %219 = stablehlo.add %218, %cst_2 : tensor<1x256x1280xbf16>
-    %220 = stablehlo.multiply %219, %189 : tensor<1x256x1280xbf16>
-    %221 = stablehlo.reshape %220 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %222 = stablehlo.convert %221 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %223 = stablehlo.dot_general %222, %arg127, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %224 = stablehlo.broadcast_in_dim %223, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %225 = stablehlo.multiply %224, %127 : tensor<256x1280xf32>
-    %226 = stablehlo.broadcast_in_dim %225, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %227 = stablehlo.broadcast_in_dim %arg128, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %228 = stablehlo.add %226, %227 : tensor<256x1280xf32>
-    %229 = stablehlo.convert %228 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %230 = stablehlo.reshape %229 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %231 = stablehlo.add %230, %134 : tensor<1x256x1280xbf16>
-    %232 = stablehlo.convert %231 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %233 = stablehlo.convert %232 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %234 = stablehlo.reduce(%233 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %235 = stablehlo.reshape %234 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %236 = stablehlo.broadcast_in_dim %235, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %237 = stablehlo.divide %236, %142 : tensor<1x256x1xf64>
-    %238 = stablehlo.broadcast_in_dim %233, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %239 = stablehlo.broadcast_in_dim %237, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %240 = stablehlo.subtract %238, %239 : tensor<1x256x1280xf64>
-    %241 = stablehlo.multiply %240, %240 : tensor<1x256x1280xf64>
-    %242 = stablehlo.reduce(%241 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %243 = stablehlo.reshape %242 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %244 = stablehlo.broadcast_in_dim %243, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %245 = stablehlo.divide %244, %142 : tensor<1x256x1xf64>
-    %246 = stablehlo.convert %245 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %247 = stablehlo.reduce(%232 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %248 = stablehlo.reshape %247 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %249 = stablehlo.broadcast_in_dim %248, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %250 = stablehlo.divide %249, %158 : tensor<1x256x1xf32>
-    %251 = stablehlo.broadcast_in_dim %246, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %252 = stablehlo.add %251, %161 : tensor<1x256x1xf32>
-    %253 = stablehlo.rsqrt %252 : tensor<1x256x1xf32>
-    %254 = stablehlo.broadcast_in_dim %232, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %255 = stablehlo.broadcast_in_dim %250, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %256 = stablehlo.subtract %254, %255 : tensor<1x256x1280xf32>
-    %257 = stablehlo.broadcast_in_dim %256, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %258 = stablehlo.broadcast_in_dim %253, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %259 = stablehlo.multiply %257, %258 : tensor<1x256x1280xf32>
-    %260 = stablehlo.convert %arg7 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %261 = stablehlo.broadcast_in_dim %259, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %262 = stablehlo.broadcast_in_dim %260, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %263 = stablehlo.multiply %261, %262 : tensor<1x256x1280xf32>
-    %264 = stablehlo.convert %arg8 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %265 = stablehlo.broadcast_in_dim %263, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %266 = stablehlo.broadcast_in_dim %264, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %267 = stablehlo.add %265, %266 : tensor<1x256x1280xf32>
-    %268 = stablehlo.convert %267 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %269 = stablehlo.reshape %268 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %270 = stablehlo.convert %269 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %271 = stablehlo.dot_general %270, %arg129, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %272 = stablehlo.broadcast_in_dim %271, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %273 = stablehlo.broadcast_in_dim %68, dims = [] : (tensor<f32>) -> tensor<256x256xf32>
-    %274 = stablehlo.multiply %272, %273 : tensor<256x256xf32>
-    %275 = stablehlo.broadcast_in_dim %274, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %276 = stablehlo.broadcast_in_dim %arg130, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %277 = stablehlo.add %275, %276 : tensor<256x256xf32>
-    %278 = stablehlo.convert %277 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %279 = stablehlo.reshape %278 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %280 = stablehlo.dot_general %270, %arg131, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %281 = stablehlo.broadcast_in_dim %280, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %282 = stablehlo.multiply %281, %273 : tensor<256x256xf32>
-    %283 = stablehlo.broadcast_in_dim %282, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %284 = stablehlo.broadcast_in_dim %arg132, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %285 = stablehlo.add %283, %284 : tensor<256x256xf32>
-    %286 = stablehlo.convert %285 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %287 = stablehlo.reshape %286 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %288 = stablehlo.dot_general %270, %arg133, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %289 = stablehlo.broadcast_in_dim %288, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %290 = stablehlo.multiply %289, %127 : tensor<256x1280xf32>
-    %291 = stablehlo.broadcast_in_dim %290, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %292 = stablehlo.broadcast_in_dim %arg134, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %293 = stablehlo.add %291, %292 : tensor<256x1280xf32>
-    %294 = stablehlo.convert %293 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %295 = stablehlo.reshape %294 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %296 = stablehlo.reshape %279 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %297 = stablehlo.transpose %296, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %298 = stablehlo.reshape %287 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %299 = stablehlo.transpose %298, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %300 = stablehlo.reshape %295 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %301 = stablehlo.transpose %300, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %302 = stablehlo.transpose %299, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %303 = stablehlo.reshape %297 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %304 = stablehlo.reshape %302 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %305 = stablehlo.broadcast_in_dim %304, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %306 = stablehlo.dot_general %303, %305, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %307 = stablehlo.reshape %306 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %308 = stablehlo.broadcast_in_dim %307, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %309 = stablehlo.broadcast_in_dim %96, dims = [] : (tensor<bf16>) -> tensor<1x8x256x256xbf16>
-    %310 = stablehlo.divide %308, %309 : tensor<1x8x256x256xbf16>
-    %311 = stablehlo.convert %310 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %312 = stablehlo.reduce(%311 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %313 = stablehlo.reshape %312 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %314 = stablehlo.broadcast_in_dim %311, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %315 = stablehlo.broadcast_in_dim %313, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %316 = stablehlo.subtract %314, %315 : tensor<1x8x256x256xf32>
-    %317 = stablehlo.exponential %316 : tensor<1x8x256x256xf32>
-    %318 = stablehlo.reduce(%317 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %319 = stablehlo.reshape %318 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %320 = stablehlo.broadcast_in_dim %317, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %321 = stablehlo.broadcast_in_dim %319, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %322 = stablehlo.divide %320, %321 : tensor<1x8x256x256xf32>
-    %323 = stablehlo.convert %322 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %324 = stablehlo.reshape %323 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %325 = stablehlo.reshape %301 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %326 = stablehlo.broadcast_in_dim %325, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %327 = stablehlo.dot_general %324, %326, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %328 = stablehlo.reshape %327 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %329 = stablehlo.transpose %328, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %330 = stablehlo.reshape %329 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %331 = stablehlo.reshape %330 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %332 = stablehlo.convert %331 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %333 = stablehlo.dot_general %332, %arg135, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %334 = stablehlo.broadcast_in_dim %333, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %335 = stablehlo.multiply %334, %127 : tensor<256x1280xf32>
-    %336 = stablehlo.broadcast_in_dim %335, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %337 = stablehlo.broadcast_in_dim %arg136, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %338 = stablehlo.add %336, %337 : tensor<256x1280xf32>
-    %339 = stablehlo.convert %338 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %340 = stablehlo.reshape %339 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %341 = stablehlo.add %340, %231 : tensor<1x256x1280xbf16>
-    %342 = stablehlo.convert %341 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %343 = stablehlo.convert %342 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %344 = stablehlo.reduce(%343 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %345 = stablehlo.reshape %344 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %346 = stablehlo.broadcast_in_dim %345, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %347 = stablehlo.divide %346, %142 : tensor<1x256x1xf64>
-    %348 = stablehlo.broadcast_in_dim %343, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %349 = stablehlo.broadcast_in_dim %347, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %350 = stablehlo.subtract %348, %349 : tensor<1x256x1280xf64>
-    %351 = stablehlo.multiply %350, %350 : tensor<1x256x1280xf64>
-    %352 = stablehlo.reduce(%351 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %353 = stablehlo.reshape %352 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %354 = stablehlo.broadcast_in_dim %353, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %355 = stablehlo.divide %354, %142 : tensor<1x256x1xf64>
-    %356 = stablehlo.convert %355 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %357 = stablehlo.reduce(%342 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %358 = stablehlo.reshape %357 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %359 = stablehlo.broadcast_in_dim %358, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %360 = stablehlo.divide %359, %158 : tensor<1x256x1xf32>
-    %361 = stablehlo.broadcast_in_dim %356, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %362 = stablehlo.add %361, %161 : tensor<1x256x1xf32>
-    %363 = stablehlo.rsqrt %362 : tensor<1x256x1xf32>
-    %364 = stablehlo.broadcast_in_dim %342, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %365 = stablehlo.broadcast_in_dim %360, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %366 = stablehlo.subtract %364, %365 : tensor<1x256x1280xf32>
-    %367 = stablehlo.broadcast_in_dim %366, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %368 = stablehlo.broadcast_in_dim %363, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %369 = stablehlo.multiply %367, %368 : tensor<1x256x1280xf32>
-    %370 = stablehlo.convert %arg9 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %371 = stablehlo.broadcast_in_dim %369, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %372 = stablehlo.broadcast_in_dim %370, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %373 = stablehlo.multiply %371, %372 : tensor<1x256x1280xf32>
-    %374 = stablehlo.convert %arg10 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %375 = stablehlo.broadcast_in_dim %373, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %376 = stablehlo.broadcast_in_dim %374, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %377 = stablehlo.add %375, %376 : tensor<1x256x1280xf32>
-    %378 = stablehlo.convert %377 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %379 = stablehlo.reshape %378 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %380 = stablehlo.convert %379 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %381 = stablehlo.dot_general %380, %arg137, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %382 = stablehlo.broadcast_in_dim %381, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %383 = stablehlo.multiply %382, %127 : tensor<256x1280xf32>
-    %384 = stablehlo.broadcast_in_dim %383, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %385 = stablehlo.broadcast_in_dim %arg138, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %386 = stablehlo.add %384, %385 : tensor<256x1280xf32>
-    %387 = stablehlo.convert %386 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %388 = stablehlo.reshape %387 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %389 = stablehlo.multiply %388, %cst_4 : tensor<1x256x1280xbf16>
-    %390 = stablehlo.multiply %388, %190 : tensor<1x256x1280xbf16>
-    %391 = stablehlo.convert %390 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %392 = stablehlo.clamp %cst_5, %391, %cst_6 : tensor<1x256x1280xf32>
-    %393 = stablehlo.multiply %392, %392 : tensor<1x256x1280xf32>
-    %394 = stablehlo.multiply %cst_7, %393 : tensor<1x256x1280xf32>
-    %395 = stablehlo.add %394, %cst_8 : tensor<1x256x1280xf32>
-    %396 = stablehlo.multiply %395, %393 : tensor<1x256x1280xf32>
-    %397 = stablehlo.add %396, %cst_9 : tensor<1x256x1280xf32>
-    %398 = stablehlo.multiply %397, %393 : tensor<1x256x1280xf32>
-    %399 = stablehlo.add %398, %cst_10 : tensor<1x256x1280xf32>
-    %400 = stablehlo.multiply %399, %393 : tensor<1x256x1280xf32>
-    %401 = stablehlo.add %400, %cst_11 : tensor<1x256x1280xf32>
-    %402 = stablehlo.multiply %401, %393 : tensor<1x256x1280xf32>
-    %403 = stablehlo.add %402, %cst_12 : tensor<1x256x1280xf32>
-    %404 = stablehlo.multiply %403, %393 : tensor<1x256x1280xf32>
-    %405 = stablehlo.add %404, %cst_13 : tensor<1x256x1280xf32>
-    %406 = stablehlo.multiply %cst_14, %393 : tensor<1x256x1280xf32>
-    %407 = stablehlo.add %406, %cst_15 : tensor<1x256x1280xf32>
-    %408 = stablehlo.multiply %407, %393 : tensor<1x256x1280xf32>
-    %409 = stablehlo.add %408, %cst_16 : tensor<1x256x1280xf32>
-    %410 = stablehlo.multiply %409, %393 : tensor<1x256x1280xf32>
-    %411 = stablehlo.add %410, %cst_17 : tensor<1x256x1280xf32>
-    %412 = stablehlo.multiply %411, %393 : tensor<1x256x1280xf32>
-    %413 = stablehlo.add %412, %cst_18 : tensor<1x256x1280xf32>
-    %414 = stablehlo.multiply %392, %405 : tensor<1x256x1280xf32>
-    %415 = stablehlo.divide %414, %413 : tensor<1x256x1280xf32>
-    %416 = stablehlo.clamp %cst_19, %415, %cst_20 : tensor<1x256x1280xf32>
-    %417 = stablehlo.convert %416 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %418 = stablehlo.add %417, %cst_2 : tensor<1x256x1280xbf16>
-    %419 = stablehlo.multiply %418, %389 : tensor<1x256x1280xbf16>
-    %420 = stablehlo.reshape %419 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %421 = stablehlo.convert %420 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %422 = stablehlo.dot_general %421, %arg139, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %423 = stablehlo.broadcast_in_dim %422, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %424 = stablehlo.multiply %423, %127 : tensor<256x1280xf32>
-    %425 = stablehlo.broadcast_in_dim %424, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %426 = stablehlo.broadcast_in_dim %arg140, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %427 = stablehlo.add %425, %426 : tensor<256x1280xf32>
-    %428 = stablehlo.convert %427 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %429 = stablehlo.reshape %428 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %430 = stablehlo.add %429, %341 : tensor<1x256x1280xbf16>
-    %431 = stablehlo.convert %430 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %432 = stablehlo.convert %431 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %433 = stablehlo.reduce(%432 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %434 = stablehlo.reshape %433 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %435 = stablehlo.broadcast_in_dim %434, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %436 = stablehlo.divide %435, %142 : tensor<1x256x1xf64>
-    %437 = stablehlo.broadcast_in_dim %432, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %438 = stablehlo.broadcast_in_dim %436, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %439 = stablehlo.subtract %437, %438 : tensor<1x256x1280xf64>
-    %440 = stablehlo.multiply %439, %439 : tensor<1x256x1280xf64>
-    %441 = stablehlo.reduce(%440 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %442 = stablehlo.reshape %441 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %443 = stablehlo.broadcast_in_dim %442, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %444 = stablehlo.divide %443, %142 : tensor<1x256x1xf64>
-    %445 = stablehlo.convert %444 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %446 = stablehlo.reduce(%431 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %447 = stablehlo.reshape %446 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %448 = stablehlo.broadcast_in_dim %447, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %449 = stablehlo.divide %448, %158 : tensor<1x256x1xf32>
-    %450 = stablehlo.broadcast_in_dim %445, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %451 = stablehlo.add %450, %161 : tensor<1x256x1xf32>
-    %452 = stablehlo.rsqrt %451 : tensor<1x256x1xf32>
-    %453 = stablehlo.broadcast_in_dim %431, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %454 = stablehlo.broadcast_in_dim %449, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %455 = stablehlo.subtract %453, %454 : tensor<1x256x1280xf32>
-    %456 = stablehlo.broadcast_in_dim %455, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %457 = stablehlo.broadcast_in_dim %452, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %458 = stablehlo.multiply %456, %457 : tensor<1x256x1280xf32>
-    %459 = stablehlo.convert %arg11 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %460 = stablehlo.broadcast_in_dim %458, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %461 = stablehlo.broadcast_in_dim %459, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %462 = stablehlo.multiply %460, %461 : tensor<1x256x1280xf32>
-    %463 = stablehlo.convert %arg12 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %464 = stablehlo.broadcast_in_dim %462, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %465 = stablehlo.broadcast_in_dim %463, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %466 = stablehlo.add %464, %465 : tensor<1x256x1280xf32>
-    %467 = stablehlo.convert %466 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %468 = stablehlo.reshape %467 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %469 = stablehlo.convert %468 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %470 = stablehlo.dot_general %469, %arg141, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %471 = stablehlo.broadcast_in_dim %470, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %472 = stablehlo.multiply %471, %273 : tensor<256x256xf32>
-    %473 = stablehlo.broadcast_in_dim %472, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %474 = stablehlo.broadcast_in_dim %arg142, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %475 = stablehlo.add %473, %474 : tensor<256x256xf32>
-    %476 = stablehlo.convert %475 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %477 = stablehlo.reshape %476 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %478 = stablehlo.dot_general %469, %arg143, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %479 = stablehlo.broadcast_in_dim %478, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %480 = stablehlo.multiply %479, %273 : tensor<256x256xf32>
-    %481 = stablehlo.broadcast_in_dim %480, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %482 = stablehlo.broadcast_in_dim %arg144, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %483 = stablehlo.add %481, %482 : tensor<256x256xf32>
-    %484 = stablehlo.convert %483 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %485 = stablehlo.reshape %484 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %486 = stablehlo.dot_general %469, %arg145, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %487 = stablehlo.broadcast_in_dim %486, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %488 = stablehlo.multiply %487, %127 : tensor<256x1280xf32>
-    %489 = stablehlo.broadcast_in_dim %488, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %490 = stablehlo.broadcast_in_dim %arg146, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %491 = stablehlo.add %489, %490 : tensor<256x1280xf32>
-    %492 = stablehlo.convert %491 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %493 = stablehlo.reshape %492 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %494 = stablehlo.reshape %477 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %495 = stablehlo.transpose %494, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %496 = stablehlo.reshape %485 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %497 = stablehlo.transpose %496, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %498 = stablehlo.reshape %493 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %499 = stablehlo.transpose %498, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %500 = stablehlo.transpose %497, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %501 = stablehlo.reshape %495 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %502 = stablehlo.reshape %500 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %503 = stablehlo.broadcast_in_dim %502, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %504 = stablehlo.dot_general %501, %503, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %505 = stablehlo.reshape %504 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %506 = stablehlo.broadcast_in_dim %505, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %507 = stablehlo.divide %506, %309 : tensor<1x8x256x256xbf16>
-    %508 = stablehlo.convert %507 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %509 = stablehlo.reduce(%508 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %510 = stablehlo.reshape %509 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %511 = stablehlo.broadcast_in_dim %508, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %512 = stablehlo.broadcast_in_dim %510, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %513 = stablehlo.subtract %511, %512 : tensor<1x8x256x256xf32>
-    %514 = stablehlo.exponential %513 : tensor<1x8x256x256xf32>
-    %515 = stablehlo.reduce(%514 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %516 = stablehlo.reshape %515 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %517 = stablehlo.broadcast_in_dim %514, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %518 = stablehlo.broadcast_in_dim %516, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %519 = stablehlo.divide %517, %518 : tensor<1x8x256x256xf32>
-    %520 = stablehlo.convert %519 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %521 = stablehlo.reshape %520 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %522 = stablehlo.reshape %499 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %523 = stablehlo.broadcast_in_dim %522, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %524 = stablehlo.dot_general %521, %523, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %525 = stablehlo.reshape %524 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %526 = stablehlo.transpose %525, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %527 = stablehlo.reshape %526 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %528 = stablehlo.reshape %527 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %529 = stablehlo.convert %528 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %530 = stablehlo.dot_general %529, %arg147, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %531 = stablehlo.broadcast_in_dim %530, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %532 = stablehlo.multiply %531, %127 : tensor<256x1280xf32>
-    %533 = stablehlo.broadcast_in_dim %532, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %534 = stablehlo.broadcast_in_dim %arg148, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %535 = stablehlo.add %533, %534 : tensor<256x1280xf32>
-    %536 = stablehlo.convert %535 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %537 = stablehlo.reshape %536 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %538 = stablehlo.add %537, %430 : tensor<1x256x1280xbf16>
-    %539 = stablehlo.convert %538 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %540 = stablehlo.convert %539 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %541 = stablehlo.reduce(%540 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %542 = stablehlo.reshape %541 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %543 = stablehlo.broadcast_in_dim %542, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %544 = stablehlo.divide %543, %142 : tensor<1x256x1xf64>
-    %545 = stablehlo.broadcast_in_dim %540, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %546 = stablehlo.broadcast_in_dim %544, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %547 = stablehlo.subtract %545, %546 : tensor<1x256x1280xf64>
-    %548 = stablehlo.multiply %547, %547 : tensor<1x256x1280xf64>
-    %549 = stablehlo.reduce(%548 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %550 = stablehlo.reshape %549 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %551 = stablehlo.broadcast_in_dim %550, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %552 = stablehlo.divide %551, %142 : tensor<1x256x1xf64>
-    %553 = stablehlo.convert %552 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %554 = stablehlo.reduce(%539 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %555 = stablehlo.reshape %554 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %556 = stablehlo.broadcast_in_dim %555, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %557 = stablehlo.divide %556, %158 : tensor<1x256x1xf32>
-    %558 = stablehlo.broadcast_in_dim %553, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %559 = stablehlo.add %558, %161 : tensor<1x256x1xf32>
-    %560 = stablehlo.rsqrt %559 : tensor<1x256x1xf32>
-    %561 = stablehlo.broadcast_in_dim %539, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %562 = stablehlo.broadcast_in_dim %557, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %563 = stablehlo.subtract %561, %562 : tensor<1x256x1280xf32>
-    %564 = stablehlo.broadcast_in_dim %563, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %565 = stablehlo.broadcast_in_dim %560, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %566 = stablehlo.multiply %564, %565 : tensor<1x256x1280xf32>
-    %567 = stablehlo.convert %arg13 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %568 = stablehlo.broadcast_in_dim %566, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %569 = stablehlo.broadcast_in_dim %567, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %570 = stablehlo.multiply %568, %569 : tensor<1x256x1280xf32>
-    %571 = stablehlo.convert %arg14 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %572 = stablehlo.broadcast_in_dim %570, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %573 = stablehlo.broadcast_in_dim %571, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %574 = stablehlo.add %572, %573 : tensor<1x256x1280xf32>
-    %575 = stablehlo.convert %574 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %576 = stablehlo.reshape %575 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %577 = stablehlo.convert %576 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %578 = stablehlo.dot_general %577, %arg149, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %579 = stablehlo.broadcast_in_dim %578, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %580 = stablehlo.multiply %579, %127 : tensor<256x1280xf32>
-    %581 = stablehlo.broadcast_in_dim %580, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %582 = stablehlo.broadcast_in_dim %arg150, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %583 = stablehlo.add %581, %582 : tensor<256x1280xf32>
-    %584 = stablehlo.convert %583 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %585 = stablehlo.reshape %584 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %586 = stablehlo.multiply %585, %cst_4 : tensor<1x256x1280xbf16>
-    %587 = stablehlo.multiply %585, %190 : tensor<1x256x1280xbf16>
-    %588 = stablehlo.convert %587 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %589 = stablehlo.clamp %cst_5, %588, %cst_6 : tensor<1x256x1280xf32>
-    %590 = stablehlo.multiply %589, %589 : tensor<1x256x1280xf32>
-    %591 = stablehlo.multiply %cst_7, %590 : tensor<1x256x1280xf32>
-    %592 = stablehlo.add %591, %cst_8 : tensor<1x256x1280xf32>
-    %593 = stablehlo.multiply %592, %590 : tensor<1x256x1280xf32>
-    %594 = stablehlo.add %593, %cst_9 : tensor<1x256x1280xf32>
-    %595 = stablehlo.multiply %594, %590 : tensor<1x256x1280xf32>
-    %596 = stablehlo.add %595, %cst_10 : tensor<1x256x1280xf32>
-    %597 = stablehlo.multiply %596, %590 : tensor<1x256x1280xf32>
-    %598 = stablehlo.add %597, %cst_11 : tensor<1x256x1280xf32>
-    %599 = stablehlo.multiply %598, %590 : tensor<1x256x1280xf32>
-    %600 = stablehlo.add %599, %cst_12 : tensor<1x256x1280xf32>
-    %601 = stablehlo.multiply %600, %590 : tensor<1x256x1280xf32>
-    %602 = stablehlo.add %601, %cst_13 : tensor<1x256x1280xf32>
-    %603 = stablehlo.multiply %cst_14, %590 : tensor<1x256x1280xf32>
-    %604 = stablehlo.add %603, %cst_15 : tensor<1x256x1280xf32>
-    %605 = stablehlo.multiply %604, %590 : tensor<1x256x1280xf32>
-    %606 = stablehlo.add %605, %cst_16 : tensor<1x256x1280xf32>
-    %607 = stablehlo.multiply %606, %590 : tensor<1x256x1280xf32>
-    %608 = stablehlo.add %607, %cst_17 : tensor<1x256x1280xf32>
-    %609 = stablehlo.multiply %608, %590 : tensor<1x256x1280xf32>
-    %610 = stablehlo.add %609, %cst_18 : tensor<1x256x1280xf32>
-    %611 = stablehlo.multiply %589, %602 : tensor<1x256x1280xf32>
-    %612 = stablehlo.divide %611, %610 : tensor<1x256x1280xf32>
-    %613 = stablehlo.clamp %cst_19, %612, %cst_20 : tensor<1x256x1280xf32>
-    %614 = stablehlo.convert %613 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %615 = stablehlo.add %614, %cst_2 : tensor<1x256x1280xbf16>
-    %616 = stablehlo.multiply %615, %586 : tensor<1x256x1280xbf16>
-    %617 = stablehlo.reshape %616 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %618 = stablehlo.convert %617 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %619 = stablehlo.dot_general %618, %arg151, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %620 = stablehlo.broadcast_in_dim %619, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %621 = stablehlo.multiply %620, %127 : tensor<256x1280xf32>
-    %622 = stablehlo.broadcast_in_dim %621, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %623 = stablehlo.broadcast_in_dim %arg152, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %624 = stablehlo.add %622, %623 : tensor<256x1280xf32>
-    %625 = stablehlo.convert %624 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %626 = stablehlo.reshape %625 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %627 = stablehlo.add %626, %538 : tensor<1x256x1280xbf16>
-    %628 = stablehlo.convert %627 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %629 = stablehlo.convert %628 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %630 = stablehlo.reduce(%629 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %631 = stablehlo.reshape %630 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %632 = stablehlo.broadcast_in_dim %631, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %633 = stablehlo.divide %632, %142 : tensor<1x256x1xf64>
-    %634 = stablehlo.broadcast_in_dim %629, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %635 = stablehlo.broadcast_in_dim %633, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %636 = stablehlo.subtract %634, %635 : tensor<1x256x1280xf64>
-    %637 = stablehlo.multiply %636, %636 : tensor<1x256x1280xf64>
-    %638 = stablehlo.reduce(%637 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %639 = stablehlo.reshape %638 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %640 = stablehlo.broadcast_in_dim %639, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %641 = stablehlo.divide %640, %142 : tensor<1x256x1xf64>
-    %642 = stablehlo.convert %641 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %643 = stablehlo.reduce(%628 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %644 = stablehlo.reshape %643 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %645 = stablehlo.broadcast_in_dim %644, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %646 = stablehlo.divide %645, %158 : tensor<1x256x1xf32>
-    %647 = stablehlo.broadcast_in_dim %642, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %648 = stablehlo.add %647, %161 : tensor<1x256x1xf32>
-    %649 = stablehlo.rsqrt %648 : tensor<1x256x1xf32>
-    %650 = stablehlo.broadcast_in_dim %628, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %651 = stablehlo.broadcast_in_dim %646, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %652 = stablehlo.subtract %650, %651 : tensor<1x256x1280xf32>
-    %653 = stablehlo.broadcast_in_dim %652, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %654 = stablehlo.broadcast_in_dim %649, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %655 = stablehlo.multiply %653, %654 : tensor<1x256x1280xf32>
-    %656 = stablehlo.convert %arg15 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %657 = stablehlo.broadcast_in_dim %655, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %658 = stablehlo.broadcast_in_dim %656, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %659 = stablehlo.multiply %657, %658 : tensor<1x256x1280xf32>
-    %660 = stablehlo.convert %arg16 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %661 = stablehlo.broadcast_in_dim %659, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %662 = stablehlo.broadcast_in_dim %660, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %663 = stablehlo.add %661, %662 : tensor<1x256x1280xf32>
-    %664 = stablehlo.convert %663 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %665 = stablehlo.reshape %664 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %666 = stablehlo.convert %665 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %667 = stablehlo.dot_general %666, %arg153, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %668 = stablehlo.broadcast_in_dim %667, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %669 = stablehlo.multiply %668, %273 : tensor<256x256xf32>
-    %670 = stablehlo.broadcast_in_dim %669, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %671 = stablehlo.broadcast_in_dim %arg154, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %672 = stablehlo.add %670, %671 : tensor<256x256xf32>
-    %673 = stablehlo.convert %672 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %674 = stablehlo.reshape %673 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %675 = stablehlo.dot_general %666, %arg155, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %676 = stablehlo.broadcast_in_dim %675, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %677 = stablehlo.multiply %676, %273 : tensor<256x256xf32>
-    %678 = stablehlo.broadcast_in_dim %677, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %679 = stablehlo.broadcast_in_dim %arg156, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %680 = stablehlo.add %678, %679 : tensor<256x256xf32>
-    %681 = stablehlo.convert %680 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %682 = stablehlo.reshape %681 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %683 = stablehlo.dot_general %666, %arg157, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %684 = stablehlo.broadcast_in_dim %683, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %685 = stablehlo.multiply %684, %127 : tensor<256x1280xf32>
-    %686 = stablehlo.broadcast_in_dim %685, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %687 = stablehlo.broadcast_in_dim %arg158, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %688 = stablehlo.add %686, %687 : tensor<256x1280xf32>
-    %689 = stablehlo.convert %688 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %690 = stablehlo.reshape %689 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %691 = stablehlo.reshape %674 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %692 = stablehlo.transpose %691, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %693 = stablehlo.reshape %682 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %694 = stablehlo.transpose %693, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %695 = stablehlo.reshape %690 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %696 = stablehlo.transpose %695, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %697 = stablehlo.transpose %694, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %698 = stablehlo.reshape %692 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %699 = stablehlo.reshape %697 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %700 = stablehlo.broadcast_in_dim %699, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %701 = stablehlo.dot_general %698, %700, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %702 = stablehlo.reshape %701 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %703 = stablehlo.broadcast_in_dim %702, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %704 = stablehlo.divide %703, %309 : tensor<1x8x256x256xbf16>
-    %705 = stablehlo.convert %704 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %706 = stablehlo.reduce(%705 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %707 = stablehlo.reshape %706 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %708 = stablehlo.broadcast_in_dim %705, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %709 = stablehlo.broadcast_in_dim %707, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %710 = stablehlo.subtract %708, %709 : tensor<1x8x256x256xf32>
-    %711 = stablehlo.exponential %710 : tensor<1x8x256x256xf32>
-    %712 = stablehlo.reduce(%711 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %713 = stablehlo.reshape %712 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %714 = stablehlo.broadcast_in_dim %711, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %715 = stablehlo.broadcast_in_dim %713, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %716 = stablehlo.divide %714, %715 : tensor<1x8x256x256xf32>
-    %717 = stablehlo.convert %716 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %718 = stablehlo.reshape %717 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %719 = stablehlo.reshape %696 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %720 = stablehlo.broadcast_in_dim %719, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %721 = stablehlo.dot_general %718, %720, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %722 = stablehlo.reshape %721 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %723 = stablehlo.transpose %722, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %724 = stablehlo.reshape %723 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %725 = stablehlo.reshape %724 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %726 = stablehlo.convert %725 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %727 = stablehlo.dot_general %726, %arg159, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %728 = stablehlo.broadcast_in_dim %727, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %729 = stablehlo.multiply %728, %127 : tensor<256x1280xf32>
-    %730 = stablehlo.broadcast_in_dim %729, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %731 = stablehlo.broadcast_in_dim %arg160, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %732 = stablehlo.add %730, %731 : tensor<256x1280xf32>
-    %733 = stablehlo.convert %732 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %734 = stablehlo.reshape %733 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %735 = stablehlo.add %734, %627 : tensor<1x256x1280xbf16>
-    %736 = stablehlo.convert %735 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %737 = stablehlo.convert %736 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %738 = stablehlo.reduce(%737 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %739 = stablehlo.reshape %738 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %740 = stablehlo.broadcast_in_dim %739, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %741 = stablehlo.divide %740, %142 : tensor<1x256x1xf64>
-    %742 = stablehlo.broadcast_in_dim %737, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %743 = stablehlo.broadcast_in_dim %741, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %744 = stablehlo.subtract %742, %743 : tensor<1x256x1280xf64>
-    %745 = stablehlo.multiply %744, %744 : tensor<1x256x1280xf64>
-    %746 = stablehlo.reduce(%745 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %747 = stablehlo.reshape %746 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %748 = stablehlo.broadcast_in_dim %747, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %749 = stablehlo.divide %748, %142 : tensor<1x256x1xf64>
-    %750 = stablehlo.convert %749 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %751 = stablehlo.reduce(%736 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %752 = stablehlo.reshape %751 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %753 = stablehlo.broadcast_in_dim %752, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %754 = stablehlo.divide %753, %158 : tensor<1x256x1xf32>
-    %755 = stablehlo.broadcast_in_dim %750, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %756 = stablehlo.add %755, %161 : tensor<1x256x1xf32>
-    %757 = stablehlo.rsqrt %756 : tensor<1x256x1xf32>
-    %758 = stablehlo.broadcast_in_dim %736, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %759 = stablehlo.broadcast_in_dim %754, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %760 = stablehlo.subtract %758, %759 : tensor<1x256x1280xf32>
-    %761 = stablehlo.broadcast_in_dim %760, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %762 = stablehlo.broadcast_in_dim %757, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %763 = stablehlo.multiply %761, %762 : tensor<1x256x1280xf32>
-    %764 = stablehlo.convert %arg17 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %765 = stablehlo.broadcast_in_dim %763, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %766 = stablehlo.broadcast_in_dim %764, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %767 = stablehlo.multiply %765, %766 : tensor<1x256x1280xf32>
-    %768 = stablehlo.convert %arg18 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %769 = stablehlo.broadcast_in_dim %767, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %770 = stablehlo.broadcast_in_dim %768, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %771 = stablehlo.add %769, %770 : tensor<1x256x1280xf32>
-    %772 = stablehlo.convert %771 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %773 = stablehlo.reshape %772 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %774 = stablehlo.convert %773 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %775 = stablehlo.dot_general %774, %arg161, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %776 = stablehlo.broadcast_in_dim %775, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %777 = stablehlo.multiply %776, %127 : tensor<256x1280xf32>
-    %778 = stablehlo.broadcast_in_dim %777, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %779 = stablehlo.broadcast_in_dim %arg162, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %780 = stablehlo.add %778, %779 : tensor<256x1280xf32>
-    %781 = stablehlo.convert %780 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %782 = stablehlo.reshape %781 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %783 = stablehlo.multiply %782, %cst_4 : tensor<1x256x1280xbf16>
-    %784 = stablehlo.multiply %782, %190 : tensor<1x256x1280xbf16>
-    %785 = stablehlo.convert %784 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %786 = stablehlo.clamp %cst_5, %785, %cst_6 : tensor<1x256x1280xf32>
-    %787 = stablehlo.multiply %786, %786 : tensor<1x256x1280xf32>
-    %788 = stablehlo.multiply %cst_7, %787 : tensor<1x256x1280xf32>
-    %789 = stablehlo.add %788, %cst_8 : tensor<1x256x1280xf32>
-    %790 = stablehlo.multiply %789, %787 : tensor<1x256x1280xf32>
-    %791 = stablehlo.add %790, %cst_9 : tensor<1x256x1280xf32>
-    %792 = stablehlo.multiply %791, %787 : tensor<1x256x1280xf32>
-    %793 = stablehlo.add %792, %cst_10 : tensor<1x256x1280xf32>
-    %794 = stablehlo.multiply %793, %787 : tensor<1x256x1280xf32>
-    %795 = stablehlo.add %794, %cst_11 : tensor<1x256x1280xf32>
-    %796 = stablehlo.multiply %795, %787 : tensor<1x256x1280xf32>
-    %797 = stablehlo.add %796, %cst_12 : tensor<1x256x1280xf32>
-    %798 = stablehlo.multiply %797, %787 : tensor<1x256x1280xf32>
-    %799 = stablehlo.add %798, %cst_13 : tensor<1x256x1280xf32>
-    %800 = stablehlo.multiply %cst_14, %787 : tensor<1x256x1280xf32>
-    %801 = stablehlo.add %800, %cst_15 : tensor<1x256x1280xf32>
-    %802 = stablehlo.multiply %801, %787 : tensor<1x256x1280xf32>
-    %803 = stablehlo.add %802, %cst_16 : tensor<1x256x1280xf32>
-    %804 = stablehlo.multiply %803, %787 : tensor<1x256x1280xf32>
-    %805 = stablehlo.add %804, %cst_17 : tensor<1x256x1280xf32>
-    %806 = stablehlo.multiply %805, %787 : tensor<1x256x1280xf32>
-    %807 = stablehlo.add %806, %cst_18 : tensor<1x256x1280xf32>
-    %808 = stablehlo.multiply %786, %799 : tensor<1x256x1280xf32>
-    %809 = stablehlo.divide %808, %807 : tensor<1x256x1280xf32>
-    %810 = stablehlo.clamp %cst_19, %809, %cst_20 : tensor<1x256x1280xf32>
-    %811 = stablehlo.convert %810 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %812 = stablehlo.add %811, %cst_2 : tensor<1x256x1280xbf16>
-    %813 = stablehlo.multiply %812, %783 : tensor<1x256x1280xbf16>
-    %814 = stablehlo.reshape %813 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %815 = stablehlo.convert %814 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %816 = stablehlo.dot_general %815, %arg163, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %817 = stablehlo.broadcast_in_dim %816, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %818 = stablehlo.multiply %817, %127 : tensor<256x1280xf32>
-    %819 = stablehlo.broadcast_in_dim %818, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %820 = stablehlo.broadcast_in_dim %arg164, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %821 = stablehlo.add %819, %820 : tensor<256x1280xf32>
-    %822 = stablehlo.convert %821 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %823 = stablehlo.reshape %822 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %824 = stablehlo.add %823, %735 : tensor<1x256x1280xbf16>
-    %825 = stablehlo.convert %824 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %826 = stablehlo.convert %825 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %827 = stablehlo.reduce(%826 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %828 = stablehlo.reshape %827 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %829 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %830 = stablehlo.divide %829, %142 : tensor<1x256x1xf64>
-    %831 = stablehlo.broadcast_in_dim %826, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %832 = stablehlo.broadcast_in_dim %830, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %833 = stablehlo.subtract %831, %832 : tensor<1x256x1280xf64>
-    %834 = stablehlo.multiply %833, %833 : tensor<1x256x1280xf64>
-    %835 = stablehlo.reduce(%834 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %836 = stablehlo.reshape %835 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %837 = stablehlo.broadcast_in_dim %836, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %838 = stablehlo.divide %837, %142 : tensor<1x256x1xf64>
-    %839 = stablehlo.convert %838 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %840 = stablehlo.reduce(%825 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %841 = stablehlo.reshape %840 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %842 = stablehlo.broadcast_in_dim %841, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %843 = stablehlo.divide %842, %158 : tensor<1x256x1xf32>
-    %844 = stablehlo.broadcast_in_dim %839, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %845 = stablehlo.add %844, %161 : tensor<1x256x1xf32>
-    %846 = stablehlo.rsqrt %845 : tensor<1x256x1xf32>
-    %847 = stablehlo.broadcast_in_dim %825, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %848 = stablehlo.broadcast_in_dim %843, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %849 = stablehlo.subtract %847, %848 : tensor<1x256x1280xf32>
-    %850 = stablehlo.broadcast_in_dim %849, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %851 = stablehlo.broadcast_in_dim %846, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %852 = stablehlo.multiply %850, %851 : tensor<1x256x1280xf32>
-    %853 = stablehlo.convert %arg19 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %854 = stablehlo.broadcast_in_dim %852, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %855 = stablehlo.broadcast_in_dim %853, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %856 = stablehlo.multiply %854, %855 : tensor<1x256x1280xf32>
-    %857 = stablehlo.convert %arg20 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %858 = stablehlo.broadcast_in_dim %856, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %859 = stablehlo.broadcast_in_dim %857, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %860 = stablehlo.add %858, %859 : tensor<1x256x1280xf32>
-    %861 = stablehlo.convert %860 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %862 = stablehlo.reshape %861 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %863 = stablehlo.convert %862 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %864 = stablehlo.dot_general %863, %arg165, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %865 = stablehlo.broadcast_in_dim %864, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %866 = stablehlo.multiply %865, %273 : tensor<256x256xf32>
-    %867 = stablehlo.broadcast_in_dim %866, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %868 = stablehlo.broadcast_in_dim %arg166, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %869 = stablehlo.add %867, %868 : tensor<256x256xf32>
-    %870 = stablehlo.convert %869 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %871 = stablehlo.reshape %870 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %872 = stablehlo.dot_general %863, %arg167, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %873 = stablehlo.broadcast_in_dim %872, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %874 = stablehlo.multiply %873, %273 : tensor<256x256xf32>
-    %875 = stablehlo.broadcast_in_dim %874, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %876 = stablehlo.broadcast_in_dim %arg168, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %877 = stablehlo.add %875, %876 : tensor<256x256xf32>
-    %878 = stablehlo.convert %877 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %879 = stablehlo.reshape %878 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %880 = stablehlo.dot_general %863, %arg169, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %881 = stablehlo.broadcast_in_dim %880, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %882 = stablehlo.multiply %881, %127 : tensor<256x1280xf32>
-    %883 = stablehlo.broadcast_in_dim %882, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %884 = stablehlo.broadcast_in_dim %arg170, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %885 = stablehlo.add %883, %884 : tensor<256x1280xf32>
-    %886 = stablehlo.convert %885 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %887 = stablehlo.reshape %886 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %888 = stablehlo.reshape %871 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %889 = stablehlo.transpose %888, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %890 = stablehlo.reshape %879 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %891 = stablehlo.transpose %890, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %892 = stablehlo.reshape %887 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %893 = stablehlo.transpose %892, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %894 = stablehlo.transpose %891, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %895 = stablehlo.reshape %889 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %896 = stablehlo.reshape %894 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %897 = stablehlo.broadcast_in_dim %896, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %898 = stablehlo.dot_general %895, %897, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %899 = stablehlo.reshape %898 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %900 = stablehlo.broadcast_in_dim %899, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %901 = stablehlo.divide %900, %309 : tensor<1x8x256x256xbf16>
-    %902 = stablehlo.convert %901 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %903 = stablehlo.reduce(%902 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %904 = stablehlo.reshape %903 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %905 = stablehlo.broadcast_in_dim %902, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %906 = stablehlo.broadcast_in_dim %904, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %907 = stablehlo.subtract %905, %906 : tensor<1x8x256x256xf32>
-    %908 = stablehlo.exponential %907 : tensor<1x8x256x256xf32>
-    %909 = stablehlo.reduce(%908 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %910 = stablehlo.reshape %909 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %911 = stablehlo.broadcast_in_dim %908, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %912 = stablehlo.broadcast_in_dim %910, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %913 = stablehlo.divide %911, %912 : tensor<1x8x256x256xf32>
-    %914 = stablehlo.convert %913 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %915 = stablehlo.reshape %914 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %916 = stablehlo.reshape %893 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %917 = stablehlo.broadcast_in_dim %916, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %918 = stablehlo.dot_general %915, %917, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %919 = stablehlo.reshape %918 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %920 = stablehlo.transpose %919, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %921 = stablehlo.reshape %920 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %922 = stablehlo.reshape %921 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %923 = stablehlo.convert %922 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %924 = stablehlo.dot_general %923, %arg171, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %925 = stablehlo.broadcast_in_dim %924, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %926 = stablehlo.multiply %925, %127 : tensor<256x1280xf32>
-    %927 = stablehlo.broadcast_in_dim %926, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %928 = stablehlo.broadcast_in_dim %arg172, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %929 = stablehlo.add %927, %928 : tensor<256x1280xf32>
-    %930 = stablehlo.convert %929 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %931 = stablehlo.reshape %930 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %932 = stablehlo.add %931, %824 : tensor<1x256x1280xbf16>
-    %933 = stablehlo.convert %932 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %934 = stablehlo.convert %933 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %935 = stablehlo.reduce(%934 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %936 = stablehlo.reshape %935 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %937 = stablehlo.broadcast_in_dim %936, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %938 = stablehlo.divide %937, %142 : tensor<1x256x1xf64>
-    %939 = stablehlo.broadcast_in_dim %934, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %940 = stablehlo.broadcast_in_dim %938, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %941 = stablehlo.subtract %939, %940 : tensor<1x256x1280xf64>
-    %942 = stablehlo.multiply %941, %941 : tensor<1x256x1280xf64>
-    %943 = stablehlo.reduce(%942 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %944 = stablehlo.reshape %943 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %945 = stablehlo.broadcast_in_dim %944, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %946 = stablehlo.divide %945, %142 : tensor<1x256x1xf64>
-    %947 = stablehlo.convert %946 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %948 = stablehlo.reduce(%933 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %949 = stablehlo.reshape %948 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %950 = stablehlo.broadcast_in_dim %949, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %951 = stablehlo.divide %950, %158 : tensor<1x256x1xf32>
-    %952 = stablehlo.broadcast_in_dim %947, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %953 = stablehlo.add %952, %161 : tensor<1x256x1xf32>
-    %954 = stablehlo.rsqrt %953 : tensor<1x256x1xf32>
-    %955 = stablehlo.broadcast_in_dim %933, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %956 = stablehlo.broadcast_in_dim %951, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %957 = stablehlo.subtract %955, %956 : tensor<1x256x1280xf32>
-    %958 = stablehlo.broadcast_in_dim %957, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %959 = stablehlo.broadcast_in_dim %954, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %960 = stablehlo.multiply %958, %959 : tensor<1x256x1280xf32>
-    %961 = stablehlo.convert %arg21 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %962 = stablehlo.broadcast_in_dim %960, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %963 = stablehlo.broadcast_in_dim %961, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %964 = stablehlo.multiply %962, %963 : tensor<1x256x1280xf32>
-    %965 = stablehlo.convert %arg22 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %966 = stablehlo.broadcast_in_dim %964, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %967 = stablehlo.broadcast_in_dim %965, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %968 = stablehlo.add %966, %967 : tensor<1x256x1280xf32>
-    %969 = stablehlo.convert %968 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %970 = stablehlo.reshape %969 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %971 = stablehlo.convert %970 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %972 = stablehlo.dot_general %971, %arg173, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %973 = stablehlo.broadcast_in_dim %972, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %974 = stablehlo.multiply %973, %127 : tensor<256x1280xf32>
-    %975 = stablehlo.broadcast_in_dim %974, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %976 = stablehlo.broadcast_in_dim %arg174, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %977 = stablehlo.add %975, %976 : tensor<256x1280xf32>
-    %978 = stablehlo.convert %977 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %979 = stablehlo.reshape %978 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %980 = stablehlo.multiply %979, %cst_4 : tensor<1x256x1280xbf16>
-    %981 = stablehlo.multiply %979, %190 : tensor<1x256x1280xbf16>
-    %982 = stablehlo.convert %981 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %983 = stablehlo.clamp %cst_5, %982, %cst_6 : tensor<1x256x1280xf32>
-    %984 = stablehlo.multiply %983, %983 : tensor<1x256x1280xf32>
-    %985 = stablehlo.multiply %cst_7, %984 : tensor<1x256x1280xf32>
-    %986 = stablehlo.add %985, %cst_8 : tensor<1x256x1280xf32>
-    %987 = stablehlo.multiply %986, %984 : tensor<1x256x1280xf32>
-    %988 = stablehlo.add %987, %cst_9 : tensor<1x256x1280xf32>
-    %989 = stablehlo.multiply %988, %984 : tensor<1x256x1280xf32>
-    %990 = stablehlo.add %989, %cst_10 : tensor<1x256x1280xf32>
-    %991 = stablehlo.multiply %990, %984 : tensor<1x256x1280xf32>
-    %992 = stablehlo.add %991, %cst_11 : tensor<1x256x1280xf32>
-    %993 = stablehlo.multiply %992, %984 : tensor<1x256x1280xf32>
-    %994 = stablehlo.add %993, %cst_12 : tensor<1x256x1280xf32>
-    %995 = stablehlo.multiply %994, %984 : tensor<1x256x1280xf32>
-    %996 = stablehlo.add %995, %cst_13 : tensor<1x256x1280xf32>
-    %997 = stablehlo.multiply %cst_14, %984 : tensor<1x256x1280xf32>
-    %998 = stablehlo.add %997, %cst_15 : tensor<1x256x1280xf32>
-    %999 = stablehlo.multiply %998, %984 : tensor<1x256x1280xf32>
-    %1000 = stablehlo.add %999, %cst_16 : tensor<1x256x1280xf32>
-    %1001 = stablehlo.multiply %1000, %984 : tensor<1x256x1280xf32>
-    %1002 = stablehlo.add %1001, %cst_17 : tensor<1x256x1280xf32>
-    %1003 = stablehlo.multiply %1002, %984 : tensor<1x256x1280xf32>
-    %1004 = stablehlo.add %1003, %cst_18 : tensor<1x256x1280xf32>
-    %1005 = stablehlo.multiply %983, %996 : tensor<1x256x1280xf32>
-    %1006 = stablehlo.divide %1005, %1004 : tensor<1x256x1280xf32>
-    %1007 = stablehlo.clamp %cst_19, %1006, %cst_20 : tensor<1x256x1280xf32>
-    %1008 = stablehlo.convert %1007 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1009 = stablehlo.add %1008, %cst_2 : tensor<1x256x1280xbf16>
-    %1010 = stablehlo.multiply %1009, %980 : tensor<1x256x1280xbf16>
-    %1011 = stablehlo.reshape %1010 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1012 = stablehlo.convert %1011 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1013 = stablehlo.dot_general %1012, %arg175, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1014 = stablehlo.broadcast_in_dim %1013, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1015 = stablehlo.multiply %1014, %127 : tensor<256x1280xf32>
-    %1016 = stablehlo.broadcast_in_dim %1015, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1017 = stablehlo.broadcast_in_dim %arg176, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1018 = stablehlo.add %1016, %1017 : tensor<256x1280xf32>
-    %1019 = stablehlo.convert %1018 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1020 = stablehlo.reshape %1019 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1021 = stablehlo.add %1020, %932 : tensor<1x256x1280xbf16>
-    %1022 = stablehlo.convert %1021 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1023 = stablehlo.convert %1022 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1024 = stablehlo.reduce(%1023 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1025 = stablehlo.reshape %1024 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1026 = stablehlo.broadcast_in_dim %1025, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1027 = stablehlo.divide %1026, %142 : tensor<1x256x1xf64>
-    %1028 = stablehlo.broadcast_in_dim %1023, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1029 = stablehlo.broadcast_in_dim %1027, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1030 = stablehlo.subtract %1028, %1029 : tensor<1x256x1280xf64>
-    %1031 = stablehlo.multiply %1030, %1030 : tensor<1x256x1280xf64>
-    %1032 = stablehlo.reduce(%1031 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1033 = stablehlo.reshape %1032 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1034 = stablehlo.broadcast_in_dim %1033, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1035 = stablehlo.divide %1034, %142 : tensor<1x256x1xf64>
-    %1036 = stablehlo.convert %1035 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1037 = stablehlo.reduce(%1022 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1038 = stablehlo.reshape %1037 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1039 = stablehlo.broadcast_in_dim %1038, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1040 = stablehlo.divide %1039, %158 : tensor<1x256x1xf32>
-    %1041 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1042 = stablehlo.add %1041, %161 : tensor<1x256x1xf32>
-    %1043 = stablehlo.rsqrt %1042 : tensor<1x256x1xf32>
-    %1044 = stablehlo.broadcast_in_dim %1022, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1045 = stablehlo.broadcast_in_dim %1040, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1046 = stablehlo.subtract %1044, %1045 : tensor<1x256x1280xf32>
-    %1047 = stablehlo.broadcast_in_dim %1046, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1048 = stablehlo.broadcast_in_dim %1043, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1049 = stablehlo.multiply %1047, %1048 : tensor<1x256x1280xf32>
-    %1050 = stablehlo.convert %arg23 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1051 = stablehlo.broadcast_in_dim %1049, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1052 = stablehlo.broadcast_in_dim %1050, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1053 = stablehlo.multiply %1051, %1052 : tensor<1x256x1280xf32>
-    %1054 = stablehlo.convert %arg24 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1055 = stablehlo.broadcast_in_dim %1053, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1056 = stablehlo.broadcast_in_dim %1054, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1057 = stablehlo.add %1055, %1056 : tensor<1x256x1280xf32>
-    %1058 = stablehlo.convert %1057 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1059 = stablehlo.reshape %1058 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1060 = stablehlo.convert %1059 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1061 = stablehlo.dot_general %1060, %arg177, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1062 = stablehlo.broadcast_in_dim %1061, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1063 = stablehlo.multiply %1062, %273 : tensor<256x256xf32>
-    %1064 = stablehlo.broadcast_in_dim %1063, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1065 = stablehlo.broadcast_in_dim %arg178, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1066 = stablehlo.add %1064, %1065 : tensor<256x256xf32>
-    %1067 = stablehlo.convert %1066 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1068 = stablehlo.reshape %1067 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1069 = stablehlo.dot_general %1060, %arg179, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1070 = stablehlo.broadcast_in_dim %1069, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1071 = stablehlo.multiply %1070, %273 : tensor<256x256xf32>
-    %1072 = stablehlo.broadcast_in_dim %1071, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1073 = stablehlo.broadcast_in_dim %arg180, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1074 = stablehlo.add %1072, %1073 : tensor<256x256xf32>
-    %1075 = stablehlo.convert %1074 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1076 = stablehlo.reshape %1075 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1077 = stablehlo.dot_general %1060, %arg181, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1078 = stablehlo.broadcast_in_dim %1077, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1079 = stablehlo.multiply %1078, %127 : tensor<256x1280xf32>
-    %1080 = stablehlo.broadcast_in_dim %1079, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1081 = stablehlo.broadcast_in_dim %arg182, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1082 = stablehlo.add %1080, %1081 : tensor<256x1280xf32>
-    %1083 = stablehlo.convert %1082 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1084 = stablehlo.reshape %1083 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1085 = stablehlo.reshape %1068 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1086 = stablehlo.transpose %1085, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1087 = stablehlo.reshape %1076 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1088 = stablehlo.transpose %1087, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1089 = stablehlo.reshape %1084 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %1090 = stablehlo.transpose %1089, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1091 = stablehlo.transpose %1088, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %1092 = stablehlo.reshape %1086 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1093 = stablehlo.reshape %1091 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1094 = stablehlo.broadcast_in_dim %1093, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1095 = stablehlo.dot_general %1092, %1094, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %1096 = stablehlo.reshape %1095 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1097 = stablehlo.broadcast_in_dim %1096, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1098 = stablehlo.divide %1097, %309 : tensor<1x8x256x256xbf16>
-    %1099 = stablehlo.convert %1098 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %1100 = stablehlo.reduce(%1099 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1101 = stablehlo.reshape %1100 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1102 = stablehlo.broadcast_in_dim %1099, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1103 = stablehlo.broadcast_in_dim %1101, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1104 = stablehlo.subtract %1102, %1103 : tensor<1x8x256x256xf32>
-    %1105 = stablehlo.exponential %1104 : tensor<1x8x256x256xf32>
-    %1106 = stablehlo.reduce(%1105 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1107 = stablehlo.reshape %1106 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1108 = stablehlo.broadcast_in_dim %1105, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1109 = stablehlo.broadcast_in_dim %1107, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1110 = stablehlo.divide %1108, %1109 : tensor<1x8x256x256xf32>
-    %1111 = stablehlo.convert %1110 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %1112 = stablehlo.reshape %1111 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %1113 = stablehlo.reshape %1090 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1114 = stablehlo.broadcast_in_dim %1113, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1115 = stablehlo.dot_general %1112, %1114, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1116 = stablehlo.reshape %1115 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1117 = stablehlo.transpose %1116, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %1118 = stablehlo.reshape %1117 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %1119 = stablehlo.reshape %1118 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1120 = stablehlo.convert %1119 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1121 = stablehlo.dot_general %1120, %arg183, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1122 = stablehlo.broadcast_in_dim %1121, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1123 = stablehlo.multiply %1122, %127 : tensor<256x1280xf32>
-    %1124 = stablehlo.broadcast_in_dim %1123, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1125 = stablehlo.broadcast_in_dim %arg184, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1126 = stablehlo.add %1124, %1125 : tensor<256x1280xf32>
-    %1127 = stablehlo.convert %1126 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1128 = stablehlo.reshape %1127 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1129 = stablehlo.add %1128, %1021 : tensor<1x256x1280xbf16>
-    %1130 = stablehlo.convert %1129 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1131 = stablehlo.convert %1130 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1132 = stablehlo.reduce(%1131 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1133 = stablehlo.reshape %1132 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1134 = stablehlo.broadcast_in_dim %1133, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1135 = stablehlo.divide %1134, %142 : tensor<1x256x1xf64>
-    %1136 = stablehlo.broadcast_in_dim %1131, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1137 = stablehlo.broadcast_in_dim %1135, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1138 = stablehlo.subtract %1136, %1137 : tensor<1x256x1280xf64>
-    %1139 = stablehlo.multiply %1138, %1138 : tensor<1x256x1280xf64>
-    %1140 = stablehlo.reduce(%1139 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1141 = stablehlo.reshape %1140 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1142 = stablehlo.broadcast_in_dim %1141, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1143 = stablehlo.divide %1142, %142 : tensor<1x256x1xf64>
-    %1144 = stablehlo.convert %1143 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1145 = stablehlo.reduce(%1130 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1146 = stablehlo.reshape %1145 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1147 = stablehlo.broadcast_in_dim %1146, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1148 = stablehlo.divide %1147, %158 : tensor<1x256x1xf32>
-    %1149 = stablehlo.broadcast_in_dim %1144, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1150 = stablehlo.add %1149, %161 : tensor<1x256x1xf32>
-    %1151 = stablehlo.rsqrt %1150 : tensor<1x256x1xf32>
-    %1152 = stablehlo.broadcast_in_dim %1130, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1153 = stablehlo.broadcast_in_dim %1148, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1154 = stablehlo.subtract %1152, %1153 : tensor<1x256x1280xf32>
-    %1155 = stablehlo.broadcast_in_dim %1154, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1156 = stablehlo.broadcast_in_dim %1151, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1157 = stablehlo.multiply %1155, %1156 : tensor<1x256x1280xf32>
-    %1158 = stablehlo.convert %arg25 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1159 = stablehlo.broadcast_in_dim %1157, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1160 = stablehlo.broadcast_in_dim %1158, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1161 = stablehlo.multiply %1159, %1160 : tensor<1x256x1280xf32>
-    %1162 = stablehlo.convert %arg26 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1163 = stablehlo.broadcast_in_dim %1161, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1164 = stablehlo.broadcast_in_dim %1162, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1165 = stablehlo.add %1163, %1164 : tensor<1x256x1280xf32>
-    %1166 = stablehlo.convert %1165 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1167 = stablehlo.reshape %1166 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1168 = stablehlo.convert %1167 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1169 = stablehlo.dot_general %1168, %arg185, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1170 = stablehlo.broadcast_in_dim %1169, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1171 = stablehlo.multiply %1170, %127 : tensor<256x1280xf32>
-    %1172 = stablehlo.broadcast_in_dim %1171, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1173 = stablehlo.broadcast_in_dim %arg186, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1174 = stablehlo.add %1172, %1173 : tensor<256x1280xf32>
-    %1175 = stablehlo.convert %1174 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1176 = stablehlo.reshape %1175 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1177 = stablehlo.multiply %1176, %cst_4 : tensor<1x256x1280xbf16>
-    %1178 = stablehlo.multiply %1176, %190 : tensor<1x256x1280xbf16>
-    %1179 = stablehlo.convert %1178 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1180 = stablehlo.clamp %cst_5, %1179, %cst_6 : tensor<1x256x1280xf32>
-    %1181 = stablehlo.multiply %1180, %1180 : tensor<1x256x1280xf32>
-    %1182 = stablehlo.multiply %cst_7, %1181 : tensor<1x256x1280xf32>
-    %1183 = stablehlo.add %1182, %cst_8 : tensor<1x256x1280xf32>
-    %1184 = stablehlo.multiply %1183, %1181 : tensor<1x256x1280xf32>
-    %1185 = stablehlo.add %1184, %cst_9 : tensor<1x256x1280xf32>
-    %1186 = stablehlo.multiply %1185, %1181 : tensor<1x256x1280xf32>
-    %1187 = stablehlo.add %1186, %cst_10 : tensor<1x256x1280xf32>
-    %1188 = stablehlo.multiply %1187, %1181 : tensor<1x256x1280xf32>
-    %1189 = stablehlo.add %1188, %cst_11 : tensor<1x256x1280xf32>
-    %1190 = stablehlo.multiply %1189, %1181 : tensor<1x256x1280xf32>
-    %1191 = stablehlo.add %1190, %cst_12 : tensor<1x256x1280xf32>
-    %1192 = stablehlo.multiply %1191, %1181 : tensor<1x256x1280xf32>
-    %1193 = stablehlo.add %1192, %cst_13 : tensor<1x256x1280xf32>
-    %1194 = stablehlo.multiply %cst_14, %1181 : tensor<1x256x1280xf32>
-    %1195 = stablehlo.add %1194, %cst_15 : tensor<1x256x1280xf32>
-    %1196 = stablehlo.multiply %1195, %1181 : tensor<1x256x1280xf32>
-    %1197 = stablehlo.add %1196, %cst_16 : tensor<1x256x1280xf32>
-    %1198 = stablehlo.multiply %1197, %1181 : tensor<1x256x1280xf32>
-    %1199 = stablehlo.add %1198, %cst_17 : tensor<1x256x1280xf32>
-    %1200 = stablehlo.multiply %1199, %1181 : tensor<1x256x1280xf32>
-    %1201 = stablehlo.add %1200, %cst_18 : tensor<1x256x1280xf32>
-    %1202 = stablehlo.multiply %1180, %1193 : tensor<1x256x1280xf32>
-    %1203 = stablehlo.divide %1202, %1201 : tensor<1x256x1280xf32>
-    %1204 = stablehlo.clamp %cst_19, %1203, %cst_20 : tensor<1x256x1280xf32>
-    %1205 = stablehlo.convert %1204 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1206 = stablehlo.add %1205, %cst_2 : tensor<1x256x1280xbf16>
-    %1207 = stablehlo.multiply %1206, %1177 : tensor<1x256x1280xbf16>
-    %1208 = stablehlo.reshape %1207 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1209 = stablehlo.convert %1208 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1210 = stablehlo.dot_general %1209, %arg187, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1211 = stablehlo.broadcast_in_dim %1210, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1212 = stablehlo.multiply %1211, %127 : tensor<256x1280xf32>
-    %1213 = stablehlo.broadcast_in_dim %1212, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1214 = stablehlo.broadcast_in_dim %arg188, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1215 = stablehlo.add %1213, %1214 : tensor<256x1280xf32>
-    %1216 = stablehlo.convert %1215 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1217 = stablehlo.reshape %1216 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1218 = stablehlo.add %1217, %1129 : tensor<1x256x1280xbf16>
-    %1219 = stablehlo.convert %1218 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1220 = stablehlo.convert %1219 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1221 = stablehlo.reduce(%1220 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1222 = stablehlo.reshape %1221 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1223 = stablehlo.broadcast_in_dim %1222, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1224 = stablehlo.divide %1223, %142 : tensor<1x256x1xf64>
-    %1225 = stablehlo.broadcast_in_dim %1220, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1226 = stablehlo.broadcast_in_dim %1224, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1227 = stablehlo.subtract %1225, %1226 : tensor<1x256x1280xf64>
-    %1228 = stablehlo.multiply %1227, %1227 : tensor<1x256x1280xf64>
-    %1229 = stablehlo.reduce(%1228 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1230 = stablehlo.reshape %1229 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1231 = stablehlo.broadcast_in_dim %1230, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1232 = stablehlo.divide %1231, %142 : tensor<1x256x1xf64>
-    %1233 = stablehlo.convert %1232 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1234 = stablehlo.reduce(%1219 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1235 = stablehlo.reshape %1234 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1236 = stablehlo.broadcast_in_dim %1235, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1237 = stablehlo.divide %1236, %158 : tensor<1x256x1xf32>
-    %1238 = stablehlo.broadcast_in_dim %1233, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1239 = stablehlo.add %1238, %161 : tensor<1x256x1xf32>
-    %1240 = stablehlo.rsqrt %1239 : tensor<1x256x1xf32>
-    %1241 = stablehlo.broadcast_in_dim %1219, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1242 = stablehlo.broadcast_in_dim %1237, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1243 = stablehlo.subtract %1241, %1242 : tensor<1x256x1280xf32>
-    %1244 = stablehlo.broadcast_in_dim %1243, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1245 = stablehlo.broadcast_in_dim %1240, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1246 = stablehlo.multiply %1244, %1245 : tensor<1x256x1280xf32>
-    %1247 = stablehlo.convert %arg27 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1248 = stablehlo.broadcast_in_dim %1246, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1249 = stablehlo.broadcast_in_dim %1247, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1250 = stablehlo.multiply %1248, %1249 : tensor<1x256x1280xf32>
-    %1251 = stablehlo.convert %arg28 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1252 = stablehlo.broadcast_in_dim %1250, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1253 = stablehlo.broadcast_in_dim %1251, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1254 = stablehlo.add %1252, %1253 : tensor<1x256x1280xf32>
-    %1255 = stablehlo.convert %1254 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1256 = stablehlo.reshape %1255 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1257 = stablehlo.convert %1256 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1258 = stablehlo.dot_general %1257, %arg189, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1259 = stablehlo.broadcast_in_dim %1258, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1260 = stablehlo.multiply %1259, %273 : tensor<256x256xf32>
-    %1261 = stablehlo.broadcast_in_dim %1260, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1262 = stablehlo.broadcast_in_dim %arg190, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1263 = stablehlo.add %1261, %1262 : tensor<256x256xf32>
-    %1264 = stablehlo.convert %1263 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1265 = stablehlo.reshape %1264 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1266 = stablehlo.dot_general %1257, %arg191, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1267 = stablehlo.broadcast_in_dim %1266, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1268 = stablehlo.multiply %1267, %273 : tensor<256x256xf32>
-    %1269 = stablehlo.broadcast_in_dim %1268, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1270 = stablehlo.broadcast_in_dim %arg192, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1271 = stablehlo.add %1269, %1270 : tensor<256x256xf32>
-    %1272 = stablehlo.convert %1271 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1273 = stablehlo.reshape %1272 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1274 = stablehlo.dot_general %1257, %arg193, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1275 = stablehlo.broadcast_in_dim %1274, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1276 = stablehlo.multiply %1275, %127 : tensor<256x1280xf32>
-    %1277 = stablehlo.broadcast_in_dim %1276, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1278 = stablehlo.broadcast_in_dim %arg194, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1279 = stablehlo.add %1277, %1278 : tensor<256x1280xf32>
-    %1280 = stablehlo.convert %1279 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1281 = stablehlo.reshape %1280 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1282 = stablehlo.reshape %1265 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1283 = stablehlo.transpose %1282, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1284 = stablehlo.reshape %1273 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1285 = stablehlo.transpose %1284, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1286 = stablehlo.reshape %1281 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %1287 = stablehlo.transpose %1286, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1288 = stablehlo.transpose %1285, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %1289 = stablehlo.reshape %1283 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1290 = stablehlo.reshape %1288 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1291 = stablehlo.broadcast_in_dim %1290, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1292 = stablehlo.dot_general %1289, %1291, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %1293 = stablehlo.reshape %1292 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1294 = stablehlo.broadcast_in_dim %1293, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1295 = stablehlo.divide %1294, %309 : tensor<1x8x256x256xbf16>
-    %1296 = stablehlo.convert %1295 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %1297 = stablehlo.reduce(%1296 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1298 = stablehlo.reshape %1297 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1299 = stablehlo.broadcast_in_dim %1296, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1300 = stablehlo.broadcast_in_dim %1298, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1301 = stablehlo.subtract %1299, %1300 : tensor<1x8x256x256xf32>
-    %1302 = stablehlo.exponential %1301 : tensor<1x8x256x256xf32>
-    %1303 = stablehlo.reduce(%1302 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1304 = stablehlo.reshape %1303 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1305 = stablehlo.broadcast_in_dim %1302, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1306 = stablehlo.broadcast_in_dim %1304, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1307 = stablehlo.divide %1305, %1306 : tensor<1x8x256x256xf32>
-    %1308 = stablehlo.convert %1307 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %1309 = stablehlo.reshape %1308 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %1310 = stablehlo.reshape %1287 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1311 = stablehlo.broadcast_in_dim %1310, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1312 = stablehlo.dot_general %1309, %1311, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1313 = stablehlo.reshape %1312 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1314 = stablehlo.transpose %1313, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %1315 = stablehlo.reshape %1314 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %1316 = stablehlo.reshape %1315 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1317 = stablehlo.convert %1316 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1318 = stablehlo.dot_general %1317, %arg195, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1319 = stablehlo.broadcast_in_dim %1318, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1320 = stablehlo.multiply %1319, %127 : tensor<256x1280xf32>
-    %1321 = stablehlo.broadcast_in_dim %1320, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1322 = stablehlo.broadcast_in_dim %arg196, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1323 = stablehlo.add %1321, %1322 : tensor<256x1280xf32>
-    %1324 = stablehlo.convert %1323 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1325 = stablehlo.reshape %1324 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1326 = stablehlo.add %1325, %1218 : tensor<1x256x1280xbf16>
-    %1327 = stablehlo.convert %1326 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1328 = stablehlo.convert %1327 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1329 = stablehlo.reduce(%1328 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1330 = stablehlo.reshape %1329 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1331 = stablehlo.broadcast_in_dim %1330, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1332 = stablehlo.divide %1331, %142 : tensor<1x256x1xf64>
-    %1333 = stablehlo.broadcast_in_dim %1328, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1334 = stablehlo.broadcast_in_dim %1332, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1335 = stablehlo.subtract %1333, %1334 : tensor<1x256x1280xf64>
-    %1336 = stablehlo.multiply %1335, %1335 : tensor<1x256x1280xf64>
-    %1337 = stablehlo.reduce(%1336 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1338 = stablehlo.reshape %1337 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1339 = stablehlo.broadcast_in_dim %1338, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1340 = stablehlo.divide %1339, %142 : tensor<1x256x1xf64>
-    %1341 = stablehlo.convert %1340 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1342 = stablehlo.reduce(%1327 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1343 = stablehlo.reshape %1342 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1344 = stablehlo.broadcast_in_dim %1343, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1345 = stablehlo.divide %1344, %158 : tensor<1x256x1xf32>
-    %1346 = stablehlo.broadcast_in_dim %1341, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1347 = stablehlo.add %1346, %161 : tensor<1x256x1xf32>
-    %1348 = stablehlo.rsqrt %1347 : tensor<1x256x1xf32>
-    %1349 = stablehlo.broadcast_in_dim %1327, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1350 = stablehlo.broadcast_in_dim %1345, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1351 = stablehlo.subtract %1349, %1350 : tensor<1x256x1280xf32>
-    %1352 = stablehlo.broadcast_in_dim %1351, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1353 = stablehlo.broadcast_in_dim %1348, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1354 = stablehlo.multiply %1352, %1353 : tensor<1x256x1280xf32>
-    %1355 = stablehlo.convert %arg29 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1356 = stablehlo.broadcast_in_dim %1354, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1357 = stablehlo.broadcast_in_dim %1355, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1358 = stablehlo.multiply %1356, %1357 : tensor<1x256x1280xf32>
-    %1359 = stablehlo.convert %arg30 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1360 = stablehlo.broadcast_in_dim %1358, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1361 = stablehlo.broadcast_in_dim %1359, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1362 = stablehlo.add %1360, %1361 : tensor<1x256x1280xf32>
-    %1363 = stablehlo.convert %1362 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1364 = stablehlo.reshape %1363 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1365 = stablehlo.convert %1364 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1366 = stablehlo.dot_general %1365, %arg197, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1367 = stablehlo.broadcast_in_dim %1366, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1368 = stablehlo.multiply %1367, %127 : tensor<256x1280xf32>
-    %1369 = stablehlo.broadcast_in_dim %1368, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1370 = stablehlo.broadcast_in_dim %arg198, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1371 = stablehlo.add %1369, %1370 : tensor<256x1280xf32>
-    %1372 = stablehlo.convert %1371 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1373 = stablehlo.reshape %1372 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1374 = stablehlo.multiply %1373, %cst_4 : tensor<1x256x1280xbf16>
-    %1375 = stablehlo.multiply %1373, %190 : tensor<1x256x1280xbf16>
-    %1376 = stablehlo.convert %1375 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1377 = stablehlo.clamp %cst_5, %1376, %cst_6 : tensor<1x256x1280xf32>
-    %1378 = stablehlo.multiply %1377, %1377 : tensor<1x256x1280xf32>
-    %1379 = stablehlo.multiply %cst_7, %1378 : tensor<1x256x1280xf32>
-    %1380 = stablehlo.add %1379, %cst_8 : tensor<1x256x1280xf32>
-    %1381 = stablehlo.multiply %1380, %1378 : tensor<1x256x1280xf32>
-    %1382 = stablehlo.add %1381, %cst_9 : tensor<1x256x1280xf32>
-    %1383 = stablehlo.multiply %1382, %1378 : tensor<1x256x1280xf32>
-    %1384 = stablehlo.add %1383, %cst_10 : tensor<1x256x1280xf32>
-    %1385 = stablehlo.multiply %1384, %1378 : tensor<1x256x1280xf32>
-    %1386 = stablehlo.add %1385, %cst_11 : tensor<1x256x1280xf32>
-    %1387 = stablehlo.multiply %1386, %1378 : tensor<1x256x1280xf32>
-    %1388 = stablehlo.add %1387, %cst_12 : tensor<1x256x1280xf32>
-    %1389 = stablehlo.multiply %1388, %1378 : tensor<1x256x1280xf32>
-    %1390 = stablehlo.add %1389, %cst_13 : tensor<1x256x1280xf32>
-    %1391 = stablehlo.multiply %cst_14, %1378 : tensor<1x256x1280xf32>
-    %1392 = stablehlo.add %1391, %cst_15 : tensor<1x256x1280xf32>
-    %1393 = stablehlo.multiply %1392, %1378 : tensor<1x256x1280xf32>
-    %1394 = stablehlo.add %1393, %cst_16 : tensor<1x256x1280xf32>
-    %1395 = stablehlo.multiply %1394, %1378 : tensor<1x256x1280xf32>
-    %1396 = stablehlo.add %1395, %cst_17 : tensor<1x256x1280xf32>
-    %1397 = stablehlo.multiply %1396, %1378 : tensor<1x256x1280xf32>
-    %1398 = stablehlo.add %1397, %cst_18 : tensor<1x256x1280xf32>
-    %1399 = stablehlo.multiply %1377, %1390 : tensor<1x256x1280xf32>
-    %1400 = stablehlo.divide %1399, %1398 : tensor<1x256x1280xf32>
-    %1401 = stablehlo.clamp %cst_19, %1400, %cst_20 : tensor<1x256x1280xf32>
-    %1402 = stablehlo.convert %1401 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1403 = stablehlo.add %1402, %cst_2 : tensor<1x256x1280xbf16>
-    %1404 = stablehlo.multiply %1403, %1374 : tensor<1x256x1280xbf16>
-    %1405 = stablehlo.reshape %1404 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1406 = stablehlo.convert %1405 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1407 = stablehlo.dot_general %1406, %arg199, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1408 = stablehlo.broadcast_in_dim %1407, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1409 = stablehlo.multiply %1408, %127 : tensor<256x1280xf32>
-    %1410 = stablehlo.broadcast_in_dim %1409, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1411 = stablehlo.broadcast_in_dim %arg200, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1412 = stablehlo.add %1410, %1411 : tensor<256x1280xf32>
-    %1413 = stablehlo.convert %1412 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1414 = stablehlo.reshape %1413 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1415 = stablehlo.add %1414, %1326 : tensor<1x256x1280xbf16>
-    %1416 = stablehlo.convert %1415 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1417 = stablehlo.convert %1416 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1418 = stablehlo.reduce(%1417 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1419 = stablehlo.reshape %1418 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1420 = stablehlo.broadcast_in_dim %1419, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1421 = stablehlo.divide %1420, %142 : tensor<1x256x1xf64>
-    %1422 = stablehlo.broadcast_in_dim %1417, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1423 = stablehlo.broadcast_in_dim %1421, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1424 = stablehlo.subtract %1422, %1423 : tensor<1x256x1280xf64>
-    %1425 = stablehlo.multiply %1424, %1424 : tensor<1x256x1280xf64>
-    %1426 = stablehlo.reduce(%1425 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1427 = stablehlo.reshape %1426 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1428 = stablehlo.broadcast_in_dim %1427, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1429 = stablehlo.divide %1428, %142 : tensor<1x256x1xf64>
-    %1430 = stablehlo.convert %1429 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1431 = stablehlo.reduce(%1416 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1432 = stablehlo.reshape %1431 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1433 = stablehlo.broadcast_in_dim %1432, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1434 = stablehlo.divide %1433, %158 : tensor<1x256x1xf32>
-    %1435 = stablehlo.broadcast_in_dim %1430, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1436 = stablehlo.add %1435, %161 : tensor<1x256x1xf32>
-    %1437 = stablehlo.rsqrt %1436 : tensor<1x256x1xf32>
-    %1438 = stablehlo.broadcast_in_dim %1416, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1439 = stablehlo.broadcast_in_dim %1434, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1440 = stablehlo.subtract %1438, %1439 : tensor<1x256x1280xf32>
-    %1441 = stablehlo.broadcast_in_dim %1440, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1442 = stablehlo.broadcast_in_dim %1437, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1443 = stablehlo.multiply %1441, %1442 : tensor<1x256x1280xf32>
-    %1444 = stablehlo.convert %arg31 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1445 = stablehlo.broadcast_in_dim %1443, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1446 = stablehlo.broadcast_in_dim %1444, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1447 = stablehlo.multiply %1445, %1446 : tensor<1x256x1280xf32>
-    %1448 = stablehlo.convert %arg32 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1449 = stablehlo.broadcast_in_dim %1447, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1450 = stablehlo.broadcast_in_dim %1448, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1451 = stablehlo.add %1449, %1450 : tensor<1x256x1280xf32>
-    %1452 = stablehlo.convert %1451 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1453 = stablehlo.reshape %1452 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1454 = stablehlo.convert %1453 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1455 = stablehlo.dot_general %1454, %arg201, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1456 = stablehlo.broadcast_in_dim %1455, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1457 = stablehlo.multiply %1456, %273 : tensor<256x256xf32>
-    %1458 = stablehlo.broadcast_in_dim %1457, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1459 = stablehlo.broadcast_in_dim %arg202, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1460 = stablehlo.add %1458, %1459 : tensor<256x256xf32>
-    %1461 = stablehlo.convert %1460 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1462 = stablehlo.reshape %1461 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1463 = stablehlo.dot_general %1454, %arg203, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1464 = stablehlo.broadcast_in_dim %1463, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1465 = stablehlo.multiply %1464, %273 : tensor<256x256xf32>
-    %1466 = stablehlo.broadcast_in_dim %1465, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1467 = stablehlo.broadcast_in_dim %arg204, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1468 = stablehlo.add %1466, %1467 : tensor<256x256xf32>
-    %1469 = stablehlo.convert %1468 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1470 = stablehlo.reshape %1469 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1471 = stablehlo.dot_general %1454, %arg205, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1472 = stablehlo.broadcast_in_dim %1471, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1473 = stablehlo.multiply %1472, %127 : tensor<256x1280xf32>
-    %1474 = stablehlo.broadcast_in_dim %1473, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1475 = stablehlo.broadcast_in_dim %arg206, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1476 = stablehlo.add %1474, %1475 : tensor<256x1280xf32>
-    %1477 = stablehlo.convert %1476 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1478 = stablehlo.reshape %1477 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1479 = stablehlo.reshape %1462 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1480 = stablehlo.transpose %1479, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1481 = stablehlo.reshape %1470 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1482 = stablehlo.transpose %1481, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1483 = stablehlo.reshape %1478 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %1484 = stablehlo.transpose %1483, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1485 = stablehlo.transpose %1482, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %1486 = stablehlo.reshape %1480 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1487 = stablehlo.reshape %1485 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1488 = stablehlo.broadcast_in_dim %1487, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1489 = stablehlo.dot_general %1486, %1488, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %1490 = stablehlo.reshape %1489 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1491 = stablehlo.broadcast_in_dim %1490, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1492 = stablehlo.divide %1491, %309 : tensor<1x8x256x256xbf16>
-    %1493 = stablehlo.convert %1492 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %1494 = stablehlo.reduce(%1493 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1495 = stablehlo.reshape %1494 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1496 = stablehlo.broadcast_in_dim %1493, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1497 = stablehlo.broadcast_in_dim %1495, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1498 = stablehlo.subtract %1496, %1497 : tensor<1x8x256x256xf32>
-    %1499 = stablehlo.exponential %1498 : tensor<1x8x256x256xf32>
-    %1500 = stablehlo.reduce(%1499 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1501 = stablehlo.reshape %1500 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1502 = stablehlo.broadcast_in_dim %1499, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1503 = stablehlo.broadcast_in_dim %1501, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1504 = stablehlo.divide %1502, %1503 : tensor<1x8x256x256xf32>
-    %1505 = stablehlo.convert %1504 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %1506 = stablehlo.reshape %1505 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %1507 = stablehlo.reshape %1484 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1508 = stablehlo.broadcast_in_dim %1507, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1509 = stablehlo.dot_general %1506, %1508, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1510 = stablehlo.reshape %1509 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1511 = stablehlo.transpose %1510, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %1512 = stablehlo.reshape %1511 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %1513 = stablehlo.reshape %1512 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1514 = stablehlo.convert %1513 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1515 = stablehlo.dot_general %1514, %arg207, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1516 = stablehlo.broadcast_in_dim %1515, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1517 = stablehlo.multiply %1516, %127 : tensor<256x1280xf32>
-    %1518 = stablehlo.broadcast_in_dim %1517, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1519 = stablehlo.broadcast_in_dim %arg208, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1520 = stablehlo.add %1518, %1519 : tensor<256x1280xf32>
-    %1521 = stablehlo.convert %1520 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1522 = stablehlo.reshape %1521 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1523 = stablehlo.add %1522, %1415 : tensor<1x256x1280xbf16>
-    %1524 = stablehlo.convert %1523 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1525 = stablehlo.convert %1524 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1526 = stablehlo.reduce(%1525 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1527 = stablehlo.reshape %1526 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1528 = stablehlo.broadcast_in_dim %1527, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1529 = stablehlo.divide %1528, %142 : tensor<1x256x1xf64>
-    %1530 = stablehlo.broadcast_in_dim %1525, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1531 = stablehlo.broadcast_in_dim %1529, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1532 = stablehlo.subtract %1530, %1531 : tensor<1x256x1280xf64>
-    %1533 = stablehlo.multiply %1532, %1532 : tensor<1x256x1280xf64>
-    %1534 = stablehlo.reduce(%1533 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1535 = stablehlo.reshape %1534 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1536 = stablehlo.broadcast_in_dim %1535, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1537 = stablehlo.divide %1536, %142 : tensor<1x256x1xf64>
-    %1538 = stablehlo.convert %1537 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1539 = stablehlo.reduce(%1524 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1540 = stablehlo.reshape %1539 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1541 = stablehlo.broadcast_in_dim %1540, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1542 = stablehlo.divide %1541, %158 : tensor<1x256x1xf32>
-    %1543 = stablehlo.broadcast_in_dim %1538, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1544 = stablehlo.add %1543, %161 : tensor<1x256x1xf32>
-    %1545 = stablehlo.rsqrt %1544 : tensor<1x256x1xf32>
-    %1546 = stablehlo.broadcast_in_dim %1524, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1547 = stablehlo.broadcast_in_dim %1542, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1548 = stablehlo.subtract %1546, %1547 : tensor<1x256x1280xf32>
-    %1549 = stablehlo.broadcast_in_dim %1548, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1550 = stablehlo.broadcast_in_dim %1545, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1551 = stablehlo.multiply %1549, %1550 : tensor<1x256x1280xf32>
-    %1552 = stablehlo.convert %arg33 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1553 = stablehlo.broadcast_in_dim %1551, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1554 = stablehlo.broadcast_in_dim %1552, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1555 = stablehlo.multiply %1553, %1554 : tensor<1x256x1280xf32>
-    %1556 = stablehlo.convert %arg34 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1557 = stablehlo.broadcast_in_dim %1555, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1558 = stablehlo.broadcast_in_dim %1556, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1559 = stablehlo.add %1557, %1558 : tensor<1x256x1280xf32>
-    %1560 = stablehlo.convert %1559 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1561 = stablehlo.reshape %1560 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1562 = stablehlo.convert %1561 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1563 = stablehlo.dot_general %1562, %arg209, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1564 = stablehlo.broadcast_in_dim %1563, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1565 = stablehlo.multiply %1564, %127 : tensor<256x1280xf32>
-    %1566 = stablehlo.broadcast_in_dim %1565, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1567 = stablehlo.broadcast_in_dim %arg210, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1568 = stablehlo.add %1566, %1567 : tensor<256x1280xf32>
-    %1569 = stablehlo.convert %1568 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1570 = stablehlo.reshape %1569 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1571 = stablehlo.multiply %1570, %cst_4 : tensor<1x256x1280xbf16>
-    %1572 = stablehlo.multiply %1570, %190 : tensor<1x256x1280xbf16>
-    %1573 = stablehlo.convert %1572 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1574 = stablehlo.clamp %cst_5, %1573, %cst_6 : tensor<1x256x1280xf32>
-    %1575 = stablehlo.multiply %1574, %1574 : tensor<1x256x1280xf32>
-    %1576 = stablehlo.multiply %cst_7, %1575 : tensor<1x256x1280xf32>
-    %1577 = stablehlo.add %1576, %cst_8 : tensor<1x256x1280xf32>
-    %1578 = stablehlo.multiply %1577, %1575 : tensor<1x256x1280xf32>
-    %1579 = stablehlo.add %1578, %cst_9 : tensor<1x256x1280xf32>
-    %1580 = stablehlo.multiply %1579, %1575 : tensor<1x256x1280xf32>
-    %1581 = stablehlo.add %1580, %cst_10 : tensor<1x256x1280xf32>
-    %1582 = stablehlo.multiply %1581, %1575 : tensor<1x256x1280xf32>
-    %1583 = stablehlo.add %1582, %cst_11 : tensor<1x256x1280xf32>
-    %1584 = stablehlo.multiply %1583, %1575 : tensor<1x256x1280xf32>
-    %1585 = stablehlo.add %1584, %cst_12 : tensor<1x256x1280xf32>
-    %1586 = stablehlo.multiply %1585, %1575 : tensor<1x256x1280xf32>
-    %1587 = stablehlo.add %1586, %cst_13 : tensor<1x256x1280xf32>
-    %1588 = stablehlo.multiply %cst_14, %1575 : tensor<1x256x1280xf32>
-    %1589 = stablehlo.add %1588, %cst_15 : tensor<1x256x1280xf32>
-    %1590 = stablehlo.multiply %1589, %1575 : tensor<1x256x1280xf32>
-    %1591 = stablehlo.add %1590, %cst_16 : tensor<1x256x1280xf32>
-    %1592 = stablehlo.multiply %1591, %1575 : tensor<1x256x1280xf32>
-    %1593 = stablehlo.add %1592, %cst_17 : tensor<1x256x1280xf32>
-    %1594 = stablehlo.multiply %1593, %1575 : tensor<1x256x1280xf32>
-    %1595 = stablehlo.add %1594, %cst_18 : tensor<1x256x1280xf32>
-    %1596 = stablehlo.multiply %1574, %1587 : tensor<1x256x1280xf32>
-    %1597 = stablehlo.divide %1596, %1595 : tensor<1x256x1280xf32>
-    %1598 = stablehlo.clamp %cst_19, %1597, %cst_20 : tensor<1x256x1280xf32>
-    %1599 = stablehlo.convert %1598 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1600 = stablehlo.add %1599, %cst_2 : tensor<1x256x1280xbf16>
-    %1601 = stablehlo.multiply %1600, %1571 : tensor<1x256x1280xbf16>
-    %1602 = stablehlo.reshape %1601 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1603 = stablehlo.convert %1602 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1604 = stablehlo.dot_general %1603, %arg211, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1605 = stablehlo.broadcast_in_dim %1604, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1606 = stablehlo.multiply %1605, %127 : tensor<256x1280xf32>
-    %1607 = stablehlo.broadcast_in_dim %1606, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1608 = stablehlo.broadcast_in_dim %arg212, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1609 = stablehlo.add %1607, %1608 : tensor<256x1280xf32>
-    %1610 = stablehlo.convert %1609 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1611 = stablehlo.reshape %1610 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1612 = stablehlo.add %1611, %1523 : tensor<1x256x1280xbf16>
-    %1613 = stablehlo.convert %1612 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1614 = stablehlo.convert %1613 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1615 = stablehlo.reduce(%1614 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1616 = stablehlo.reshape %1615 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1617 = stablehlo.broadcast_in_dim %1616, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1618 = stablehlo.divide %1617, %142 : tensor<1x256x1xf64>
-    %1619 = stablehlo.broadcast_in_dim %1614, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1620 = stablehlo.broadcast_in_dim %1618, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1621 = stablehlo.subtract %1619, %1620 : tensor<1x256x1280xf64>
-    %1622 = stablehlo.multiply %1621, %1621 : tensor<1x256x1280xf64>
-    %1623 = stablehlo.reduce(%1622 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1624 = stablehlo.reshape %1623 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1625 = stablehlo.broadcast_in_dim %1624, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1626 = stablehlo.divide %1625, %142 : tensor<1x256x1xf64>
-    %1627 = stablehlo.convert %1626 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1628 = stablehlo.reduce(%1613 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1629 = stablehlo.reshape %1628 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1630 = stablehlo.broadcast_in_dim %1629, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1631 = stablehlo.divide %1630, %158 : tensor<1x256x1xf32>
-    %1632 = stablehlo.broadcast_in_dim %1627, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1633 = stablehlo.add %1632, %161 : tensor<1x256x1xf32>
-    %1634 = stablehlo.rsqrt %1633 : tensor<1x256x1xf32>
-    %1635 = stablehlo.broadcast_in_dim %1613, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1636 = stablehlo.broadcast_in_dim %1631, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1637 = stablehlo.subtract %1635, %1636 : tensor<1x256x1280xf32>
-    %1638 = stablehlo.broadcast_in_dim %1637, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1639 = stablehlo.broadcast_in_dim %1634, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1640 = stablehlo.multiply %1638, %1639 : tensor<1x256x1280xf32>
-    %1641 = stablehlo.convert %arg35 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1642 = stablehlo.broadcast_in_dim %1640, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1643 = stablehlo.broadcast_in_dim %1641, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1644 = stablehlo.multiply %1642, %1643 : tensor<1x256x1280xf32>
-    %1645 = stablehlo.convert %arg36 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1646 = stablehlo.broadcast_in_dim %1644, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1647 = stablehlo.broadcast_in_dim %1645, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1648 = stablehlo.add %1646, %1647 : tensor<1x256x1280xf32>
-    %1649 = stablehlo.convert %1648 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1650 = stablehlo.reshape %1649 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1651 = stablehlo.convert %1650 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1652 = stablehlo.dot_general %1651, %arg213, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1653 = stablehlo.broadcast_in_dim %1652, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1654 = stablehlo.multiply %1653, %273 : tensor<256x256xf32>
-    %1655 = stablehlo.broadcast_in_dim %1654, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1656 = stablehlo.broadcast_in_dim %arg214, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1657 = stablehlo.add %1655, %1656 : tensor<256x256xf32>
-    %1658 = stablehlo.convert %1657 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1659 = stablehlo.reshape %1658 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1660 = stablehlo.dot_general %1651, %arg215, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1661 = stablehlo.broadcast_in_dim %1660, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1662 = stablehlo.multiply %1661, %273 : tensor<256x256xf32>
-    %1663 = stablehlo.broadcast_in_dim %1662, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1664 = stablehlo.broadcast_in_dim %arg216, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1665 = stablehlo.add %1663, %1664 : tensor<256x256xf32>
-    %1666 = stablehlo.convert %1665 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1667 = stablehlo.reshape %1666 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1668 = stablehlo.dot_general %1651, %arg217, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1669 = stablehlo.broadcast_in_dim %1668, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1670 = stablehlo.multiply %1669, %127 : tensor<256x1280xf32>
-    %1671 = stablehlo.broadcast_in_dim %1670, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1672 = stablehlo.broadcast_in_dim %arg218, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1673 = stablehlo.add %1671, %1672 : tensor<256x1280xf32>
-    %1674 = stablehlo.convert %1673 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1675 = stablehlo.reshape %1674 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1676 = stablehlo.reshape %1659 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1677 = stablehlo.transpose %1676, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1678 = stablehlo.reshape %1667 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1679 = stablehlo.transpose %1678, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1680 = stablehlo.reshape %1675 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %1681 = stablehlo.transpose %1680, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1682 = stablehlo.transpose %1679, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %1683 = stablehlo.reshape %1677 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1684 = stablehlo.reshape %1682 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1685 = stablehlo.broadcast_in_dim %1684, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1686 = stablehlo.dot_general %1683, %1685, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %1687 = stablehlo.reshape %1686 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1688 = stablehlo.broadcast_in_dim %1687, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1689 = stablehlo.divide %1688, %309 : tensor<1x8x256x256xbf16>
-    %1690 = stablehlo.convert %1689 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %1691 = stablehlo.reduce(%1690 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1692 = stablehlo.reshape %1691 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1693 = stablehlo.broadcast_in_dim %1690, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1694 = stablehlo.broadcast_in_dim %1692, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1695 = stablehlo.subtract %1693, %1694 : tensor<1x8x256x256xf32>
-    %1696 = stablehlo.exponential %1695 : tensor<1x8x256x256xf32>
-    %1697 = stablehlo.reduce(%1696 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1698 = stablehlo.reshape %1697 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1699 = stablehlo.broadcast_in_dim %1696, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1700 = stablehlo.broadcast_in_dim %1698, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1701 = stablehlo.divide %1699, %1700 : tensor<1x8x256x256xf32>
-    %1702 = stablehlo.convert %1701 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %1703 = stablehlo.reshape %1702 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %1704 = stablehlo.reshape %1681 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1705 = stablehlo.broadcast_in_dim %1704, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1706 = stablehlo.dot_general %1703, %1705, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1707 = stablehlo.reshape %1706 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1708 = stablehlo.transpose %1707, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %1709 = stablehlo.reshape %1708 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %1710 = stablehlo.reshape %1709 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1711 = stablehlo.convert %1710 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1712 = stablehlo.dot_general %1711, %arg219, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1713 = stablehlo.broadcast_in_dim %1712, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1714 = stablehlo.multiply %1713, %127 : tensor<256x1280xf32>
-    %1715 = stablehlo.broadcast_in_dim %1714, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1716 = stablehlo.broadcast_in_dim %arg220, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1717 = stablehlo.add %1715, %1716 : tensor<256x1280xf32>
-    %1718 = stablehlo.convert %1717 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1719 = stablehlo.reshape %1718 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1720 = stablehlo.add %1719, %1612 : tensor<1x256x1280xbf16>
-    %1721 = stablehlo.convert %1720 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1722 = stablehlo.convert %1721 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1723 = stablehlo.reduce(%1722 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1724 = stablehlo.reshape %1723 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1725 = stablehlo.broadcast_in_dim %1724, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1726 = stablehlo.divide %1725, %142 : tensor<1x256x1xf64>
-    %1727 = stablehlo.broadcast_in_dim %1722, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1728 = stablehlo.broadcast_in_dim %1726, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1729 = stablehlo.subtract %1727, %1728 : tensor<1x256x1280xf64>
-    %1730 = stablehlo.multiply %1729, %1729 : tensor<1x256x1280xf64>
-    %1731 = stablehlo.reduce(%1730 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1732 = stablehlo.reshape %1731 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1733 = stablehlo.broadcast_in_dim %1732, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1734 = stablehlo.divide %1733, %142 : tensor<1x256x1xf64>
-    %1735 = stablehlo.convert %1734 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1736 = stablehlo.reduce(%1721 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1737 = stablehlo.reshape %1736 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1738 = stablehlo.broadcast_in_dim %1737, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1739 = stablehlo.divide %1738, %158 : tensor<1x256x1xf32>
-    %1740 = stablehlo.broadcast_in_dim %1735, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1741 = stablehlo.add %1740, %161 : tensor<1x256x1xf32>
-    %1742 = stablehlo.rsqrt %1741 : tensor<1x256x1xf32>
-    %1743 = stablehlo.broadcast_in_dim %1721, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1744 = stablehlo.broadcast_in_dim %1739, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1745 = stablehlo.subtract %1743, %1744 : tensor<1x256x1280xf32>
-    %1746 = stablehlo.broadcast_in_dim %1745, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1747 = stablehlo.broadcast_in_dim %1742, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1748 = stablehlo.multiply %1746, %1747 : tensor<1x256x1280xf32>
-    %1749 = stablehlo.convert %arg37 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1750 = stablehlo.broadcast_in_dim %1748, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1751 = stablehlo.broadcast_in_dim %1749, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1752 = stablehlo.multiply %1750, %1751 : tensor<1x256x1280xf32>
-    %1753 = stablehlo.convert %arg38 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1754 = stablehlo.broadcast_in_dim %1752, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1755 = stablehlo.broadcast_in_dim %1753, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1756 = stablehlo.add %1754, %1755 : tensor<1x256x1280xf32>
-    %1757 = stablehlo.convert %1756 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1758 = stablehlo.reshape %1757 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1759 = stablehlo.convert %1758 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1760 = stablehlo.dot_general %1759, %arg221, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1761 = stablehlo.broadcast_in_dim %1760, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1762 = stablehlo.multiply %1761, %127 : tensor<256x1280xf32>
-    %1763 = stablehlo.broadcast_in_dim %1762, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1764 = stablehlo.broadcast_in_dim %arg222, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1765 = stablehlo.add %1763, %1764 : tensor<256x1280xf32>
-    %1766 = stablehlo.convert %1765 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1767 = stablehlo.reshape %1766 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1768 = stablehlo.multiply %1767, %cst_4 : tensor<1x256x1280xbf16>
-    %1769 = stablehlo.multiply %1767, %190 : tensor<1x256x1280xbf16>
-    %1770 = stablehlo.convert %1769 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1771 = stablehlo.clamp %cst_5, %1770, %cst_6 : tensor<1x256x1280xf32>
-    %1772 = stablehlo.multiply %1771, %1771 : tensor<1x256x1280xf32>
-    %1773 = stablehlo.multiply %cst_7, %1772 : tensor<1x256x1280xf32>
-    %1774 = stablehlo.add %1773, %cst_8 : tensor<1x256x1280xf32>
-    %1775 = stablehlo.multiply %1774, %1772 : tensor<1x256x1280xf32>
-    %1776 = stablehlo.add %1775, %cst_9 : tensor<1x256x1280xf32>
-    %1777 = stablehlo.multiply %1776, %1772 : tensor<1x256x1280xf32>
-    %1778 = stablehlo.add %1777, %cst_10 : tensor<1x256x1280xf32>
-    %1779 = stablehlo.multiply %1778, %1772 : tensor<1x256x1280xf32>
-    %1780 = stablehlo.add %1779, %cst_11 : tensor<1x256x1280xf32>
-    %1781 = stablehlo.multiply %1780, %1772 : tensor<1x256x1280xf32>
-    %1782 = stablehlo.add %1781, %cst_12 : tensor<1x256x1280xf32>
-    %1783 = stablehlo.multiply %1782, %1772 : tensor<1x256x1280xf32>
-    %1784 = stablehlo.add %1783, %cst_13 : tensor<1x256x1280xf32>
-    %1785 = stablehlo.multiply %cst_14, %1772 : tensor<1x256x1280xf32>
-    %1786 = stablehlo.add %1785, %cst_15 : tensor<1x256x1280xf32>
-    %1787 = stablehlo.multiply %1786, %1772 : tensor<1x256x1280xf32>
-    %1788 = stablehlo.add %1787, %cst_16 : tensor<1x256x1280xf32>
-    %1789 = stablehlo.multiply %1788, %1772 : tensor<1x256x1280xf32>
-    %1790 = stablehlo.add %1789, %cst_17 : tensor<1x256x1280xf32>
-    %1791 = stablehlo.multiply %1790, %1772 : tensor<1x256x1280xf32>
-    %1792 = stablehlo.add %1791, %cst_18 : tensor<1x256x1280xf32>
-    %1793 = stablehlo.multiply %1771, %1784 : tensor<1x256x1280xf32>
-    %1794 = stablehlo.divide %1793, %1792 : tensor<1x256x1280xf32>
-    %1795 = stablehlo.clamp %cst_19, %1794, %cst_20 : tensor<1x256x1280xf32>
-    %1796 = stablehlo.convert %1795 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1797 = stablehlo.add %1796, %cst_2 : tensor<1x256x1280xbf16>
-    %1798 = stablehlo.multiply %1797, %1768 : tensor<1x256x1280xbf16>
-    %1799 = stablehlo.reshape %1798 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1800 = stablehlo.convert %1799 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1801 = stablehlo.dot_general %1800, %arg223, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1802 = stablehlo.broadcast_in_dim %1801, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1803 = stablehlo.multiply %1802, %127 : tensor<256x1280xf32>
-    %1804 = stablehlo.broadcast_in_dim %1803, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1805 = stablehlo.broadcast_in_dim %arg224, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1806 = stablehlo.add %1804, %1805 : tensor<256x1280xf32>
-    %1807 = stablehlo.convert %1806 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1808 = stablehlo.reshape %1807 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1809 = stablehlo.add %1808, %1720 : tensor<1x256x1280xbf16>
-    %1810 = stablehlo.convert %1809 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1811 = stablehlo.convert %1810 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1812 = stablehlo.reduce(%1811 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1813 = stablehlo.reshape %1812 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1814 = stablehlo.broadcast_in_dim %1813, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1815 = stablehlo.divide %1814, %142 : tensor<1x256x1xf64>
-    %1816 = stablehlo.broadcast_in_dim %1811, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1817 = stablehlo.broadcast_in_dim %1815, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1818 = stablehlo.subtract %1816, %1817 : tensor<1x256x1280xf64>
-    %1819 = stablehlo.multiply %1818, %1818 : tensor<1x256x1280xf64>
-    %1820 = stablehlo.reduce(%1819 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1821 = stablehlo.reshape %1820 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1822 = stablehlo.broadcast_in_dim %1821, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1823 = stablehlo.divide %1822, %142 : tensor<1x256x1xf64>
-    %1824 = stablehlo.convert %1823 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1825 = stablehlo.reduce(%1810 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1826 = stablehlo.reshape %1825 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1827 = stablehlo.broadcast_in_dim %1826, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1828 = stablehlo.divide %1827, %158 : tensor<1x256x1xf32>
-    %1829 = stablehlo.broadcast_in_dim %1824, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1830 = stablehlo.add %1829, %161 : tensor<1x256x1xf32>
-    %1831 = stablehlo.rsqrt %1830 : tensor<1x256x1xf32>
-    %1832 = stablehlo.broadcast_in_dim %1810, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1833 = stablehlo.broadcast_in_dim %1828, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1834 = stablehlo.subtract %1832, %1833 : tensor<1x256x1280xf32>
-    %1835 = stablehlo.broadcast_in_dim %1834, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1836 = stablehlo.broadcast_in_dim %1831, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1837 = stablehlo.multiply %1835, %1836 : tensor<1x256x1280xf32>
-    %1838 = stablehlo.convert %arg39 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1839 = stablehlo.broadcast_in_dim %1837, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1840 = stablehlo.broadcast_in_dim %1838, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1841 = stablehlo.multiply %1839, %1840 : tensor<1x256x1280xf32>
-    %1842 = stablehlo.convert %arg40 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1843 = stablehlo.broadcast_in_dim %1841, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1844 = stablehlo.broadcast_in_dim %1842, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1845 = stablehlo.add %1843, %1844 : tensor<1x256x1280xf32>
-    %1846 = stablehlo.convert %1845 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1847 = stablehlo.reshape %1846 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1848 = stablehlo.convert %1847 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1849 = stablehlo.dot_general %1848, %arg225, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1850 = stablehlo.broadcast_in_dim %1849, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1851 = stablehlo.multiply %1850, %273 : tensor<256x256xf32>
-    %1852 = stablehlo.broadcast_in_dim %1851, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1853 = stablehlo.broadcast_in_dim %arg226, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1854 = stablehlo.add %1852, %1853 : tensor<256x256xf32>
-    %1855 = stablehlo.convert %1854 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1856 = stablehlo.reshape %1855 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1857 = stablehlo.dot_general %1848, %arg227, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %1858 = stablehlo.broadcast_in_dim %1857, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1859 = stablehlo.multiply %1858, %273 : tensor<256x256xf32>
-    %1860 = stablehlo.broadcast_in_dim %1859, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1861 = stablehlo.broadcast_in_dim %arg228, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1862 = stablehlo.add %1860, %1861 : tensor<256x256xf32>
-    %1863 = stablehlo.convert %1862 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1864 = stablehlo.reshape %1863 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1865 = stablehlo.dot_general %1848, %arg229, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1866 = stablehlo.broadcast_in_dim %1865, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1867 = stablehlo.multiply %1866, %127 : tensor<256x1280xf32>
-    %1868 = stablehlo.broadcast_in_dim %1867, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1869 = stablehlo.broadcast_in_dim %arg230, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1870 = stablehlo.add %1868, %1869 : tensor<256x1280xf32>
-    %1871 = stablehlo.convert %1870 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1872 = stablehlo.reshape %1871 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1873 = stablehlo.reshape %1856 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1874 = stablehlo.transpose %1873, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1875 = stablehlo.reshape %1864 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1876 = stablehlo.transpose %1875, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1877 = stablehlo.reshape %1872 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %1878 = stablehlo.transpose %1877, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1879 = stablehlo.transpose %1876, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %1880 = stablehlo.reshape %1874 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1881 = stablehlo.reshape %1879 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1882 = stablehlo.broadcast_in_dim %1881, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1883 = stablehlo.dot_general %1880, %1882, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %1884 = stablehlo.reshape %1883 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1885 = stablehlo.broadcast_in_dim %1884, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1886 = stablehlo.divide %1885, %309 : tensor<1x8x256x256xbf16>
-    %1887 = stablehlo.convert %1886 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %1888 = stablehlo.reduce(%1887 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1889 = stablehlo.reshape %1888 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1890 = stablehlo.broadcast_in_dim %1887, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1891 = stablehlo.broadcast_in_dim %1889, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1892 = stablehlo.subtract %1890, %1891 : tensor<1x8x256x256xf32>
-    %1893 = stablehlo.exponential %1892 : tensor<1x8x256x256xf32>
-    %1894 = stablehlo.reduce(%1893 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1895 = stablehlo.reshape %1894 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1896 = stablehlo.broadcast_in_dim %1893, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1897 = stablehlo.broadcast_in_dim %1895, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1898 = stablehlo.divide %1896, %1897 : tensor<1x8x256x256xf32>
-    %1899 = stablehlo.convert %1898 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %1900 = stablehlo.reshape %1899 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %1901 = stablehlo.reshape %1878 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1902 = stablehlo.broadcast_in_dim %1901, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1903 = stablehlo.dot_general %1900, %1902, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %1904 = stablehlo.reshape %1903 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %1905 = stablehlo.transpose %1904, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %1906 = stablehlo.reshape %1905 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %1907 = stablehlo.reshape %1906 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1908 = stablehlo.convert %1907 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1909 = stablehlo.dot_general %1908, %arg231, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1910 = stablehlo.broadcast_in_dim %1909, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1911 = stablehlo.multiply %1910, %127 : tensor<256x1280xf32>
-    %1912 = stablehlo.broadcast_in_dim %1911, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1913 = stablehlo.broadcast_in_dim %arg232, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1914 = stablehlo.add %1912, %1913 : tensor<256x1280xf32>
-    %1915 = stablehlo.convert %1914 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1916 = stablehlo.reshape %1915 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1917 = stablehlo.add %1916, %1809 : tensor<1x256x1280xbf16>
-    %1918 = stablehlo.convert %1917 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1919 = stablehlo.convert %1918 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %1920 = stablehlo.reduce(%1919 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1921 = stablehlo.reshape %1920 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1922 = stablehlo.broadcast_in_dim %1921, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1923 = stablehlo.divide %1922, %142 : tensor<1x256x1xf64>
-    %1924 = stablehlo.broadcast_in_dim %1919, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %1925 = stablehlo.broadcast_in_dim %1923, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %1926 = stablehlo.subtract %1924, %1925 : tensor<1x256x1280xf64>
-    %1927 = stablehlo.multiply %1926, %1926 : tensor<1x256x1280xf64>
-    %1928 = stablehlo.reduce(%1927 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1929 = stablehlo.reshape %1928 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1930 = stablehlo.broadcast_in_dim %1929, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1931 = stablehlo.divide %1930, %142 : tensor<1x256x1xf64>
-    %1932 = stablehlo.convert %1931 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1933 = stablehlo.reduce(%1918 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1934 = stablehlo.reshape %1933 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1935 = stablehlo.broadcast_in_dim %1934, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1936 = stablehlo.divide %1935, %158 : tensor<1x256x1xf32>
-    %1937 = stablehlo.broadcast_in_dim %1932, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1938 = stablehlo.add %1937, %161 : tensor<1x256x1xf32>
-    %1939 = stablehlo.rsqrt %1938 : tensor<1x256x1xf32>
-    %1940 = stablehlo.broadcast_in_dim %1918, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1941 = stablehlo.broadcast_in_dim %1936, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1942 = stablehlo.subtract %1940, %1941 : tensor<1x256x1280xf32>
-    %1943 = stablehlo.broadcast_in_dim %1942, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1944 = stablehlo.broadcast_in_dim %1939, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %1945 = stablehlo.multiply %1943, %1944 : tensor<1x256x1280xf32>
-    %1946 = stablehlo.convert %arg41 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1947 = stablehlo.broadcast_in_dim %1945, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1948 = stablehlo.broadcast_in_dim %1946, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1949 = stablehlo.multiply %1947, %1948 : tensor<1x256x1280xf32>
-    %1950 = stablehlo.convert %arg42 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %1951 = stablehlo.broadcast_in_dim %1949, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %1952 = stablehlo.broadcast_in_dim %1950, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %1953 = stablehlo.add %1951, %1952 : tensor<1x256x1280xf32>
-    %1954 = stablehlo.convert %1953 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1955 = stablehlo.reshape %1954 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1956 = stablehlo.convert %1955 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1957 = stablehlo.dot_general %1956, %arg233, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1958 = stablehlo.broadcast_in_dim %1957, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1959 = stablehlo.multiply %1958, %127 : tensor<256x1280xf32>
-    %1960 = stablehlo.broadcast_in_dim %1959, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %1961 = stablehlo.broadcast_in_dim %arg234, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %1962 = stablehlo.add %1960, %1961 : tensor<256x1280xf32>
-    %1963 = stablehlo.convert %1962 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %1964 = stablehlo.reshape %1963 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %1965 = stablehlo.multiply %1964, %cst_4 : tensor<1x256x1280xbf16>
-    %1966 = stablehlo.multiply %1964, %190 : tensor<1x256x1280xbf16>
-    %1967 = stablehlo.convert %1966 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %1968 = stablehlo.clamp %cst_5, %1967, %cst_6 : tensor<1x256x1280xf32>
-    %1969 = stablehlo.multiply %1968, %1968 : tensor<1x256x1280xf32>
-    %1970 = stablehlo.multiply %cst_7, %1969 : tensor<1x256x1280xf32>
-    %1971 = stablehlo.add %1970, %cst_8 : tensor<1x256x1280xf32>
-    %1972 = stablehlo.multiply %1971, %1969 : tensor<1x256x1280xf32>
-    %1973 = stablehlo.add %1972, %cst_9 : tensor<1x256x1280xf32>
-    %1974 = stablehlo.multiply %1973, %1969 : tensor<1x256x1280xf32>
-    %1975 = stablehlo.add %1974, %cst_10 : tensor<1x256x1280xf32>
-    %1976 = stablehlo.multiply %1975, %1969 : tensor<1x256x1280xf32>
-    %1977 = stablehlo.add %1976, %cst_11 : tensor<1x256x1280xf32>
-    %1978 = stablehlo.multiply %1977, %1969 : tensor<1x256x1280xf32>
-    %1979 = stablehlo.add %1978, %cst_12 : tensor<1x256x1280xf32>
-    %1980 = stablehlo.multiply %1979, %1969 : tensor<1x256x1280xf32>
-    %1981 = stablehlo.add %1980, %cst_13 : tensor<1x256x1280xf32>
-    %1982 = stablehlo.multiply %cst_14, %1969 : tensor<1x256x1280xf32>
-    %1983 = stablehlo.add %1982, %cst_15 : tensor<1x256x1280xf32>
-    %1984 = stablehlo.multiply %1983, %1969 : tensor<1x256x1280xf32>
-    %1985 = stablehlo.add %1984, %cst_16 : tensor<1x256x1280xf32>
-    %1986 = stablehlo.multiply %1985, %1969 : tensor<1x256x1280xf32>
-    %1987 = stablehlo.add %1986, %cst_17 : tensor<1x256x1280xf32>
-    %1988 = stablehlo.multiply %1987, %1969 : tensor<1x256x1280xf32>
-    %1989 = stablehlo.add %1988, %cst_18 : tensor<1x256x1280xf32>
-    %1990 = stablehlo.multiply %1968, %1981 : tensor<1x256x1280xf32>
-    %1991 = stablehlo.divide %1990, %1989 : tensor<1x256x1280xf32>
-    %1992 = stablehlo.clamp %cst_19, %1991, %cst_20 : tensor<1x256x1280xf32>
-    %1993 = stablehlo.convert %1992 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %1994 = stablehlo.add %1993, %cst_2 : tensor<1x256x1280xbf16>
-    %1995 = stablehlo.multiply %1994, %1965 : tensor<1x256x1280xbf16>
-    %1996 = stablehlo.reshape %1995 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %1997 = stablehlo.convert %1996 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %1998 = stablehlo.dot_general %1997, %arg235, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %1999 = stablehlo.broadcast_in_dim %1998, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2000 = stablehlo.multiply %1999, %127 : tensor<256x1280xf32>
-    %2001 = stablehlo.broadcast_in_dim %2000, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2002 = stablehlo.broadcast_in_dim %arg236, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2003 = stablehlo.add %2001, %2002 : tensor<256x1280xf32>
-    %2004 = stablehlo.convert %2003 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2005 = stablehlo.reshape %2004 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2006 = stablehlo.add %2005, %1917 : tensor<1x256x1280xbf16>
-    %2007 = stablehlo.convert %2006 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2008 = stablehlo.convert %2007 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2009 = stablehlo.reduce(%2008 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2010 = stablehlo.reshape %2009 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2011 = stablehlo.broadcast_in_dim %2010, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2012 = stablehlo.divide %2011, %142 : tensor<1x256x1xf64>
-    %2013 = stablehlo.broadcast_in_dim %2008, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2014 = stablehlo.broadcast_in_dim %2012, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2015 = stablehlo.subtract %2013, %2014 : tensor<1x256x1280xf64>
-    %2016 = stablehlo.multiply %2015, %2015 : tensor<1x256x1280xf64>
-    %2017 = stablehlo.reduce(%2016 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2018 = stablehlo.reshape %2017 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2019 = stablehlo.broadcast_in_dim %2018, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2020 = stablehlo.divide %2019, %142 : tensor<1x256x1xf64>
-    %2021 = stablehlo.convert %2020 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2022 = stablehlo.reduce(%2007 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2023 = stablehlo.reshape %2022 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2024 = stablehlo.broadcast_in_dim %2023, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2025 = stablehlo.divide %2024, %158 : tensor<1x256x1xf32>
-    %2026 = stablehlo.broadcast_in_dim %2021, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2027 = stablehlo.add %2026, %161 : tensor<1x256x1xf32>
-    %2028 = stablehlo.rsqrt %2027 : tensor<1x256x1xf32>
-    %2029 = stablehlo.broadcast_in_dim %2007, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2030 = stablehlo.broadcast_in_dim %2025, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2031 = stablehlo.subtract %2029, %2030 : tensor<1x256x1280xf32>
-    %2032 = stablehlo.broadcast_in_dim %2031, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2033 = stablehlo.broadcast_in_dim %2028, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2034 = stablehlo.multiply %2032, %2033 : tensor<1x256x1280xf32>
-    %2035 = stablehlo.convert %arg43 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2036 = stablehlo.broadcast_in_dim %2034, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2037 = stablehlo.broadcast_in_dim %2035, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2038 = stablehlo.multiply %2036, %2037 : tensor<1x256x1280xf32>
-    %2039 = stablehlo.convert %arg44 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2040 = stablehlo.broadcast_in_dim %2038, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2041 = stablehlo.broadcast_in_dim %2039, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2042 = stablehlo.add %2040, %2041 : tensor<1x256x1280xf32>
-    %2043 = stablehlo.convert %2042 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2044 = stablehlo.reshape %2043 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2045 = stablehlo.convert %2044 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2046 = stablehlo.dot_general %2045, %arg237, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2047 = stablehlo.broadcast_in_dim %2046, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2048 = stablehlo.multiply %2047, %273 : tensor<256x256xf32>
-    %2049 = stablehlo.broadcast_in_dim %2048, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2050 = stablehlo.broadcast_in_dim %arg238, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2051 = stablehlo.add %2049, %2050 : tensor<256x256xf32>
-    %2052 = stablehlo.convert %2051 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2053 = stablehlo.reshape %2052 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2054 = stablehlo.dot_general %2045, %arg239, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2055 = stablehlo.broadcast_in_dim %2054, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2056 = stablehlo.multiply %2055, %273 : tensor<256x256xf32>
-    %2057 = stablehlo.broadcast_in_dim %2056, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2058 = stablehlo.broadcast_in_dim %arg240, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2059 = stablehlo.add %2057, %2058 : tensor<256x256xf32>
-    %2060 = stablehlo.convert %2059 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2061 = stablehlo.reshape %2060 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2062 = stablehlo.dot_general %2045, %arg241, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2063 = stablehlo.broadcast_in_dim %2062, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2064 = stablehlo.multiply %2063, %127 : tensor<256x1280xf32>
-    %2065 = stablehlo.broadcast_in_dim %2064, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2066 = stablehlo.broadcast_in_dim %arg242, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2067 = stablehlo.add %2065, %2066 : tensor<256x1280xf32>
-    %2068 = stablehlo.convert %2067 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2069 = stablehlo.reshape %2068 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2070 = stablehlo.reshape %2053 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2071 = stablehlo.transpose %2070, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2072 = stablehlo.reshape %2061 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2073 = stablehlo.transpose %2072, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2074 = stablehlo.reshape %2069 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %2075 = stablehlo.transpose %2074, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2076 = stablehlo.transpose %2073, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %2077 = stablehlo.reshape %2071 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2078 = stablehlo.reshape %2076 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2079 = stablehlo.broadcast_in_dim %2078, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2080 = stablehlo.dot_general %2077, %2079, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %2081 = stablehlo.reshape %2080 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2082 = stablehlo.broadcast_in_dim %2081, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2083 = stablehlo.divide %2082, %309 : tensor<1x8x256x256xbf16>
-    %2084 = stablehlo.convert %2083 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %2085 = stablehlo.reduce(%2084 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2086 = stablehlo.reshape %2085 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2087 = stablehlo.broadcast_in_dim %2084, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2088 = stablehlo.broadcast_in_dim %2086, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2089 = stablehlo.subtract %2087, %2088 : tensor<1x8x256x256xf32>
-    %2090 = stablehlo.exponential %2089 : tensor<1x8x256x256xf32>
-    %2091 = stablehlo.reduce(%2090 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2092 = stablehlo.reshape %2091 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2093 = stablehlo.broadcast_in_dim %2090, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2094 = stablehlo.broadcast_in_dim %2092, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2095 = stablehlo.divide %2093, %2094 : tensor<1x8x256x256xf32>
-    %2096 = stablehlo.convert %2095 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %2097 = stablehlo.reshape %2096 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %2098 = stablehlo.reshape %2075 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2099 = stablehlo.broadcast_in_dim %2098, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2100 = stablehlo.dot_general %2097, %2099, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2101 = stablehlo.reshape %2100 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2102 = stablehlo.transpose %2101, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %2103 = stablehlo.reshape %2102 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %2104 = stablehlo.reshape %2103 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2105 = stablehlo.convert %2104 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2106 = stablehlo.dot_general %2105, %arg243, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2107 = stablehlo.broadcast_in_dim %2106, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2108 = stablehlo.multiply %2107, %127 : tensor<256x1280xf32>
-    %2109 = stablehlo.broadcast_in_dim %2108, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2110 = stablehlo.broadcast_in_dim %arg244, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2111 = stablehlo.add %2109, %2110 : tensor<256x1280xf32>
-    %2112 = stablehlo.convert %2111 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2113 = stablehlo.reshape %2112 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2114 = stablehlo.add %2113, %2006 : tensor<1x256x1280xbf16>
-    %2115 = stablehlo.convert %2114 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2116 = stablehlo.convert %2115 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2117 = stablehlo.reduce(%2116 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2118 = stablehlo.reshape %2117 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2119 = stablehlo.broadcast_in_dim %2118, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2120 = stablehlo.divide %2119, %142 : tensor<1x256x1xf64>
-    %2121 = stablehlo.broadcast_in_dim %2116, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2122 = stablehlo.broadcast_in_dim %2120, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2123 = stablehlo.subtract %2121, %2122 : tensor<1x256x1280xf64>
-    %2124 = stablehlo.multiply %2123, %2123 : tensor<1x256x1280xf64>
-    %2125 = stablehlo.reduce(%2124 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2126 = stablehlo.reshape %2125 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2127 = stablehlo.broadcast_in_dim %2126, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2128 = stablehlo.divide %2127, %142 : tensor<1x256x1xf64>
-    %2129 = stablehlo.convert %2128 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2130 = stablehlo.reduce(%2115 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2131 = stablehlo.reshape %2130 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2132 = stablehlo.broadcast_in_dim %2131, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2133 = stablehlo.divide %2132, %158 : tensor<1x256x1xf32>
-    %2134 = stablehlo.broadcast_in_dim %2129, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2135 = stablehlo.add %2134, %161 : tensor<1x256x1xf32>
-    %2136 = stablehlo.rsqrt %2135 : tensor<1x256x1xf32>
-    %2137 = stablehlo.broadcast_in_dim %2115, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2138 = stablehlo.broadcast_in_dim %2133, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2139 = stablehlo.subtract %2137, %2138 : tensor<1x256x1280xf32>
-    %2140 = stablehlo.broadcast_in_dim %2139, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2141 = stablehlo.broadcast_in_dim %2136, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2142 = stablehlo.multiply %2140, %2141 : tensor<1x256x1280xf32>
-    %2143 = stablehlo.convert %arg45 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2144 = stablehlo.broadcast_in_dim %2142, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2145 = stablehlo.broadcast_in_dim %2143, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2146 = stablehlo.multiply %2144, %2145 : tensor<1x256x1280xf32>
-    %2147 = stablehlo.convert %arg46 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2148 = stablehlo.broadcast_in_dim %2146, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2149 = stablehlo.broadcast_in_dim %2147, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2150 = stablehlo.add %2148, %2149 : tensor<1x256x1280xf32>
-    %2151 = stablehlo.convert %2150 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2152 = stablehlo.reshape %2151 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2153 = stablehlo.convert %2152 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2154 = stablehlo.dot_general %2153, %arg245, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2155 = stablehlo.broadcast_in_dim %2154, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2156 = stablehlo.multiply %2155, %127 : tensor<256x1280xf32>
-    %2157 = stablehlo.broadcast_in_dim %2156, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2158 = stablehlo.broadcast_in_dim %arg246, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2159 = stablehlo.add %2157, %2158 : tensor<256x1280xf32>
-    %2160 = stablehlo.convert %2159 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2161 = stablehlo.reshape %2160 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2162 = stablehlo.multiply %2161, %cst_4 : tensor<1x256x1280xbf16>
-    %2163 = stablehlo.multiply %2161, %190 : tensor<1x256x1280xbf16>
-    %2164 = stablehlo.convert %2163 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2165 = stablehlo.clamp %cst_5, %2164, %cst_6 : tensor<1x256x1280xf32>
-    %2166 = stablehlo.multiply %2165, %2165 : tensor<1x256x1280xf32>
-    %2167 = stablehlo.multiply %cst_7, %2166 : tensor<1x256x1280xf32>
-    %2168 = stablehlo.add %2167, %cst_8 : tensor<1x256x1280xf32>
-    %2169 = stablehlo.multiply %2168, %2166 : tensor<1x256x1280xf32>
-    %2170 = stablehlo.add %2169, %cst_9 : tensor<1x256x1280xf32>
-    %2171 = stablehlo.multiply %2170, %2166 : tensor<1x256x1280xf32>
-    %2172 = stablehlo.add %2171, %cst_10 : tensor<1x256x1280xf32>
-    %2173 = stablehlo.multiply %2172, %2166 : tensor<1x256x1280xf32>
-    %2174 = stablehlo.add %2173, %cst_11 : tensor<1x256x1280xf32>
-    %2175 = stablehlo.multiply %2174, %2166 : tensor<1x256x1280xf32>
-    %2176 = stablehlo.add %2175, %cst_12 : tensor<1x256x1280xf32>
-    %2177 = stablehlo.multiply %2176, %2166 : tensor<1x256x1280xf32>
-    %2178 = stablehlo.add %2177, %cst_13 : tensor<1x256x1280xf32>
-    %2179 = stablehlo.multiply %cst_14, %2166 : tensor<1x256x1280xf32>
-    %2180 = stablehlo.add %2179, %cst_15 : tensor<1x256x1280xf32>
-    %2181 = stablehlo.multiply %2180, %2166 : tensor<1x256x1280xf32>
-    %2182 = stablehlo.add %2181, %cst_16 : tensor<1x256x1280xf32>
-    %2183 = stablehlo.multiply %2182, %2166 : tensor<1x256x1280xf32>
-    %2184 = stablehlo.add %2183, %cst_17 : tensor<1x256x1280xf32>
-    %2185 = stablehlo.multiply %2184, %2166 : tensor<1x256x1280xf32>
-    %2186 = stablehlo.add %2185, %cst_18 : tensor<1x256x1280xf32>
-    %2187 = stablehlo.multiply %2165, %2178 : tensor<1x256x1280xf32>
-    %2188 = stablehlo.divide %2187, %2186 : tensor<1x256x1280xf32>
-    %2189 = stablehlo.clamp %cst_19, %2188, %cst_20 : tensor<1x256x1280xf32>
-    %2190 = stablehlo.convert %2189 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2191 = stablehlo.add %2190, %cst_2 : tensor<1x256x1280xbf16>
-    %2192 = stablehlo.multiply %2191, %2162 : tensor<1x256x1280xbf16>
-    %2193 = stablehlo.reshape %2192 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2194 = stablehlo.convert %2193 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2195 = stablehlo.dot_general %2194, %arg247, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2196 = stablehlo.broadcast_in_dim %2195, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2197 = stablehlo.multiply %2196, %127 : tensor<256x1280xf32>
-    %2198 = stablehlo.broadcast_in_dim %2197, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2199 = stablehlo.broadcast_in_dim %arg248, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2200 = stablehlo.add %2198, %2199 : tensor<256x1280xf32>
-    %2201 = stablehlo.convert %2200 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2202 = stablehlo.reshape %2201 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2203 = stablehlo.add %2202, %2114 : tensor<1x256x1280xbf16>
-    %2204 = stablehlo.convert %2203 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2205 = stablehlo.convert %2204 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2206 = stablehlo.reduce(%2205 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2207 = stablehlo.reshape %2206 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2208 = stablehlo.broadcast_in_dim %2207, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2209 = stablehlo.divide %2208, %142 : tensor<1x256x1xf64>
-    %2210 = stablehlo.broadcast_in_dim %2205, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2211 = stablehlo.broadcast_in_dim %2209, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2212 = stablehlo.subtract %2210, %2211 : tensor<1x256x1280xf64>
-    %2213 = stablehlo.multiply %2212, %2212 : tensor<1x256x1280xf64>
-    %2214 = stablehlo.reduce(%2213 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2215 = stablehlo.reshape %2214 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2216 = stablehlo.broadcast_in_dim %2215, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2217 = stablehlo.divide %2216, %142 : tensor<1x256x1xf64>
-    %2218 = stablehlo.convert %2217 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2219 = stablehlo.reduce(%2204 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2220 = stablehlo.reshape %2219 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2221 = stablehlo.broadcast_in_dim %2220, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2222 = stablehlo.divide %2221, %158 : tensor<1x256x1xf32>
-    %2223 = stablehlo.broadcast_in_dim %2218, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2224 = stablehlo.add %2223, %161 : tensor<1x256x1xf32>
-    %2225 = stablehlo.rsqrt %2224 : tensor<1x256x1xf32>
-    %2226 = stablehlo.broadcast_in_dim %2204, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2227 = stablehlo.broadcast_in_dim %2222, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2228 = stablehlo.subtract %2226, %2227 : tensor<1x256x1280xf32>
-    %2229 = stablehlo.broadcast_in_dim %2228, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2230 = stablehlo.broadcast_in_dim %2225, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2231 = stablehlo.multiply %2229, %2230 : tensor<1x256x1280xf32>
-    %2232 = stablehlo.convert %arg47 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2233 = stablehlo.broadcast_in_dim %2231, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2234 = stablehlo.broadcast_in_dim %2232, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2235 = stablehlo.multiply %2233, %2234 : tensor<1x256x1280xf32>
-    %2236 = stablehlo.convert %arg48 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2237 = stablehlo.broadcast_in_dim %2235, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2238 = stablehlo.broadcast_in_dim %2236, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2239 = stablehlo.add %2237, %2238 : tensor<1x256x1280xf32>
-    %2240 = stablehlo.convert %2239 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2241 = stablehlo.reshape %2240 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2242 = stablehlo.convert %2241 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2243 = stablehlo.dot_general %2242, %arg249, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2244 = stablehlo.broadcast_in_dim %2243, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2245 = stablehlo.multiply %2244, %273 : tensor<256x256xf32>
-    %2246 = stablehlo.broadcast_in_dim %2245, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2247 = stablehlo.broadcast_in_dim %arg250, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2248 = stablehlo.add %2246, %2247 : tensor<256x256xf32>
-    %2249 = stablehlo.convert %2248 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2250 = stablehlo.reshape %2249 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2251 = stablehlo.dot_general %2242, %arg251, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2252 = stablehlo.broadcast_in_dim %2251, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2253 = stablehlo.multiply %2252, %273 : tensor<256x256xf32>
-    %2254 = stablehlo.broadcast_in_dim %2253, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2255 = stablehlo.broadcast_in_dim %arg252, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2256 = stablehlo.add %2254, %2255 : tensor<256x256xf32>
-    %2257 = stablehlo.convert %2256 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2258 = stablehlo.reshape %2257 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2259 = stablehlo.dot_general %2242, %arg253, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2260 = stablehlo.broadcast_in_dim %2259, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2261 = stablehlo.multiply %2260, %127 : tensor<256x1280xf32>
-    %2262 = stablehlo.broadcast_in_dim %2261, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2263 = stablehlo.broadcast_in_dim %arg254, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2264 = stablehlo.add %2262, %2263 : tensor<256x1280xf32>
-    %2265 = stablehlo.convert %2264 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2266 = stablehlo.reshape %2265 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2267 = stablehlo.reshape %2250 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2268 = stablehlo.transpose %2267, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2269 = stablehlo.reshape %2258 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2270 = stablehlo.transpose %2269, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2271 = stablehlo.reshape %2266 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %2272 = stablehlo.transpose %2271, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2273 = stablehlo.transpose %2270, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %2274 = stablehlo.reshape %2268 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2275 = stablehlo.reshape %2273 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2276 = stablehlo.broadcast_in_dim %2275, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2277 = stablehlo.dot_general %2274, %2276, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %2278 = stablehlo.reshape %2277 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2279 = stablehlo.broadcast_in_dim %2278, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2280 = stablehlo.divide %2279, %309 : tensor<1x8x256x256xbf16>
-    %2281 = stablehlo.convert %2280 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %2282 = stablehlo.reduce(%2281 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2283 = stablehlo.reshape %2282 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2284 = stablehlo.broadcast_in_dim %2281, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2285 = stablehlo.broadcast_in_dim %2283, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2286 = stablehlo.subtract %2284, %2285 : tensor<1x8x256x256xf32>
-    %2287 = stablehlo.exponential %2286 : tensor<1x8x256x256xf32>
-    %2288 = stablehlo.reduce(%2287 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2289 = stablehlo.reshape %2288 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2290 = stablehlo.broadcast_in_dim %2287, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2291 = stablehlo.broadcast_in_dim %2289, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2292 = stablehlo.divide %2290, %2291 : tensor<1x8x256x256xf32>
-    %2293 = stablehlo.convert %2292 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %2294 = stablehlo.reshape %2293 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %2295 = stablehlo.reshape %2272 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2296 = stablehlo.broadcast_in_dim %2295, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2297 = stablehlo.dot_general %2294, %2296, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2298 = stablehlo.reshape %2297 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2299 = stablehlo.transpose %2298, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %2300 = stablehlo.reshape %2299 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %2301 = stablehlo.reshape %2300 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2302 = stablehlo.convert %2301 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2303 = stablehlo.dot_general %2302, %arg255, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2304 = stablehlo.broadcast_in_dim %2303, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2305 = stablehlo.multiply %2304, %127 : tensor<256x1280xf32>
-    %2306 = stablehlo.broadcast_in_dim %2305, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2307 = stablehlo.broadcast_in_dim %arg256, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2308 = stablehlo.add %2306, %2307 : tensor<256x1280xf32>
-    %2309 = stablehlo.convert %2308 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2310 = stablehlo.reshape %2309 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2311 = stablehlo.add %2310, %2203 : tensor<1x256x1280xbf16>
-    %2312 = stablehlo.convert %2311 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2313 = stablehlo.convert %2312 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2314 = stablehlo.reduce(%2313 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2315 = stablehlo.reshape %2314 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2316 = stablehlo.broadcast_in_dim %2315, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2317 = stablehlo.divide %2316, %142 : tensor<1x256x1xf64>
-    %2318 = stablehlo.broadcast_in_dim %2313, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2319 = stablehlo.broadcast_in_dim %2317, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2320 = stablehlo.subtract %2318, %2319 : tensor<1x256x1280xf64>
-    %2321 = stablehlo.multiply %2320, %2320 : tensor<1x256x1280xf64>
-    %2322 = stablehlo.reduce(%2321 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2323 = stablehlo.reshape %2322 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2324 = stablehlo.broadcast_in_dim %2323, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2325 = stablehlo.divide %2324, %142 : tensor<1x256x1xf64>
-    %2326 = stablehlo.convert %2325 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2327 = stablehlo.reduce(%2312 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2328 = stablehlo.reshape %2327 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2329 = stablehlo.broadcast_in_dim %2328, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2330 = stablehlo.divide %2329, %158 : tensor<1x256x1xf32>
-    %2331 = stablehlo.broadcast_in_dim %2326, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2332 = stablehlo.add %2331, %161 : tensor<1x256x1xf32>
-    %2333 = stablehlo.rsqrt %2332 : tensor<1x256x1xf32>
-    %2334 = stablehlo.broadcast_in_dim %2312, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2335 = stablehlo.broadcast_in_dim %2330, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2336 = stablehlo.subtract %2334, %2335 : tensor<1x256x1280xf32>
-    %2337 = stablehlo.broadcast_in_dim %2336, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2338 = stablehlo.broadcast_in_dim %2333, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2339 = stablehlo.multiply %2337, %2338 : tensor<1x256x1280xf32>
-    %2340 = stablehlo.convert %arg49 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2341 = stablehlo.broadcast_in_dim %2339, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2342 = stablehlo.broadcast_in_dim %2340, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2343 = stablehlo.multiply %2341, %2342 : tensor<1x256x1280xf32>
-    %2344 = stablehlo.convert %arg50 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2345 = stablehlo.broadcast_in_dim %2343, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2346 = stablehlo.broadcast_in_dim %2344, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2347 = stablehlo.add %2345, %2346 : tensor<1x256x1280xf32>
-    %2348 = stablehlo.convert %2347 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2349 = stablehlo.reshape %2348 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2350 = stablehlo.convert %2349 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2351 = stablehlo.dot_general %2350, %arg257, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2352 = stablehlo.broadcast_in_dim %2351, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2353 = stablehlo.multiply %2352, %127 : tensor<256x1280xf32>
-    %2354 = stablehlo.broadcast_in_dim %2353, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2355 = stablehlo.broadcast_in_dim %arg258, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2356 = stablehlo.add %2354, %2355 : tensor<256x1280xf32>
-    %2357 = stablehlo.convert %2356 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2358 = stablehlo.reshape %2357 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2359 = stablehlo.multiply %2358, %cst_4 : tensor<1x256x1280xbf16>
-    %2360 = stablehlo.multiply %2358, %190 : tensor<1x256x1280xbf16>
-    %2361 = stablehlo.convert %2360 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2362 = stablehlo.clamp %cst_5, %2361, %cst_6 : tensor<1x256x1280xf32>
-    %2363 = stablehlo.multiply %2362, %2362 : tensor<1x256x1280xf32>
-    %2364 = stablehlo.multiply %cst_7, %2363 : tensor<1x256x1280xf32>
-    %2365 = stablehlo.add %2364, %cst_8 : tensor<1x256x1280xf32>
-    %2366 = stablehlo.multiply %2365, %2363 : tensor<1x256x1280xf32>
-    %2367 = stablehlo.add %2366, %cst_9 : tensor<1x256x1280xf32>
-    %2368 = stablehlo.multiply %2367, %2363 : tensor<1x256x1280xf32>
-    %2369 = stablehlo.add %2368, %cst_10 : tensor<1x256x1280xf32>
-    %2370 = stablehlo.multiply %2369, %2363 : tensor<1x256x1280xf32>
-    %2371 = stablehlo.add %2370, %cst_11 : tensor<1x256x1280xf32>
-    %2372 = stablehlo.multiply %2371, %2363 : tensor<1x256x1280xf32>
-    %2373 = stablehlo.add %2372, %cst_12 : tensor<1x256x1280xf32>
-    %2374 = stablehlo.multiply %2373, %2363 : tensor<1x256x1280xf32>
-    %2375 = stablehlo.add %2374, %cst_13 : tensor<1x256x1280xf32>
-    %2376 = stablehlo.multiply %cst_14, %2363 : tensor<1x256x1280xf32>
-    %2377 = stablehlo.add %2376, %cst_15 : tensor<1x256x1280xf32>
-    %2378 = stablehlo.multiply %2377, %2363 : tensor<1x256x1280xf32>
-    %2379 = stablehlo.add %2378, %cst_16 : tensor<1x256x1280xf32>
-    %2380 = stablehlo.multiply %2379, %2363 : tensor<1x256x1280xf32>
-    %2381 = stablehlo.add %2380, %cst_17 : tensor<1x256x1280xf32>
-    %2382 = stablehlo.multiply %2381, %2363 : tensor<1x256x1280xf32>
-    %2383 = stablehlo.add %2382, %cst_18 : tensor<1x256x1280xf32>
-    %2384 = stablehlo.multiply %2362, %2375 : tensor<1x256x1280xf32>
-    %2385 = stablehlo.divide %2384, %2383 : tensor<1x256x1280xf32>
-    %2386 = stablehlo.clamp %cst_19, %2385, %cst_20 : tensor<1x256x1280xf32>
-    %2387 = stablehlo.convert %2386 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2388 = stablehlo.add %2387, %cst_2 : tensor<1x256x1280xbf16>
-    %2389 = stablehlo.multiply %2388, %2359 : tensor<1x256x1280xbf16>
-    %2390 = stablehlo.reshape %2389 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2391 = stablehlo.convert %2390 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2392 = stablehlo.dot_general %2391, %arg259, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2393 = stablehlo.broadcast_in_dim %2392, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2394 = stablehlo.multiply %2393, %127 : tensor<256x1280xf32>
-    %2395 = stablehlo.broadcast_in_dim %2394, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2396 = stablehlo.broadcast_in_dim %arg260, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2397 = stablehlo.add %2395, %2396 : tensor<256x1280xf32>
-    %2398 = stablehlo.convert %2397 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2399 = stablehlo.reshape %2398 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2400 = stablehlo.add %2399, %2311 : tensor<1x256x1280xbf16>
-    %2401 = stablehlo.convert %2400 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2402 = stablehlo.convert %2401 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2403 = stablehlo.reduce(%2402 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2404 = stablehlo.reshape %2403 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2405 = stablehlo.broadcast_in_dim %2404, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2406 = stablehlo.divide %2405, %142 : tensor<1x256x1xf64>
-    %2407 = stablehlo.broadcast_in_dim %2402, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2408 = stablehlo.broadcast_in_dim %2406, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2409 = stablehlo.subtract %2407, %2408 : tensor<1x256x1280xf64>
-    %2410 = stablehlo.multiply %2409, %2409 : tensor<1x256x1280xf64>
-    %2411 = stablehlo.reduce(%2410 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2412 = stablehlo.reshape %2411 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2413 = stablehlo.broadcast_in_dim %2412, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2414 = stablehlo.divide %2413, %142 : tensor<1x256x1xf64>
-    %2415 = stablehlo.convert %2414 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2416 = stablehlo.reduce(%2401 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2417 = stablehlo.reshape %2416 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2418 = stablehlo.broadcast_in_dim %2417, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2419 = stablehlo.divide %2418, %158 : tensor<1x256x1xf32>
-    %2420 = stablehlo.broadcast_in_dim %2415, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2421 = stablehlo.add %2420, %161 : tensor<1x256x1xf32>
-    %2422 = stablehlo.rsqrt %2421 : tensor<1x256x1xf32>
-    %2423 = stablehlo.broadcast_in_dim %2401, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2424 = stablehlo.broadcast_in_dim %2419, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2425 = stablehlo.subtract %2423, %2424 : tensor<1x256x1280xf32>
-    %2426 = stablehlo.broadcast_in_dim %2425, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2427 = stablehlo.broadcast_in_dim %2422, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2428 = stablehlo.multiply %2426, %2427 : tensor<1x256x1280xf32>
-    %2429 = stablehlo.convert %arg51 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2430 = stablehlo.broadcast_in_dim %2428, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2431 = stablehlo.broadcast_in_dim %2429, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2432 = stablehlo.multiply %2430, %2431 : tensor<1x256x1280xf32>
-    %2433 = stablehlo.convert %arg52 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2434 = stablehlo.broadcast_in_dim %2432, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2435 = stablehlo.broadcast_in_dim %2433, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2436 = stablehlo.add %2434, %2435 : tensor<1x256x1280xf32>
-    %2437 = stablehlo.convert %2436 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2438 = stablehlo.reshape %2437 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2439 = stablehlo.convert %2438 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2440 = stablehlo.dot_general %2439, %arg261, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2441 = stablehlo.broadcast_in_dim %2440, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2442 = stablehlo.multiply %2441, %273 : tensor<256x256xf32>
-    %2443 = stablehlo.broadcast_in_dim %2442, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2444 = stablehlo.broadcast_in_dim %arg262, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2445 = stablehlo.add %2443, %2444 : tensor<256x256xf32>
-    %2446 = stablehlo.convert %2445 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2447 = stablehlo.reshape %2446 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2448 = stablehlo.dot_general %2439, %arg263, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2449 = stablehlo.broadcast_in_dim %2448, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2450 = stablehlo.multiply %2449, %273 : tensor<256x256xf32>
-    %2451 = stablehlo.broadcast_in_dim %2450, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2452 = stablehlo.broadcast_in_dim %arg264, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2453 = stablehlo.add %2451, %2452 : tensor<256x256xf32>
-    %2454 = stablehlo.convert %2453 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2455 = stablehlo.reshape %2454 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2456 = stablehlo.dot_general %2439, %arg265, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2457 = stablehlo.broadcast_in_dim %2456, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2458 = stablehlo.multiply %2457, %127 : tensor<256x1280xf32>
-    %2459 = stablehlo.broadcast_in_dim %2458, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2460 = stablehlo.broadcast_in_dim %arg266, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2461 = stablehlo.add %2459, %2460 : tensor<256x1280xf32>
-    %2462 = stablehlo.convert %2461 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2463 = stablehlo.reshape %2462 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2464 = stablehlo.reshape %2447 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2465 = stablehlo.transpose %2464, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2466 = stablehlo.reshape %2455 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2467 = stablehlo.transpose %2466, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2468 = stablehlo.reshape %2463 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %2469 = stablehlo.transpose %2468, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2470 = stablehlo.transpose %2467, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %2471 = stablehlo.reshape %2465 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2472 = stablehlo.reshape %2470 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2473 = stablehlo.broadcast_in_dim %2472, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2474 = stablehlo.dot_general %2471, %2473, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %2475 = stablehlo.reshape %2474 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2476 = stablehlo.broadcast_in_dim %2475, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2477 = stablehlo.divide %2476, %309 : tensor<1x8x256x256xbf16>
-    %2478 = stablehlo.convert %2477 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %2479 = stablehlo.reduce(%2478 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2480 = stablehlo.reshape %2479 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2481 = stablehlo.broadcast_in_dim %2478, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2482 = stablehlo.broadcast_in_dim %2480, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2483 = stablehlo.subtract %2481, %2482 : tensor<1x8x256x256xf32>
-    %2484 = stablehlo.exponential %2483 : tensor<1x8x256x256xf32>
-    %2485 = stablehlo.reduce(%2484 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2486 = stablehlo.reshape %2485 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2487 = stablehlo.broadcast_in_dim %2484, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2488 = stablehlo.broadcast_in_dim %2486, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2489 = stablehlo.divide %2487, %2488 : tensor<1x8x256x256xf32>
-    %2490 = stablehlo.convert %2489 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %2491 = stablehlo.reshape %2490 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %2492 = stablehlo.reshape %2469 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2493 = stablehlo.broadcast_in_dim %2492, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2494 = stablehlo.dot_general %2491, %2493, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2495 = stablehlo.reshape %2494 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2496 = stablehlo.transpose %2495, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %2497 = stablehlo.reshape %2496 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %2498 = stablehlo.reshape %2497 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2499 = stablehlo.convert %2498 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2500 = stablehlo.dot_general %2499, %arg267, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2501 = stablehlo.broadcast_in_dim %2500, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2502 = stablehlo.multiply %2501, %127 : tensor<256x1280xf32>
-    %2503 = stablehlo.broadcast_in_dim %2502, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2504 = stablehlo.broadcast_in_dim %arg268, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2505 = stablehlo.add %2503, %2504 : tensor<256x1280xf32>
-    %2506 = stablehlo.convert %2505 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2507 = stablehlo.reshape %2506 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2508 = stablehlo.add %2507, %2400 : tensor<1x256x1280xbf16>
-    %2509 = stablehlo.convert %2508 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2510 = stablehlo.convert %2509 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2511 = stablehlo.reduce(%2510 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2512 = stablehlo.reshape %2511 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2513 = stablehlo.broadcast_in_dim %2512, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2514 = stablehlo.divide %2513, %142 : tensor<1x256x1xf64>
-    %2515 = stablehlo.broadcast_in_dim %2510, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2516 = stablehlo.broadcast_in_dim %2514, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2517 = stablehlo.subtract %2515, %2516 : tensor<1x256x1280xf64>
-    %2518 = stablehlo.multiply %2517, %2517 : tensor<1x256x1280xf64>
-    %2519 = stablehlo.reduce(%2518 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2520 = stablehlo.reshape %2519 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2521 = stablehlo.broadcast_in_dim %2520, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2522 = stablehlo.divide %2521, %142 : tensor<1x256x1xf64>
-    %2523 = stablehlo.convert %2522 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2524 = stablehlo.reduce(%2509 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2525 = stablehlo.reshape %2524 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2526 = stablehlo.broadcast_in_dim %2525, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2527 = stablehlo.divide %2526, %158 : tensor<1x256x1xf32>
-    %2528 = stablehlo.broadcast_in_dim %2523, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2529 = stablehlo.add %2528, %161 : tensor<1x256x1xf32>
-    %2530 = stablehlo.rsqrt %2529 : tensor<1x256x1xf32>
-    %2531 = stablehlo.broadcast_in_dim %2509, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2532 = stablehlo.broadcast_in_dim %2527, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2533 = stablehlo.subtract %2531, %2532 : tensor<1x256x1280xf32>
-    %2534 = stablehlo.broadcast_in_dim %2533, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2535 = stablehlo.broadcast_in_dim %2530, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2536 = stablehlo.multiply %2534, %2535 : tensor<1x256x1280xf32>
-    %2537 = stablehlo.convert %arg53 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2538 = stablehlo.broadcast_in_dim %2536, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2539 = stablehlo.broadcast_in_dim %2537, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2540 = stablehlo.multiply %2538, %2539 : tensor<1x256x1280xf32>
-    %2541 = stablehlo.convert %arg54 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2542 = stablehlo.broadcast_in_dim %2540, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2543 = stablehlo.broadcast_in_dim %2541, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2544 = stablehlo.add %2542, %2543 : tensor<1x256x1280xf32>
-    %2545 = stablehlo.convert %2544 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2546 = stablehlo.reshape %2545 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2547 = stablehlo.convert %2546 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2548 = stablehlo.dot_general %2547, %arg269, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2549 = stablehlo.broadcast_in_dim %2548, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2550 = stablehlo.multiply %2549, %127 : tensor<256x1280xf32>
-    %2551 = stablehlo.broadcast_in_dim %2550, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2552 = stablehlo.broadcast_in_dim %arg270, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2553 = stablehlo.add %2551, %2552 : tensor<256x1280xf32>
-    %2554 = stablehlo.convert %2553 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2555 = stablehlo.reshape %2554 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2556 = stablehlo.multiply %2555, %cst_4 : tensor<1x256x1280xbf16>
-    %2557 = stablehlo.multiply %2555, %190 : tensor<1x256x1280xbf16>
-    %2558 = stablehlo.convert %2557 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2559 = stablehlo.clamp %cst_5, %2558, %cst_6 : tensor<1x256x1280xf32>
-    %2560 = stablehlo.multiply %2559, %2559 : tensor<1x256x1280xf32>
-    %2561 = stablehlo.multiply %cst_7, %2560 : tensor<1x256x1280xf32>
-    %2562 = stablehlo.add %2561, %cst_8 : tensor<1x256x1280xf32>
-    %2563 = stablehlo.multiply %2562, %2560 : tensor<1x256x1280xf32>
-    %2564 = stablehlo.add %2563, %cst_9 : tensor<1x256x1280xf32>
-    %2565 = stablehlo.multiply %2564, %2560 : tensor<1x256x1280xf32>
-    %2566 = stablehlo.add %2565, %cst_10 : tensor<1x256x1280xf32>
-    %2567 = stablehlo.multiply %2566, %2560 : tensor<1x256x1280xf32>
-    %2568 = stablehlo.add %2567, %cst_11 : tensor<1x256x1280xf32>
-    %2569 = stablehlo.multiply %2568, %2560 : tensor<1x256x1280xf32>
-    %2570 = stablehlo.add %2569, %cst_12 : tensor<1x256x1280xf32>
-    %2571 = stablehlo.multiply %2570, %2560 : tensor<1x256x1280xf32>
-    %2572 = stablehlo.add %2571, %cst_13 : tensor<1x256x1280xf32>
-    %2573 = stablehlo.multiply %cst_14, %2560 : tensor<1x256x1280xf32>
-    %2574 = stablehlo.add %2573, %cst_15 : tensor<1x256x1280xf32>
-    %2575 = stablehlo.multiply %2574, %2560 : tensor<1x256x1280xf32>
-    %2576 = stablehlo.add %2575, %cst_16 : tensor<1x256x1280xf32>
-    %2577 = stablehlo.multiply %2576, %2560 : tensor<1x256x1280xf32>
-    %2578 = stablehlo.add %2577, %cst_17 : tensor<1x256x1280xf32>
-    %2579 = stablehlo.multiply %2578, %2560 : tensor<1x256x1280xf32>
-    %2580 = stablehlo.add %2579, %cst_18 : tensor<1x256x1280xf32>
-    %2581 = stablehlo.multiply %2559, %2572 : tensor<1x256x1280xf32>
-    %2582 = stablehlo.divide %2581, %2580 : tensor<1x256x1280xf32>
-    %2583 = stablehlo.clamp %cst_19, %2582, %cst_20 : tensor<1x256x1280xf32>
-    %2584 = stablehlo.convert %2583 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2585 = stablehlo.add %2584, %cst_2 : tensor<1x256x1280xbf16>
-    %2586 = stablehlo.multiply %2585, %2556 : tensor<1x256x1280xbf16>
-    %2587 = stablehlo.reshape %2586 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2588 = stablehlo.convert %2587 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2589 = stablehlo.dot_general %2588, %arg271, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2590 = stablehlo.broadcast_in_dim %2589, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2591 = stablehlo.multiply %2590, %127 : tensor<256x1280xf32>
-    %2592 = stablehlo.broadcast_in_dim %2591, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2593 = stablehlo.broadcast_in_dim %arg272, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2594 = stablehlo.add %2592, %2593 : tensor<256x1280xf32>
-    %2595 = stablehlo.convert %2594 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2596 = stablehlo.reshape %2595 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2597 = stablehlo.add %2596, %2508 : tensor<1x256x1280xbf16>
-    %2598 = stablehlo.convert %2597 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2599 = stablehlo.convert %2598 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2600 = stablehlo.reduce(%2599 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2601 = stablehlo.reshape %2600 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2602 = stablehlo.broadcast_in_dim %2601, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2603 = stablehlo.divide %2602, %142 : tensor<1x256x1xf64>
-    %2604 = stablehlo.broadcast_in_dim %2599, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2605 = stablehlo.broadcast_in_dim %2603, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2606 = stablehlo.subtract %2604, %2605 : tensor<1x256x1280xf64>
-    %2607 = stablehlo.multiply %2606, %2606 : tensor<1x256x1280xf64>
-    %2608 = stablehlo.reduce(%2607 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2609 = stablehlo.reshape %2608 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2610 = stablehlo.broadcast_in_dim %2609, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2611 = stablehlo.divide %2610, %142 : tensor<1x256x1xf64>
-    %2612 = stablehlo.convert %2611 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2613 = stablehlo.reduce(%2598 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2614 = stablehlo.reshape %2613 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2615 = stablehlo.broadcast_in_dim %2614, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2616 = stablehlo.divide %2615, %158 : tensor<1x256x1xf32>
-    %2617 = stablehlo.broadcast_in_dim %2612, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2618 = stablehlo.add %2617, %161 : tensor<1x256x1xf32>
-    %2619 = stablehlo.rsqrt %2618 : tensor<1x256x1xf32>
-    %2620 = stablehlo.broadcast_in_dim %2598, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2621 = stablehlo.broadcast_in_dim %2616, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2622 = stablehlo.subtract %2620, %2621 : tensor<1x256x1280xf32>
-    %2623 = stablehlo.broadcast_in_dim %2622, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2624 = stablehlo.broadcast_in_dim %2619, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2625 = stablehlo.multiply %2623, %2624 : tensor<1x256x1280xf32>
-    %2626 = stablehlo.convert %arg55 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2627 = stablehlo.broadcast_in_dim %2625, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2628 = stablehlo.broadcast_in_dim %2626, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2629 = stablehlo.multiply %2627, %2628 : tensor<1x256x1280xf32>
-    %2630 = stablehlo.convert %arg56 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2631 = stablehlo.broadcast_in_dim %2629, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2632 = stablehlo.broadcast_in_dim %2630, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2633 = stablehlo.add %2631, %2632 : tensor<1x256x1280xf32>
-    %2634 = stablehlo.convert %2633 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2635 = stablehlo.reshape %2634 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2636 = stablehlo.convert %2635 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2637 = stablehlo.dot_general %2636, %arg273, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2638 = stablehlo.broadcast_in_dim %2637, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2639 = stablehlo.multiply %2638, %273 : tensor<256x256xf32>
-    %2640 = stablehlo.broadcast_in_dim %2639, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2641 = stablehlo.broadcast_in_dim %arg274, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2642 = stablehlo.add %2640, %2641 : tensor<256x256xf32>
-    %2643 = stablehlo.convert %2642 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2644 = stablehlo.reshape %2643 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2645 = stablehlo.dot_general %2636, %arg275, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2646 = stablehlo.broadcast_in_dim %2645, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2647 = stablehlo.multiply %2646, %273 : tensor<256x256xf32>
-    %2648 = stablehlo.broadcast_in_dim %2647, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2649 = stablehlo.broadcast_in_dim %arg276, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2650 = stablehlo.add %2648, %2649 : tensor<256x256xf32>
-    %2651 = stablehlo.convert %2650 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2652 = stablehlo.reshape %2651 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2653 = stablehlo.dot_general %2636, %arg277, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2654 = stablehlo.broadcast_in_dim %2653, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2655 = stablehlo.multiply %2654, %127 : tensor<256x1280xf32>
-    %2656 = stablehlo.broadcast_in_dim %2655, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2657 = stablehlo.broadcast_in_dim %arg278, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2658 = stablehlo.add %2656, %2657 : tensor<256x1280xf32>
-    %2659 = stablehlo.convert %2658 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2660 = stablehlo.reshape %2659 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2661 = stablehlo.reshape %2644 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2662 = stablehlo.transpose %2661, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2663 = stablehlo.reshape %2652 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2664 = stablehlo.transpose %2663, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2665 = stablehlo.reshape %2660 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %2666 = stablehlo.transpose %2665, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2667 = stablehlo.transpose %2664, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %2668 = stablehlo.reshape %2662 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2669 = stablehlo.reshape %2667 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2670 = stablehlo.broadcast_in_dim %2669, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2671 = stablehlo.dot_general %2668, %2670, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %2672 = stablehlo.reshape %2671 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2673 = stablehlo.broadcast_in_dim %2672, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2674 = stablehlo.divide %2673, %309 : tensor<1x8x256x256xbf16>
-    %2675 = stablehlo.convert %2674 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %2676 = stablehlo.reduce(%2675 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2677 = stablehlo.reshape %2676 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2678 = stablehlo.broadcast_in_dim %2675, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2679 = stablehlo.broadcast_in_dim %2677, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2680 = stablehlo.subtract %2678, %2679 : tensor<1x8x256x256xf32>
-    %2681 = stablehlo.exponential %2680 : tensor<1x8x256x256xf32>
-    %2682 = stablehlo.reduce(%2681 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2683 = stablehlo.reshape %2682 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2684 = stablehlo.broadcast_in_dim %2681, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2685 = stablehlo.broadcast_in_dim %2683, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2686 = stablehlo.divide %2684, %2685 : tensor<1x8x256x256xf32>
-    %2687 = stablehlo.convert %2686 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %2688 = stablehlo.reshape %2687 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %2689 = stablehlo.reshape %2666 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2690 = stablehlo.broadcast_in_dim %2689, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2691 = stablehlo.dot_general %2688, %2690, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2692 = stablehlo.reshape %2691 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2693 = stablehlo.transpose %2692, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %2694 = stablehlo.reshape %2693 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %2695 = stablehlo.reshape %2694 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2696 = stablehlo.convert %2695 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2697 = stablehlo.dot_general %2696, %arg279, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2698 = stablehlo.broadcast_in_dim %2697, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2699 = stablehlo.multiply %2698, %127 : tensor<256x1280xf32>
-    %2700 = stablehlo.broadcast_in_dim %2699, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2701 = stablehlo.broadcast_in_dim %arg280, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2702 = stablehlo.add %2700, %2701 : tensor<256x1280xf32>
-    %2703 = stablehlo.convert %2702 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2704 = stablehlo.reshape %2703 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2705 = stablehlo.add %2704, %2597 : tensor<1x256x1280xbf16>
-    %2706 = stablehlo.convert %2705 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2707 = stablehlo.convert %2706 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2708 = stablehlo.reduce(%2707 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2709 = stablehlo.reshape %2708 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2710 = stablehlo.broadcast_in_dim %2709, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2711 = stablehlo.divide %2710, %142 : tensor<1x256x1xf64>
-    %2712 = stablehlo.broadcast_in_dim %2707, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2713 = stablehlo.broadcast_in_dim %2711, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2714 = stablehlo.subtract %2712, %2713 : tensor<1x256x1280xf64>
-    %2715 = stablehlo.multiply %2714, %2714 : tensor<1x256x1280xf64>
-    %2716 = stablehlo.reduce(%2715 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2717 = stablehlo.reshape %2716 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2718 = stablehlo.broadcast_in_dim %2717, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2719 = stablehlo.divide %2718, %142 : tensor<1x256x1xf64>
-    %2720 = stablehlo.convert %2719 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2721 = stablehlo.reduce(%2706 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2722 = stablehlo.reshape %2721 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2723 = stablehlo.broadcast_in_dim %2722, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2724 = stablehlo.divide %2723, %158 : tensor<1x256x1xf32>
-    %2725 = stablehlo.broadcast_in_dim %2720, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2726 = stablehlo.add %2725, %161 : tensor<1x256x1xf32>
-    %2727 = stablehlo.rsqrt %2726 : tensor<1x256x1xf32>
-    %2728 = stablehlo.broadcast_in_dim %2706, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2729 = stablehlo.broadcast_in_dim %2724, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2730 = stablehlo.subtract %2728, %2729 : tensor<1x256x1280xf32>
-    %2731 = stablehlo.broadcast_in_dim %2730, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2732 = stablehlo.broadcast_in_dim %2727, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2733 = stablehlo.multiply %2731, %2732 : tensor<1x256x1280xf32>
-    %2734 = stablehlo.convert %arg57 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2735 = stablehlo.broadcast_in_dim %2733, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2736 = stablehlo.broadcast_in_dim %2734, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2737 = stablehlo.multiply %2735, %2736 : tensor<1x256x1280xf32>
-    %2738 = stablehlo.convert %arg58 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2739 = stablehlo.broadcast_in_dim %2737, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2740 = stablehlo.broadcast_in_dim %2738, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2741 = stablehlo.add %2739, %2740 : tensor<1x256x1280xf32>
-    %2742 = stablehlo.convert %2741 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2743 = stablehlo.reshape %2742 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2744 = stablehlo.convert %2743 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2745 = stablehlo.dot_general %2744, %arg281, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2746 = stablehlo.broadcast_in_dim %2745, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2747 = stablehlo.multiply %2746, %127 : tensor<256x1280xf32>
-    %2748 = stablehlo.broadcast_in_dim %2747, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2749 = stablehlo.broadcast_in_dim %arg282, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2750 = stablehlo.add %2748, %2749 : tensor<256x1280xf32>
-    %2751 = stablehlo.convert %2750 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2752 = stablehlo.reshape %2751 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2753 = stablehlo.multiply %2752, %cst_4 : tensor<1x256x1280xbf16>
-    %2754 = stablehlo.multiply %2752, %190 : tensor<1x256x1280xbf16>
-    %2755 = stablehlo.convert %2754 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2756 = stablehlo.clamp %cst_5, %2755, %cst_6 : tensor<1x256x1280xf32>
-    %2757 = stablehlo.multiply %2756, %2756 : tensor<1x256x1280xf32>
-    %2758 = stablehlo.multiply %cst_7, %2757 : tensor<1x256x1280xf32>
-    %2759 = stablehlo.add %2758, %cst_8 : tensor<1x256x1280xf32>
-    %2760 = stablehlo.multiply %2759, %2757 : tensor<1x256x1280xf32>
-    %2761 = stablehlo.add %2760, %cst_9 : tensor<1x256x1280xf32>
-    %2762 = stablehlo.multiply %2761, %2757 : tensor<1x256x1280xf32>
-    %2763 = stablehlo.add %2762, %cst_10 : tensor<1x256x1280xf32>
-    %2764 = stablehlo.multiply %2763, %2757 : tensor<1x256x1280xf32>
-    %2765 = stablehlo.add %2764, %cst_11 : tensor<1x256x1280xf32>
-    %2766 = stablehlo.multiply %2765, %2757 : tensor<1x256x1280xf32>
-    %2767 = stablehlo.add %2766, %cst_12 : tensor<1x256x1280xf32>
-    %2768 = stablehlo.multiply %2767, %2757 : tensor<1x256x1280xf32>
-    %2769 = stablehlo.add %2768, %cst_13 : tensor<1x256x1280xf32>
-    %2770 = stablehlo.multiply %cst_14, %2757 : tensor<1x256x1280xf32>
-    %2771 = stablehlo.add %2770, %cst_15 : tensor<1x256x1280xf32>
-    %2772 = stablehlo.multiply %2771, %2757 : tensor<1x256x1280xf32>
-    %2773 = stablehlo.add %2772, %cst_16 : tensor<1x256x1280xf32>
-    %2774 = stablehlo.multiply %2773, %2757 : tensor<1x256x1280xf32>
-    %2775 = stablehlo.add %2774, %cst_17 : tensor<1x256x1280xf32>
-    %2776 = stablehlo.multiply %2775, %2757 : tensor<1x256x1280xf32>
-    %2777 = stablehlo.add %2776, %cst_18 : tensor<1x256x1280xf32>
-    %2778 = stablehlo.multiply %2756, %2769 : tensor<1x256x1280xf32>
-    %2779 = stablehlo.divide %2778, %2777 : tensor<1x256x1280xf32>
-    %2780 = stablehlo.clamp %cst_19, %2779, %cst_20 : tensor<1x256x1280xf32>
-    %2781 = stablehlo.convert %2780 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2782 = stablehlo.add %2781, %cst_2 : tensor<1x256x1280xbf16>
-    %2783 = stablehlo.multiply %2782, %2753 : tensor<1x256x1280xbf16>
-    %2784 = stablehlo.reshape %2783 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2785 = stablehlo.convert %2784 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2786 = stablehlo.dot_general %2785, %arg283, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2787 = stablehlo.broadcast_in_dim %2786, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2788 = stablehlo.multiply %2787, %127 : tensor<256x1280xf32>
-    %2789 = stablehlo.broadcast_in_dim %2788, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2790 = stablehlo.broadcast_in_dim %arg284, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2791 = stablehlo.add %2789, %2790 : tensor<256x1280xf32>
-    %2792 = stablehlo.convert %2791 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2793 = stablehlo.reshape %2792 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2794 = stablehlo.add %2793, %2705 : tensor<1x256x1280xbf16>
-    %2795 = stablehlo.convert %2794 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2796 = stablehlo.convert %2795 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2797 = stablehlo.reduce(%2796 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2798 = stablehlo.reshape %2797 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2799 = stablehlo.broadcast_in_dim %2798, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2800 = stablehlo.divide %2799, %142 : tensor<1x256x1xf64>
-    %2801 = stablehlo.broadcast_in_dim %2796, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2802 = stablehlo.broadcast_in_dim %2800, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2803 = stablehlo.subtract %2801, %2802 : tensor<1x256x1280xf64>
-    %2804 = stablehlo.multiply %2803, %2803 : tensor<1x256x1280xf64>
-    %2805 = stablehlo.reduce(%2804 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2806 = stablehlo.reshape %2805 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2807 = stablehlo.broadcast_in_dim %2806, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2808 = stablehlo.divide %2807, %142 : tensor<1x256x1xf64>
-    %2809 = stablehlo.convert %2808 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2810 = stablehlo.reduce(%2795 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2811 = stablehlo.reshape %2810 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2812 = stablehlo.broadcast_in_dim %2811, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2813 = stablehlo.divide %2812, %158 : tensor<1x256x1xf32>
-    %2814 = stablehlo.broadcast_in_dim %2809, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2815 = stablehlo.add %2814, %161 : tensor<1x256x1xf32>
-    %2816 = stablehlo.rsqrt %2815 : tensor<1x256x1xf32>
-    %2817 = stablehlo.broadcast_in_dim %2795, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2818 = stablehlo.broadcast_in_dim %2813, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2819 = stablehlo.subtract %2817, %2818 : tensor<1x256x1280xf32>
-    %2820 = stablehlo.broadcast_in_dim %2819, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2821 = stablehlo.broadcast_in_dim %2816, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2822 = stablehlo.multiply %2820, %2821 : tensor<1x256x1280xf32>
-    %2823 = stablehlo.convert %arg59 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2824 = stablehlo.broadcast_in_dim %2822, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2825 = stablehlo.broadcast_in_dim %2823, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2826 = stablehlo.multiply %2824, %2825 : tensor<1x256x1280xf32>
-    %2827 = stablehlo.convert %arg60 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2828 = stablehlo.broadcast_in_dim %2826, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2829 = stablehlo.broadcast_in_dim %2827, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2830 = stablehlo.add %2828, %2829 : tensor<1x256x1280xf32>
-    %2831 = stablehlo.convert %2830 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2832 = stablehlo.reshape %2831 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2833 = stablehlo.convert %2832 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2834 = stablehlo.dot_general %2833, %arg285, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2835 = stablehlo.broadcast_in_dim %2834, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2836 = stablehlo.multiply %2835, %273 : tensor<256x256xf32>
-    %2837 = stablehlo.broadcast_in_dim %2836, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2838 = stablehlo.broadcast_in_dim %arg286, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2839 = stablehlo.add %2837, %2838 : tensor<256x256xf32>
-    %2840 = stablehlo.convert %2839 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2841 = stablehlo.reshape %2840 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2842 = stablehlo.dot_general %2833, %arg287, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %2843 = stablehlo.broadcast_in_dim %2842, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2844 = stablehlo.multiply %2843, %273 : tensor<256x256xf32>
-    %2845 = stablehlo.broadcast_in_dim %2844, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2846 = stablehlo.broadcast_in_dim %arg288, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2847 = stablehlo.add %2845, %2846 : tensor<256x256xf32>
-    %2848 = stablehlo.convert %2847 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2849 = stablehlo.reshape %2848 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2850 = stablehlo.dot_general %2833, %arg289, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2851 = stablehlo.broadcast_in_dim %2850, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2852 = stablehlo.multiply %2851, %127 : tensor<256x1280xf32>
-    %2853 = stablehlo.broadcast_in_dim %2852, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2854 = stablehlo.broadcast_in_dim %arg290, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2855 = stablehlo.add %2853, %2854 : tensor<256x1280xf32>
-    %2856 = stablehlo.convert %2855 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2857 = stablehlo.reshape %2856 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2858 = stablehlo.reshape %2841 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2859 = stablehlo.transpose %2858, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2860 = stablehlo.reshape %2849 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2861 = stablehlo.transpose %2860, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2862 = stablehlo.reshape %2857 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %2863 = stablehlo.transpose %2862, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2864 = stablehlo.transpose %2861, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %2865 = stablehlo.reshape %2859 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2866 = stablehlo.reshape %2864 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2867 = stablehlo.broadcast_in_dim %2866, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2868 = stablehlo.dot_general %2865, %2867, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %2869 = stablehlo.reshape %2868 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2870 = stablehlo.broadcast_in_dim %2869, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2871 = stablehlo.divide %2870, %309 : tensor<1x8x256x256xbf16>
-    %2872 = stablehlo.convert %2871 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %2873 = stablehlo.reduce(%2872 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2874 = stablehlo.reshape %2873 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2875 = stablehlo.broadcast_in_dim %2872, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2876 = stablehlo.broadcast_in_dim %2874, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2877 = stablehlo.subtract %2875, %2876 : tensor<1x8x256x256xf32>
-    %2878 = stablehlo.exponential %2877 : tensor<1x8x256x256xf32>
-    %2879 = stablehlo.reduce(%2878 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2880 = stablehlo.reshape %2879 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2881 = stablehlo.broadcast_in_dim %2878, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2882 = stablehlo.broadcast_in_dim %2880, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2883 = stablehlo.divide %2881, %2882 : tensor<1x8x256x256xf32>
-    %2884 = stablehlo.convert %2883 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %2885 = stablehlo.reshape %2884 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %2886 = stablehlo.reshape %2863 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2887 = stablehlo.broadcast_in_dim %2886, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2888 = stablehlo.dot_general %2885, %2887, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %2889 = stablehlo.reshape %2888 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %2890 = stablehlo.transpose %2889, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %2891 = stablehlo.reshape %2890 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %2892 = stablehlo.reshape %2891 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2893 = stablehlo.convert %2892 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2894 = stablehlo.dot_general %2893, %arg291, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2895 = stablehlo.broadcast_in_dim %2894, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2896 = stablehlo.multiply %2895, %127 : tensor<256x1280xf32>
-    %2897 = stablehlo.broadcast_in_dim %2896, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2898 = stablehlo.broadcast_in_dim %arg292, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2899 = stablehlo.add %2897, %2898 : tensor<256x1280xf32>
-    %2900 = stablehlo.convert %2899 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2901 = stablehlo.reshape %2900 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2902 = stablehlo.add %2901, %2794 : tensor<1x256x1280xbf16>
-    %2903 = stablehlo.convert %2902 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2904 = stablehlo.convert %2903 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2905 = stablehlo.reduce(%2904 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2906 = stablehlo.reshape %2905 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2907 = stablehlo.broadcast_in_dim %2906, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2908 = stablehlo.divide %2907, %142 : tensor<1x256x1xf64>
-    %2909 = stablehlo.broadcast_in_dim %2904, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2910 = stablehlo.broadcast_in_dim %2908, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %2911 = stablehlo.subtract %2909, %2910 : tensor<1x256x1280xf64>
-    %2912 = stablehlo.multiply %2911, %2911 : tensor<1x256x1280xf64>
-    %2913 = stablehlo.reduce(%2912 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2914 = stablehlo.reshape %2913 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2915 = stablehlo.broadcast_in_dim %2914, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2916 = stablehlo.divide %2915, %142 : tensor<1x256x1xf64>
-    %2917 = stablehlo.convert %2916 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2918 = stablehlo.reduce(%2903 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2919 = stablehlo.reshape %2918 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2920 = stablehlo.broadcast_in_dim %2919, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2921 = stablehlo.divide %2920, %158 : tensor<1x256x1xf32>
-    %2922 = stablehlo.broadcast_in_dim %2917, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2923 = stablehlo.add %2922, %161 : tensor<1x256x1xf32>
-    %2924 = stablehlo.rsqrt %2923 : tensor<1x256x1xf32>
-    %2925 = stablehlo.broadcast_in_dim %2903, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2926 = stablehlo.broadcast_in_dim %2921, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2927 = stablehlo.subtract %2925, %2926 : tensor<1x256x1280xf32>
-    %2928 = stablehlo.broadcast_in_dim %2927, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2929 = stablehlo.broadcast_in_dim %2924, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %2930 = stablehlo.multiply %2928, %2929 : tensor<1x256x1280xf32>
-    %2931 = stablehlo.convert %arg61 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2932 = stablehlo.broadcast_in_dim %2930, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2933 = stablehlo.broadcast_in_dim %2931, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2934 = stablehlo.multiply %2932, %2933 : tensor<1x256x1280xf32>
-    %2935 = stablehlo.convert %arg62 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %2936 = stablehlo.broadcast_in_dim %2934, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %2937 = stablehlo.broadcast_in_dim %2935, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %2938 = stablehlo.add %2936, %2937 : tensor<1x256x1280xf32>
-    %2939 = stablehlo.convert %2938 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2940 = stablehlo.reshape %2939 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2941 = stablehlo.convert %2940 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2942 = stablehlo.dot_general %2941, %arg293, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2943 = stablehlo.broadcast_in_dim %2942, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2944 = stablehlo.multiply %2943, %127 : tensor<256x1280xf32>
-    %2945 = stablehlo.broadcast_in_dim %2944, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2946 = stablehlo.broadcast_in_dim %arg294, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2947 = stablehlo.add %2945, %2946 : tensor<256x1280xf32>
-    %2948 = stablehlo.convert %2947 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2949 = stablehlo.reshape %2948 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2950 = stablehlo.multiply %2949, %cst_4 : tensor<1x256x1280xbf16>
-    %2951 = stablehlo.multiply %2949, %190 : tensor<1x256x1280xbf16>
-    %2952 = stablehlo.convert %2951 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2953 = stablehlo.clamp %cst_5, %2952, %cst_6 : tensor<1x256x1280xf32>
-    %2954 = stablehlo.multiply %2953, %2953 : tensor<1x256x1280xf32>
-    %2955 = stablehlo.multiply %cst_7, %2954 : tensor<1x256x1280xf32>
-    %2956 = stablehlo.add %2955, %cst_8 : tensor<1x256x1280xf32>
-    %2957 = stablehlo.multiply %2956, %2954 : tensor<1x256x1280xf32>
-    %2958 = stablehlo.add %2957, %cst_9 : tensor<1x256x1280xf32>
-    %2959 = stablehlo.multiply %2958, %2954 : tensor<1x256x1280xf32>
-    %2960 = stablehlo.add %2959, %cst_10 : tensor<1x256x1280xf32>
-    %2961 = stablehlo.multiply %2960, %2954 : tensor<1x256x1280xf32>
-    %2962 = stablehlo.add %2961, %cst_11 : tensor<1x256x1280xf32>
-    %2963 = stablehlo.multiply %2962, %2954 : tensor<1x256x1280xf32>
-    %2964 = stablehlo.add %2963, %cst_12 : tensor<1x256x1280xf32>
-    %2965 = stablehlo.multiply %2964, %2954 : tensor<1x256x1280xf32>
-    %2966 = stablehlo.add %2965, %cst_13 : tensor<1x256x1280xf32>
-    %2967 = stablehlo.multiply %cst_14, %2954 : tensor<1x256x1280xf32>
-    %2968 = stablehlo.add %2967, %cst_15 : tensor<1x256x1280xf32>
-    %2969 = stablehlo.multiply %2968, %2954 : tensor<1x256x1280xf32>
-    %2970 = stablehlo.add %2969, %cst_16 : tensor<1x256x1280xf32>
-    %2971 = stablehlo.multiply %2970, %2954 : tensor<1x256x1280xf32>
-    %2972 = stablehlo.add %2971, %cst_17 : tensor<1x256x1280xf32>
-    %2973 = stablehlo.multiply %2972, %2954 : tensor<1x256x1280xf32>
-    %2974 = stablehlo.add %2973, %cst_18 : tensor<1x256x1280xf32>
-    %2975 = stablehlo.multiply %2953, %2966 : tensor<1x256x1280xf32>
-    %2976 = stablehlo.divide %2975, %2974 : tensor<1x256x1280xf32>
-    %2977 = stablehlo.clamp %cst_19, %2976, %cst_20 : tensor<1x256x1280xf32>
-    %2978 = stablehlo.convert %2977 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %2979 = stablehlo.add %2978, %cst_2 : tensor<1x256x1280xbf16>
-    %2980 = stablehlo.multiply %2979, %2950 : tensor<1x256x1280xbf16>
-    %2981 = stablehlo.reshape %2980 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %2982 = stablehlo.convert %2981 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %2983 = stablehlo.dot_general %2982, %arg295, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %2984 = stablehlo.broadcast_in_dim %2983, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2985 = stablehlo.multiply %2984, %127 : tensor<256x1280xf32>
-    %2986 = stablehlo.broadcast_in_dim %2985, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %2987 = stablehlo.broadcast_in_dim %arg296, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %2988 = stablehlo.add %2986, %2987 : tensor<256x1280xf32>
-    %2989 = stablehlo.convert %2988 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %2990 = stablehlo.reshape %2989 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %2991 = stablehlo.add %2990, %2902 : tensor<1x256x1280xbf16>
-    %2992 = stablehlo.convert %2991 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %2993 = stablehlo.convert %2992 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %2994 = stablehlo.reduce(%2993 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2995 = stablehlo.reshape %2994 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2996 = stablehlo.broadcast_in_dim %2995, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2997 = stablehlo.divide %2996, %142 : tensor<1x256x1xf64>
-    %2998 = stablehlo.broadcast_in_dim %2993, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %2999 = stablehlo.broadcast_in_dim %2997, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3000 = stablehlo.subtract %2998, %2999 : tensor<1x256x1280xf64>
-    %3001 = stablehlo.multiply %3000, %3000 : tensor<1x256x1280xf64>
-    %3002 = stablehlo.reduce(%3001 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3003 = stablehlo.reshape %3002 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3004 = stablehlo.broadcast_in_dim %3003, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3005 = stablehlo.divide %3004, %142 : tensor<1x256x1xf64>
-    %3006 = stablehlo.convert %3005 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3007 = stablehlo.reduce(%2992 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3008 = stablehlo.reshape %3007 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3009 = stablehlo.broadcast_in_dim %3008, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3010 = stablehlo.divide %3009, %158 : tensor<1x256x1xf32>
-    %3011 = stablehlo.broadcast_in_dim %3006, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3012 = stablehlo.add %3011, %161 : tensor<1x256x1xf32>
-    %3013 = stablehlo.rsqrt %3012 : tensor<1x256x1xf32>
-    %3014 = stablehlo.broadcast_in_dim %2992, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3015 = stablehlo.broadcast_in_dim %3010, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3016 = stablehlo.subtract %3014, %3015 : tensor<1x256x1280xf32>
-    %3017 = stablehlo.broadcast_in_dim %3016, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3018 = stablehlo.broadcast_in_dim %3013, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3019 = stablehlo.multiply %3017, %3018 : tensor<1x256x1280xf32>
-    %3020 = stablehlo.convert %arg63 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3021 = stablehlo.broadcast_in_dim %3019, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3022 = stablehlo.broadcast_in_dim %3020, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3023 = stablehlo.multiply %3021, %3022 : tensor<1x256x1280xf32>
-    %3024 = stablehlo.convert %arg64 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3025 = stablehlo.broadcast_in_dim %3023, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3026 = stablehlo.broadcast_in_dim %3024, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3027 = stablehlo.add %3025, %3026 : tensor<1x256x1280xf32>
-    %3028 = stablehlo.convert %3027 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3029 = stablehlo.reshape %3028 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3030 = stablehlo.convert %3029 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3031 = stablehlo.dot_general %3030, %arg297, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3032 = stablehlo.broadcast_in_dim %3031, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3033 = stablehlo.multiply %3032, %273 : tensor<256x256xf32>
-    %3034 = stablehlo.broadcast_in_dim %3033, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3035 = stablehlo.broadcast_in_dim %arg298, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3036 = stablehlo.add %3034, %3035 : tensor<256x256xf32>
-    %3037 = stablehlo.convert %3036 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3038 = stablehlo.reshape %3037 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3039 = stablehlo.dot_general %3030, %arg299, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3040 = stablehlo.broadcast_in_dim %3039, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3041 = stablehlo.multiply %3040, %273 : tensor<256x256xf32>
-    %3042 = stablehlo.broadcast_in_dim %3041, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3043 = stablehlo.broadcast_in_dim %arg300, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3044 = stablehlo.add %3042, %3043 : tensor<256x256xf32>
-    %3045 = stablehlo.convert %3044 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3046 = stablehlo.reshape %3045 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3047 = stablehlo.dot_general %3030, %arg301, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3048 = stablehlo.broadcast_in_dim %3047, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3049 = stablehlo.multiply %3048, %127 : tensor<256x1280xf32>
-    %3050 = stablehlo.broadcast_in_dim %3049, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3051 = stablehlo.broadcast_in_dim %arg302, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3052 = stablehlo.add %3050, %3051 : tensor<256x1280xf32>
-    %3053 = stablehlo.convert %3052 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3054 = stablehlo.reshape %3053 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3055 = stablehlo.reshape %3038 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3056 = stablehlo.transpose %3055, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3057 = stablehlo.reshape %3046 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3058 = stablehlo.transpose %3057, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3059 = stablehlo.reshape %3054 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %3060 = stablehlo.transpose %3059, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3061 = stablehlo.transpose %3058, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %3062 = stablehlo.reshape %3056 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %3063 = stablehlo.reshape %3061 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3064 = stablehlo.broadcast_in_dim %3063, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3065 = stablehlo.dot_general %3062, %3064, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %3066 = stablehlo.reshape %3065 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3067 = stablehlo.broadcast_in_dim %3066, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3068 = stablehlo.divide %3067, %309 : tensor<1x8x256x256xbf16>
-    %3069 = stablehlo.convert %3068 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %3070 = stablehlo.reduce(%3069 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3071 = stablehlo.reshape %3070 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3072 = stablehlo.broadcast_in_dim %3069, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3073 = stablehlo.broadcast_in_dim %3071, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3074 = stablehlo.subtract %3072, %3073 : tensor<1x8x256x256xf32>
-    %3075 = stablehlo.exponential %3074 : tensor<1x8x256x256xf32>
-    %3076 = stablehlo.reduce(%3075 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3077 = stablehlo.reshape %3076 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3078 = stablehlo.broadcast_in_dim %3075, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3079 = stablehlo.broadcast_in_dim %3077, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3080 = stablehlo.divide %3078, %3079 : tensor<1x8x256x256xf32>
-    %3081 = stablehlo.convert %3080 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %3082 = stablehlo.reshape %3081 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %3083 = stablehlo.reshape %3060 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3084 = stablehlo.broadcast_in_dim %3083, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3085 = stablehlo.dot_general %3082, %3084, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3086 = stablehlo.reshape %3085 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3087 = stablehlo.transpose %3086, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %3088 = stablehlo.reshape %3087 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %3089 = stablehlo.reshape %3088 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3090 = stablehlo.convert %3089 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3091 = stablehlo.dot_general %3090, %arg303, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3092 = stablehlo.broadcast_in_dim %3091, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3093 = stablehlo.multiply %3092, %127 : tensor<256x1280xf32>
-    %3094 = stablehlo.broadcast_in_dim %3093, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3095 = stablehlo.broadcast_in_dim %arg304, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3096 = stablehlo.add %3094, %3095 : tensor<256x1280xf32>
-    %3097 = stablehlo.convert %3096 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3098 = stablehlo.reshape %3097 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3099 = stablehlo.add %3098, %2991 : tensor<1x256x1280xbf16>
-    %3100 = stablehlo.convert %3099 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3101 = stablehlo.convert %3100 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3102 = stablehlo.reduce(%3101 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3103 = stablehlo.reshape %3102 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3104 = stablehlo.broadcast_in_dim %3103, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3105 = stablehlo.divide %3104, %142 : tensor<1x256x1xf64>
-    %3106 = stablehlo.broadcast_in_dim %3101, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3107 = stablehlo.broadcast_in_dim %3105, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3108 = stablehlo.subtract %3106, %3107 : tensor<1x256x1280xf64>
-    %3109 = stablehlo.multiply %3108, %3108 : tensor<1x256x1280xf64>
-    %3110 = stablehlo.reduce(%3109 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3111 = stablehlo.reshape %3110 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3112 = stablehlo.broadcast_in_dim %3111, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3113 = stablehlo.divide %3112, %142 : tensor<1x256x1xf64>
-    %3114 = stablehlo.convert %3113 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3115 = stablehlo.reduce(%3100 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3116 = stablehlo.reshape %3115 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3117 = stablehlo.broadcast_in_dim %3116, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3118 = stablehlo.divide %3117, %158 : tensor<1x256x1xf32>
-    %3119 = stablehlo.broadcast_in_dim %3114, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3120 = stablehlo.add %3119, %161 : tensor<1x256x1xf32>
-    %3121 = stablehlo.rsqrt %3120 : tensor<1x256x1xf32>
-    %3122 = stablehlo.broadcast_in_dim %3100, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3123 = stablehlo.broadcast_in_dim %3118, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3124 = stablehlo.subtract %3122, %3123 : tensor<1x256x1280xf32>
-    %3125 = stablehlo.broadcast_in_dim %3124, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3126 = stablehlo.broadcast_in_dim %3121, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3127 = stablehlo.multiply %3125, %3126 : tensor<1x256x1280xf32>
-    %3128 = stablehlo.convert %arg65 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3129 = stablehlo.broadcast_in_dim %3127, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3130 = stablehlo.broadcast_in_dim %3128, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3131 = stablehlo.multiply %3129, %3130 : tensor<1x256x1280xf32>
-    %3132 = stablehlo.convert %arg66 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3133 = stablehlo.broadcast_in_dim %3131, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3134 = stablehlo.broadcast_in_dim %3132, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3135 = stablehlo.add %3133, %3134 : tensor<1x256x1280xf32>
-    %3136 = stablehlo.convert %3135 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3137 = stablehlo.reshape %3136 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3138 = stablehlo.convert %3137 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3139 = stablehlo.dot_general %3138, %arg305, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3140 = stablehlo.broadcast_in_dim %3139, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3141 = stablehlo.multiply %3140, %127 : tensor<256x1280xf32>
-    %3142 = stablehlo.broadcast_in_dim %3141, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3143 = stablehlo.broadcast_in_dim %arg306, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3144 = stablehlo.add %3142, %3143 : tensor<256x1280xf32>
-    %3145 = stablehlo.convert %3144 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3146 = stablehlo.reshape %3145 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3147 = stablehlo.multiply %3146, %cst_4 : tensor<1x256x1280xbf16>
-    %3148 = stablehlo.multiply %3146, %190 : tensor<1x256x1280xbf16>
-    %3149 = stablehlo.convert %3148 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3150 = stablehlo.clamp %cst_5, %3149, %cst_6 : tensor<1x256x1280xf32>
-    %3151 = stablehlo.multiply %3150, %3150 : tensor<1x256x1280xf32>
-    %3152 = stablehlo.multiply %cst_7, %3151 : tensor<1x256x1280xf32>
-    %3153 = stablehlo.add %3152, %cst_8 : tensor<1x256x1280xf32>
-    %3154 = stablehlo.multiply %3153, %3151 : tensor<1x256x1280xf32>
-    %3155 = stablehlo.add %3154, %cst_9 : tensor<1x256x1280xf32>
-    %3156 = stablehlo.multiply %3155, %3151 : tensor<1x256x1280xf32>
-    %3157 = stablehlo.add %3156, %cst_10 : tensor<1x256x1280xf32>
-    %3158 = stablehlo.multiply %3157, %3151 : tensor<1x256x1280xf32>
-    %3159 = stablehlo.add %3158, %cst_11 : tensor<1x256x1280xf32>
-    %3160 = stablehlo.multiply %3159, %3151 : tensor<1x256x1280xf32>
-    %3161 = stablehlo.add %3160, %cst_12 : tensor<1x256x1280xf32>
-    %3162 = stablehlo.multiply %3161, %3151 : tensor<1x256x1280xf32>
-    %3163 = stablehlo.add %3162, %cst_13 : tensor<1x256x1280xf32>
-    %3164 = stablehlo.multiply %cst_14, %3151 : tensor<1x256x1280xf32>
-    %3165 = stablehlo.add %3164, %cst_15 : tensor<1x256x1280xf32>
-    %3166 = stablehlo.multiply %3165, %3151 : tensor<1x256x1280xf32>
-    %3167 = stablehlo.add %3166, %cst_16 : tensor<1x256x1280xf32>
-    %3168 = stablehlo.multiply %3167, %3151 : tensor<1x256x1280xf32>
-    %3169 = stablehlo.add %3168, %cst_17 : tensor<1x256x1280xf32>
-    %3170 = stablehlo.multiply %3169, %3151 : tensor<1x256x1280xf32>
-    %3171 = stablehlo.add %3170, %cst_18 : tensor<1x256x1280xf32>
-    %3172 = stablehlo.multiply %3150, %3163 : tensor<1x256x1280xf32>
-    %3173 = stablehlo.divide %3172, %3171 : tensor<1x256x1280xf32>
-    %3174 = stablehlo.clamp %cst_19, %3173, %cst_20 : tensor<1x256x1280xf32>
-    %3175 = stablehlo.convert %3174 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3176 = stablehlo.add %3175, %cst_2 : tensor<1x256x1280xbf16>
-    %3177 = stablehlo.multiply %3176, %3147 : tensor<1x256x1280xbf16>
-    %3178 = stablehlo.reshape %3177 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3179 = stablehlo.convert %3178 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3180 = stablehlo.dot_general %3179, %arg307, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3181 = stablehlo.broadcast_in_dim %3180, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3182 = stablehlo.multiply %3181, %127 : tensor<256x1280xf32>
-    %3183 = stablehlo.broadcast_in_dim %3182, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3184 = stablehlo.broadcast_in_dim %arg308, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3185 = stablehlo.add %3183, %3184 : tensor<256x1280xf32>
-    %3186 = stablehlo.convert %3185 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3187 = stablehlo.reshape %3186 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3188 = stablehlo.add %3187, %3099 : tensor<1x256x1280xbf16>
-    %3189 = stablehlo.convert %3188 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3190 = stablehlo.convert %3189 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3191 = stablehlo.reduce(%3190 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3192 = stablehlo.reshape %3191 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3193 = stablehlo.broadcast_in_dim %3192, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3194 = stablehlo.divide %3193, %142 : tensor<1x256x1xf64>
-    %3195 = stablehlo.broadcast_in_dim %3190, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3196 = stablehlo.broadcast_in_dim %3194, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3197 = stablehlo.subtract %3195, %3196 : tensor<1x256x1280xf64>
-    %3198 = stablehlo.multiply %3197, %3197 : tensor<1x256x1280xf64>
-    %3199 = stablehlo.reduce(%3198 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3200 = stablehlo.reshape %3199 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3201 = stablehlo.broadcast_in_dim %3200, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3202 = stablehlo.divide %3201, %142 : tensor<1x256x1xf64>
-    %3203 = stablehlo.convert %3202 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3204 = stablehlo.reduce(%3189 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3205 = stablehlo.reshape %3204 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3206 = stablehlo.broadcast_in_dim %3205, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3207 = stablehlo.divide %3206, %158 : tensor<1x256x1xf32>
-    %3208 = stablehlo.broadcast_in_dim %3203, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3209 = stablehlo.add %3208, %161 : tensor<1x256x1xf32>
-    %3210 = stablehlo.rsqrt %3209 : tensor<1x256x1xf32>
-    %3211 = stablehlo.broadcast_in_dim %3189, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3212 = stablehlo.broadcast_in_dim %3207, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3213 = stablehlo.subtract %3211, %3212 : tensor<1x256x1280xf32>
-    %3214 = stablehlo.broadcast_in_dim %3213, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3215 = stablehlo.broadcast_in_dim %3210, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3216 = stablehlo.multiply %3214, %3215 : tensor<1x256x1280xf32>
-    %3217 = stablehlo.convert %arg67 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3218 = stablehlo.broadcast_in_dim %3216, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3219 = stablehlo.broadcast_in_dim %3217, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3220 = stablehlo.multiply %3218, %3219 : tensor<1x256x1280xf32>
-    %3221 = stablehlo.convert %arg68 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3222 = stablehlo.broadcast_in_dim %3220, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3223 = stablehlo.broadcast_in_dim %3221, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3224 = stablehlo.add %3222, %3223 : tensor<1x256x1280xf32>
-    %3225 = stablehlo.convert %3224 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3226 = stablehlo.reshape %3225 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3227 = stablehlo.convert %3226 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3228 = stablehlo.dot_general %3227, %arg309, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3229 = stablehlo.broadcast_in_dim %3228, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3230 = stablehlo.multiply %3229, %273 : tensor<256x256xf32>
-    %3231 = stablehlo.broadcast_in_dim %3230, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3232 = stablehlo.broadcast_in_dim %arg310, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3233 = stablehlo.add %3231, %3232 : tensor<256x256xf32>
-    %3234 = stablehlo.convert %3233 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3235 = stablehlo.reshape %3234 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3236 = stablehlo.dot_general %3227, %arg311, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3237 = stablehlo.broadcast_in_dim %3236, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3238 = stablehlo.multiply %3237, %273 : tensor<256x256xf32>
-    %3239 = stablehlo.broadcast_in_dim %3238, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3240 = stablehlo.broadcast_in_dim %arg312, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3241 = stablehlo.add %3239, %3240 : tensor<256x256xf32>
-    %3242 = stablehlo.convert %3241 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3243 = stablehlo.reshape %3242 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3244 = stablehlo.dot_general %3227, %arg313, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3245 = stablehlo.broadcast_in_dim %3244, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3246 = stablehlo.multiply %3245, %127 : tensor<256x1280xf32>
-    %3247 = stablehlo.broadcast_in_dim %3246, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3248 = stablehlo.broadcast_in_dim %arg314, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3249 = stablehlo.add %3247, %3248 : tensor<256x1280xf32>
-    %3250 = stablehlo.convert %3249 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3251 = stablehlo.reshape %3250 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3252 = stablehlo.reshape %3235 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3253 = stablehlo.transpose %3252, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3254 = stablehlo.reshape %3243 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3255 = stablehlo.transpose %3254, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3256 = stablehlo.reshape %3251 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %3257 = stablehlo.transpose %3256, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3258 = stablehlo.transpose %3255, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %3259 = stablehlo.reshape %3253 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %3260 = stablehlo.reshape %3258 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3261 = stablehlo.broadcast_in_dim %3260, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3262 = stablehlo.dot_general %3259, %3261, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %3263 = stablehlo.reshape %3262 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3264 = stablehlo.broadcast_in_dim %3263, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3265 = stablehlo.divide %3264, %309 : tensor<1x8x256x256xbf16>
-    %3266 = stablehlo.convert %3265 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %3267 = stablehlo.reduce(%3266 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3268 = stablehlo.reshape %3267 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3269 = stablehlo.broadcast_in_dim %3266, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3270 = stablehlo.broadcast_in_dim %3268, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3271 = stablehlo.subtract %3269, %3270 : tensor<1x8x256x256xf32>
-    %3272 = stablehlo.exponential %3271 : tensor<1x8x256x256xf32>
-    %3273 = stablehlo.reduce(%3272 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3274 = stablehlo.reshape %3273 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3275 = stablehlo.broadcast_in_dim %3272, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3276 = stablehlo.broadcast_in_dim %3274, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3277 = stablehlo.divide %3275, %3276 : tensor<1x8x256x256xf32>
-    %3278 = stablehlo.convert %3277 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %3279 = stablehlo.reshape %3278 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %3280 = stablehlo.reshape %3257 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3281 = stablehlo.broadcast_in_dim %3280, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3282 = stablehlo.dot_general %3279, %3281, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3283 = stablehlo.reshape %3282 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3284 = stablehlo.transpose %3283, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %3285 = stablehlo.reshape %3284 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %3286 = stablehlo.reshape %3285 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3287 = stablehlo.convert %3286 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3288 = stablehlo.dot_general %3287, %arg315, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3289 = stablehlo.broadcast_in_dim %3288, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3290 = stablehlo.multiply %3289, %127 : tensor<256x1280xf32>
-    %3291 = stablehlo.broadcast_in_dim %3290, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3292 = stablehlo.broadcast_in_dim %arg316, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3293 = stablehlo.add %3291, %3292 : tensor<256x1280xf32>
-    %3294 = stablehlo.convert %3293 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3295 = stablehlo.reshape %3294 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3296 = stablehlo.add %3295, %3188 : tensor<1x256x1280xbf16>
-    %3297 = stablehlo.convert %3296 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3298 = stablehlo.convert %3297 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3299 = stablehlo.reduce(%3298 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3300 = stablehlo.reshape %3299 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3301 = stablehlo.broadcast_in_dim %3300, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3302 = stablehlo.divide %3301, %142 : tensor<1x256x1xf64>
-    %3303 = stablehlo.broadcast_in_dim %3298, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3304 = stablehlo.broadcast_in_dim %3302, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3305 = stablehlo.subtract %3303, %3304 : tensor<1x256x1280xf64>
-    %3306 = stablehlo.multiply %3305, %3305 : tensor<1x256x1280xf64>
-    %3307 = stablehlo.reduce(%3306 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3308 = stablehlo.reshape %3307 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3309 = stablehlo.broadcast_in_dim %3308, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3310 = stablehlo.divide %3309, %142 : tensor<1x256x1xf64>
-    %3311 = stablehlo.convert %3310 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3312 = stablehlo.reduce(%3297 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3313 = stablehlo.reshape %3312 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3314 = stablehlo.broadcast_in_dim %3313, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3315 = stablehlo.divide %3314, %158 : tensor<1x256x1xf32>
-    %3316 = stablehlo.broadcast_in_dim %3311, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3317 = stablehlo.add %3316, %161 : tensor<1x256x1xf32>
-    %3318 = stablehlo.rsqrt %3317 : tensor<1x256x1xf32>
-    %3319 = stablehlo.broadcast_in_dim %3297, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3320 = stablehlo.broadcast_in_dim %3315, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3321 = stablehlo.subtract %3319, %3320 : tensor<1x256x1280xf32>
-    %3322 = stablehlo.broadcast_in_dim %3321, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3323 = stablehlo.broadcast_in_dim %3318, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3324 = stablehlo.multiply %3322, %3323 : tensor<1x256x1280xf32>
-    %3325 = stablehlo.convert %arg69 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3326 = stablehlo.broadcast_in_dim %3324, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3327 = stablehlo.broadcast_in_dim %3325, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3328 = stablehlo.multiply %3326, %3327 : tensor<1x256x1280xf32>
-    %3329 = stablehlo.convert %arg70 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3330 = stablehlo.broadcast_in_dim %3328, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3331 = stablehlo.broadcast_in_dim %3329, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3332 = stablehlo.add %3330, %3331 : tensor<1x256x1280xf32>
-    %3333 = stablehlo.convert %3332 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3334 = stablehlo.reshape %3333 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3335 = stablehlo.convert %3334 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3336 = stablehlo.dot_general %3335, %arg317, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3337 = stablehlo.broadcast_in_dim %3336, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3338 = stablehlo.multiply %3337, %127 : tensor<256x1280xf32>
-    %3339 = stablehlo.broadcast_in_dim %3338, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3340 = stablehlo.broadcast_in_dim %arg318, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3341 = stablehlo.add %3339, %3340 : tensor<256x1280xf32>
-    %3342 = stablehlo.convert %3341 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3343 = stablehlo.reshape %3342 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3344 = stablehlo.multiply %3343, %cst_4 : tensor<1x256x1280xbf16>
-    %3345 = stablehlo.multiply %3343, %190 : tensor<1x256x1280xbf16>
-    %3346 = stablehlo.convert %3345 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3347 = stablehlo.clamp %cst_5, %3346, %cst_6 : tensor<1x256x1280xf32>
-    %3348 = stablehlo.multiply %3347, %3347 : tensor<1x256x1280xf32>
-    %3349 = stablehlo.multiply %cst_7, %3348 : tensor<1x256x1280xf32>
-    %3350 = stablehlo.add %3349, %cst_8 : tensor<1x256x1280xf32>
-    %3351 = stablehlo.multiply %3350, %3348 : tensor<1x256x1280xf32>
-    %3352 = stablehlo.add %3351, %cst_9 : tensor<1x256x1280xf32>
-    %3353 = stablehlo.multiply %3352, %3348 : tensor<1x256x1280xf32>
-    %3354 = stablehlo.add %3353, %cst_10 : tensor<1x256x1280xf32>
-    %3355 = stablehlo.multiply %3354, %3348 : tensor<1x256x1280xf32>
-    %3356 = stablehlo.add %3355, %cst_11 : tensor<1x256x1280xf32>
-    %3357 = stablehlo.multiply %3356, %3348 : tensor<1x256x1280xf32>
-    %3358 = stablehlo.add %3357, %cst_12 : tensor<1x256x1280xf32>
-    %3359 = stablehlo.multiply %3358, %3348 : tensor<1x256x1280xf32>
-    %3360 = stablehlo.add %3359, %cst_13 : tensor<1x256x1280xf32>
-    %3361 = stablehlo.multiply %cst_14, %3348 : tensor<1x256x1280xf32>
-    %3362 = stablehlo.add %3361, %cst_15 : tensor<1x256x1280xf32>
-    %3363 = stablehlo.multiply %3362, %3348 : tensor<1x256x1280xf32>
-    %3364 = stablehlo.add %3363, %cst_16 : tensor<1x256x1280xf32>
-    %3365 = stablehlo.multiply %3364, %3348 : tensor<1x256x1280xf32>
-    %3366 = stablehlo.add %3365, %cst_17 : tensor<1x256x1280xf32>
-    %3367 = stablehlo.multiply %3366, %3348 : tensor<1x256x1280xf32>
-    %3368 = stablehlo.add %3367, %cst_18 : tensor<1x256x1280xf32>
-    %3369 = stablehlo.multiply %3347, %3360 : tensor<1x256x1280xf32>
-    %3370 = stablehlo.divide %3369, %3368 : tensor<1x256x1280xf32>
-    %3371 = stablehlo.clamp %cst_19, %3370, %cst_20 : tensor<1x256x1280xf32>
-    %3372 = stablehlo.convert %3371 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3373 = stablehlo.add %3372, %cst_2 : tensor<1x256x1280xbf16>
-    %3374 = stablehlo.multiply %3373, %3344 : tensor<1x256x1280xbf16>
-    %3375 = stablehlo.reshape %3374 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3376 = stablehlo.convert %3375 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3377 = stablehlo.dot_general %3376, %arg319, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3378 = stablehlo.broadcast_in_dim %3377, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3379 = stablehlo.multiply %3378, %127 : tensor<256x1280xf32>
-    %3380 = stablehlo.broadcast_in_dim %3379, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3381 = stablehlo.broadcast_in_dim %arg320, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3382 = stablehlo.add %3380, %3381 : tensor<256x1280xf32>
-    %3383 = stablehlo.convert %3382 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3384 = stablehlo.reshape %3383 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3385 = stablehlo.add %3384, %3296 : tensor<1x256x1280xbf16>
-    %3386 = stablehlo.convert %3385 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3387 = stablehlo.convert %3386 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3388 = stablehlo.reduce(%3387 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3389 = stablehlo.reshape %3388 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3390 = stablehlo.broadcast_in_dim %3389, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3391 = stablehlo.divide %3390, %142 : tensor<1x256x1xf64>
-    %3392 = stablehlo.broadcast_in_dim %3387, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3393 = stablehlo.broadcast_in_dim %3391, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3394 = stablehlo.subtract %3392, %3393 : tensor<1x256x1280xf64>
-    %3395 = stablehlo.multiply %3394, %3394 : tensor<1x256x1280xf64>
-    %3396 = stablehlo.reduce(%3395 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3397 = stablehlo.reshape %3396 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3398 = stablehlo.broadcast_in_dim %3397, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3399 = stablehlo.divide %3398, %142 : tensor<1x256x1xf64>
-    %3400 = stablehlo.convert %3399 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3401 = stablehlo.reduce(%3386 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3402 = stablehlo.reshape %3401 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3403 = stablehlo.broadcast_in_dim %3402, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3404 = stablehlo.divide %3403, %158 : tensor<1x256x1xf32>
-    %3405 = stablehlo.broadcast_in_dim %3400, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3406 = stablehlo.add %3405, %161 : tensor<1x256x1xf32>
-    %3407 = stablehlo.rsqrt %3406 : tensor<1x256x1xf32>
-    %3408 = stablehlo.broadcast_in_dim %3386, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3409 = stablehlo.broadcast_in_dim %3404, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3410 = stablehlo.subtract %3408, %3409 : tensor<1x256x1280xf32>
-    %3411 = stablehlo.broadcast_in_dim %3410, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3412 = stablehlo.broadcast_in_dim %3407, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3413 = stablehlo.multiply %3411, %3412 : tensor<1x256x1280xf32>
-    %3414 = stablehlo.convert %arg71 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3415 = stablehlo.broadcast_in_dim %3413, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3416 = stablehlo.broadcast_in_dim %3414, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3417 = stablehlo.multiply %3415, %3416 : tensor<1x256x1280xf32>
-    %3418 = stablehlo.convert %arg72 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3419 = stablehlo.broadcast_in_dim %3417, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3420 = stablehlo.broadcast_in_dim %3418, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3421 = stablehlo.add %3419, %3420 : tensor<1x256x1280xf32>
-    %3422 = stablehlo.convert %3421 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3423 = stablehlo.reshape %3422 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3424 = stablehlo.convert %3423 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3425 = stablehlo.dot_general %3424, %arg321, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3426 = stablehlo.broadcast_in_dim %3425, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3427 = stablehlo.multiply %3426, %273 : tensor<256x256xf32>
-    %3428 = stablehlo.broadcast_in_dim %3427, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3429 = stablehlo.broadcast_in_dim %arg322, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3430 = stablehlo.add %3428, %3429 : tensor<256x256xf32>
-    %3431 = stablehlo.convert %3430 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3432 = stablehlo.reshape %3431 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3433 = stablehlo.dot_general %3424, %arg323, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3434 = stablehlo.broadcast_in_dim %3433, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3435 = stablehlo.multiply %3434, %273 : tensor<256x256xf32>
-    %3436 = stablehlo.broadcast_in_dim %3435, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3437 = stablehlo.broadcast_in_dim %arg324, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3438 = stablehlo.add %3436, %3437 : tensor<256x256xf32>
-    %3439 = stablehlo.convert %3438 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3440 = stablehlo.reshape %3439 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3441 = stablehlo.dot_general %3424, %arg325, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3442 = stablehlo.broadcast_in_dim %3441, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3443 = stablehlo.multiply %3442, %127 : tensor<256x1280xf32>
-    %3444 = stablehlo.broadcast_in_dim %3443, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3445 = stablehlo.broadcast_in_dim %arg326, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3446 = stablehlo.add %3444, %3445 : tensor<256x1280xf32>
-    %3447 = stablehlo.convert %3446 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3448 = stablehlo.reshape %3447 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3449 = stablehlo.reshape %3432 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3450 = stablehlo.transpose %3449, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3451 = stablehlo.reshape %3440 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3452 = stablehlo.transpose %3451, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3453 = stablehlo.reshape %3448 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %3454 = stablehlo.transpose %3453, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3455 = stablehlo.transpose %3452, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %3456 = stablehlo.reshape %3450 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %3457 = stablehlo.reshape %3455 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3458 = stablehlo.broadcast_in_dim %3457, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3459 = stablehlo.dot_general %3456, %3458, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %3460 = stablehlo.reshape %3459 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3461 = stablehlo.broadcast_in_dim %3460, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3462 = stablehlo.divide %3461, %309 : tensor<1x8x256x256xbf16>
-    %3463 = stablehlo.convert %3462 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %3464 = stablehlo.reduce(%3463 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3465 = stablehlo.reshape %3464 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3466 = stablehlo.broadcast_in_dim %3463, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3467 = stablehlo.broadcast_in_dim %3465, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3468 = stablehlo.subtract %3466, %3467 : tensor<1x8x256x256xf32>
-    %3469 = stablehlo.exponential %3468 : tensor<1x8x256x256xf32>
-    %3470 = stablehlo.reduce(%3469 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3471 = stablehlo.reshape %3470 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3472 = stablehlo.broadcast_in_dim %3469, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3473 = stablehlo.broadcast_in_dim %3471, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3474 = stablehlo.divide %3472, %3473 : tensor<1x8x256x256xf32>
-    %3475 = stablehlo.convert %3474 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %3476 = stablehlo.reshape %3475 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %3477 = stablehlo.reshape %3454 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3478 = stablehlo.broadcast_in_dim %3477, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3479 = stablehlo.dot_general %3476, %3478, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3480 = stablehlo.reshape %3479 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3481 = stablehlo.transpose %3480, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %3482 = stablehlo.reshape %3481 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %3483 = stablehlo.reshape %3482 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3484 = stablehlo.convert %3483 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3485 = stablehlo.dot_general %3484, %arg327, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3486 = stablehlo.broadcast_in_dim %3485, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3487 = stablehlo.multiply %3486, %127 : tensor<256x1280xf32>
-    %3488 = stablehlo.broadcast_in_dim %3487, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3489 = stablehlo.broadcast_in_dim %arg328, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3490 = stablehlo.add %3488, %3489 : tensor<256x1280xf32>
-    %3491 = stablehlo.convert %3490 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3492 = stablehlo.reshape %3491 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3493 = stablehlo.add %3492, %3385 : tensor<1x256x1280xbf16>
-    %3494 = stablehlo.convert %3493 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3495 = stablehlo.convert %3494 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3496 = stablehlo.reduce(%3495 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3497 = stablehlo.reshape %3496 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3498 = stablehlo.broadcast_in_dim %3497, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3499 = stablehlo.divide %3498, %142 : tensor<1x256x1xf64>
-    %3500 = stablehlo.broadcast_in_dim %3495, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3501 = stablehlo.broadcast_in_dim %3499, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3502 = stablehlo.subtract %3500, %3501 : tensor<1x256x1280xf64>
-    %3503 = stablehlo.multiply %3502, %3502 : tensor<1x256x1280xf64>
-    %3504 = stablehlo.reduce(%3503 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3505 = stablehlo.reshape %3504 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3506 = stablehlo.broadcast_in_dim %3505, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3507 = stablehlo.divide %3506, %142 : tensor<1x256x1xf64>
-    %3508 = stablehlo.convert %3507 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3509 = stablehlo.reduce(%3494 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3510 = stablehlo.reshape %3509 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3511 = stablehlo.broadcast_in_dim %3510, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3512 = stablehlo.divide %3511, %158 : tensor<1x256x1xf32>
-    %3513 = stablehlo.broadcast_in_dim %3508, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3514 = stablehlo.add %3513, %161 : tensor<1x256x1xf32>
-    %3515 = stablehlo.rsqrt %3514 : tensor<1x256x1xf32>
-    %3516 = stablehlo.broadcast_in_dim %3494, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3517 = stablehlo.broadcast_in_dim %3512, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3518 = stablehlo.subtract %3516, %3517 : tensor<1x256x1280xf32>
-    %3519 = stablehlo.broadcast_in_dim %3518, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3520 = stablehlo.broadcast_in_dim %3515, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3521 = stablehlo.multiply %3519, %3520 : tensor<1x256x1280xf32>
-    %3522 = stablehlo.convert %arg73 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3523 = stablehlo.broadcast_in_dim %3521, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3524 = stablehlo.broadcast_in_dim %3522, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3525 = stablehlo.multiply %3523, %3524 : tensor<1x256x1280xf32>
-    %3526 = stablehlo.convert %arg74 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3527 = stablehlo.broadcast_in_dim %3525, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3528 = stablehlo.broadcast_in_dim %3526, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3529 = stablehlo.add %3527, %3528 : tensor<1x256x1280xf32>
-    %3530 = stablehlo.convert %3529 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3531 = stablehlo.reshape %3530 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3532 = stablehlo.convert %3531 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3533 = stablehlo.dot_general %3532, %arg329, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3534 = stablehlo.broadcast_in_dim %3533, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3535 = stablehlo.multiply %3534, %127 : tensor<256x1280xf32>
-    %3536 = stablehlo.broadcast_in_dim %3535, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3537 = stablehlo.broadcast_in_dim %arg330, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3538 = stablehlo.add %3536, %3537 : tensor<256x1280xf32>
-    %3539 = stablehlo.convert %3538 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3540 = stablehlo.reshape %3539 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3541 = stablehlo.multiply %3540, %cst_4 : tensor<1x256x1280xbf16>
-    %3542 = stablehlo.multiply %3540, %190 : tensor<1x256x1280xbf16>
-    %3543 = stablehlo.convert %3542 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3544 = stablehlo.clamp %cst_5, %3543, %cst_6 : tensor<1x256x1280xf32>
-    %3545 = stablehlo.multiply %3544, %3544 : tensor<1x256x1280xf32>
-    %3546 = stablehlo.multiply %cst_7, %3545 : tensor<1x256x1280xf32>
-    %3547 = stablehlo.add %3546, %cst_8 : tensor<1x256x1280xf32>
-    %3548 = stablehlo.multiply %3547, %3545 : tensor<1x256x1280xf32>
-    %3549 = stablehlo.add %3548, %cst_9 : tensor<1x256x1280xf32>
-    %3550 = stablehlo.multiply %3549, %3545 : tensor<1x256x1280xf32>
-    %3551 = stablehlo.add %3550, %cst_10 : tensor<1x256x1280xf32>
-    %3552 = stablehlo.multiply %3551, %3545 : tensor<1x256x1280xf32>
-    %3553 = stablehlo.add %3552, %cst_11 : tensor<1x256x1280xf32>
-    %3554 = stablehlo.multiply %3553, %3545 : tensor<1x256x1280xf32>
-    %3555 = stablehlo.add %3554, %cst_12 : tensor<1x256x1280xf32>
-    %3556 = stablehlo.multiply %3555, %3545 : tensor<1x256x1280xf32>
-    %3557 = stablehlo.add %3556, %cst_13 : tensor<1x256x1280xf32>
-    %3558 = stablehlo.multiply %cst_14, %3545 : tensor<1x256x1280xf32>
-    %3559 = stablehlo.add %3558, %cst_15 : tensor<1x256x1280xf32>
-    %3560 = stablehlo.multiply %3559, %3545 : tensor<1x256x1280xf32>
-    %3561 = stablehlo.add %3560, %cst_16 : tensor<1x256x1280xf32>
-    %3562 = stablehlo.multiply %3561, %3545 : tensor<1x256x1280xf32>
-    %3563 = stablehlo.add %3562, %cst_17 : tensor<1x256x1280xf32>
-    %3564 = stablehlo.multiply %3563, %3545 : tensor<1x256x1280xf32>
-    %3565 = stablehlo.add %3564, %cst_18 : tensor<1x256x1280xf32>
-    %3566 = stablehlo.multiply %3544, %3557 : tensor<1x256x1280xf32>
-    %3567 = stablehlo.divide %3566, %3565 : tensor<1x256x1280xf32>
-    %3568 = stablehlo.clamp %cst_19, %3567, %cst_20 : tensor<1x256x1280xf32>
-    %3569 = stablehlo.convert %3568 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3570 = stablehlo.add %3569, %cst_2 : tensor<1x256x1280xbf16>
-    %3571 = stablehlo.multiply %3570, %3541 : tensor<1x256x1280xbf16>
-    %3572 = stablehlo.reshape %3571 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3573 = stablehlo.convert %3572 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3574 = stablehlo.dot_general %3573, %arg331, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3575 = stablehlo.broadcast_in_dim %3574, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3576 = stablehlo.multiply %3575, %127 : tensor<256x1280xf32>
-    %3577 = stablehlo.broadcast_in_dim %3576, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3578 = stablehlo.broadcast_in_dim %arg332, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3579 = stablehlo.add %3577, %3578 : tensor<256x1280xf32>
-    %3580 = stablehlo.convert %3579 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3581 = stablehlo.reshape %3580 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3582 = stablehlo.add %3581, %3493 : tensor<1x256x1280xbf16>
-    %3583 = stablehlo.convert %3582 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3584 = stablehlo.convert %3583 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3585 = stablehlo.reduce(%3584 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3586 = stablehlo.reshape %3585 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3587 = stablehlo.broadcast_in_dim %3586, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3588 = stablehlo.divide %3587, %142 : tensor<1x256x1xf64>
-    %3589 = stablehlo.broadcast_in_dim %3584, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3590 = stablehlo.broadcast_in_dim %3588, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3591 = stablehlo.subtract %3589, %3590 : tensor<1x256x1280xf64>
-    %3592 = stablehlo.multiply %3591, %3591 : tensor<1x256x1280xf64>
-    %3593 = stablehlo.reduce(%3592 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3594 = stablehlo.reshape %3593 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3595 = stablehlo.broadcast_in_dim %3594, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3596 = stablehlo.divide %3595, %142 : tensor<1x256x1xf64>
-    %3597 = stablehlo.convert %3596 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3598 = stablehlo.reduce(%3583 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3599 = stablehlo.reshape %3598 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3600 = stablehlo.broadcast_in_dim %3599, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3601 = stablehlo.divide %3600, %158 : tensor<1x256x1xf32>
-    %3602 = stablehlo.broadcast_in_dim %3597, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3603 = stablehlo.add %3602, %161 : tensor<1x256x1xf32>
-    %3604 = stablehlo.rsqrt %3603 : tensor<1x256x1xf32>
-    %3605 = stablehlo.broadcast_in_dim %3583, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3606 = stablehlo.broadcast_in_dim %3601, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3607 = stablehlo.subtract %3605, %3606 : tensor<1x256x1280xf32>
-    %3608 = stablehlo.broadcast_in_dim %3607, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3609 = stablehlo.broadcast_in_dim %3604, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3610 = stablehlo.multiply %3608, %3609 : tensor<1x256x1280xf32>
-    %3611 = stablehlo.convert %arg75 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3612 = stablehlo.broadcast_in_dim %3610, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3613 = stablehlo.broadcast_in_dim %3611, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3614 = stablehlo.multiply %3612, %3613 : tensor<1x256x1280xf32>
-    %3615 = stablehlo.convert %arg76 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3616 = stablehlo.broadcast_in_dim %3614, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3617 = stablehlo.broadcast_in_dim %3615, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3618 = stablehlo.add %3616, %3617 : tensor<1x256x1280xf32>
-    %3619 = stablehlo.convert %3618 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3620 = stablehlo.reshape %3619 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3621 = stablehlo.convert %3620 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3622 = stablehlo.dot_general %3621, %arg333, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3623 = stablehlo.broadcast_in_dim %3622, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3624 = stablehlo.multiply %3623, %273 : tensor<256x256xf32>
-    %3625 = stablehlo.broadcast_in_dim %3624, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3626 = stablehlo.broadcast_in_dim %arg334, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3627 = stablehlo.add %3625, %3626 : tensor<256x256xf32>
-    %3628 = stablehlo.convert %3627 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3629 = stablehlo.reshape %3628 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3630 = stablehlo.dot_general %3621, %arg335, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3631 = stablehlo.broadcast_in_dim %3630, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3632 = stablehlo.multiply %3631, %273 : tensor<256x256xf32>
-    %3633 = stablehlo.broadcast_in_dim %3632, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3634 = stablehlo.broadcast_in_dim %arg336, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3635 = stablehlo.add %3633, %3634 : tensor<256x256xf32>
-    %3636 = stablehlo.convert %3635 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3637 = stablehlo.reshape %3636 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3638 = stablehlo.dot_general %3621, %arg337, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3639 = stablehlo.broadcast_in_dim %3638, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3640 = stablehlo.multiply %3639, %127 : tensor<256x1280xf32>
-    %3641 = stablehlo.broadcast_in_dim %3640, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3642 = stablehlo.broadcast_in_dim %arg338, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3643 = stablehlo.add %3641, %3642 : tensor<256x1280xf32>
-    %3644 = stablehlo.convert %3643 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3645 = stablehlo.reshape %3644 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3646 = stablehlo.reshape %3629 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3647 = stablehlo.transpose %3646, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3648 = stablehlo.reshape %3637 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3649 = stablehlo.transpose %3648, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3650 = stablehlo.reshape %3645 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %3651 = stablehlo.transpose %3650, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3652 = stablehlo.transpose %3649, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %3653 = stablehlo.reshape %3647 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %3654 = stablehlo.reshape %3652 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3655 = stablehlo.broadcast_in_dim %3654, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3656 = stablehlo.dot_general %3653, %3655, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %3657 = stablehlo.reshape %3656 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3658 = stablehlo.broadcast_in_dim %3657, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3659 = stablehlo.divide %3658, %309 : tensor<1x8x256x256xbf16>
-    %3660 = stablehlo.convert %3659 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %3661 = stablehlo.reduce(%3660 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3662 = stablehlo.reshape %3661 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3663 = stablehlo.broadcast_in_dim %3660, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3664 = stablehlo.broadcast_in_dim %3662, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3665 = stablehlo.subtract %3663, %3664 : tensor<1x8x256x256xf32>
-    %3666 = stablehlo.exponential %3665 : tensor<1x8x256x256xf32>
-    %3667 = stablehlo.reduce(%3666 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3668 = stablehlo.reshape %3667 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3669 = stablehlo.broadcast_in_dim %3666, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3670 = stablehlo.broadcast_in_dim %3668, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3671 = stablehlo.divide %3669, %3670 : tensor<1x8x256x256xf32>
-    %3672 = stablehlo.convert %3671 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %3673 = stablehlo.reshape %3672 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %3674 = stablehlo.reshape %3651 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3675 = stablehlo.broadcast_in_dim %3674, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3676 = stablehlo.dot_general %3673, %3675, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3677 = stablehlo.reshape %3676 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3678 = stablehlo.transpose %3677, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %3679 = stablehlo.reshape %3678 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %3680 = stablehlo.reshape %3679 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3681 = stablehlo.convert %3680 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3682 = stablehlo.dot_general %3681, %arg339, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3683 = stablehlo.broadcast_in_dim %3682, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3684 = stablehlo.multiply %3683, %127 : tensor<256x1280xf32>
-    %3685 = stablehlo.broadcast_in_dim %3684, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3686 = stablehlo.broadcast_in_dim %arg340, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3687 = stablehlo.add %3685, %3686 : tensor<256x1280xf32>
-    %3688 = stablehlo.convert %3687 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3689 = stablehlo.reshape %3688 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3690 = stablehlo.add %3689, %3582 : tensor<1x256x1280xbf16>
-    %3691 = stablehlo.convert %3690 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3692 = stablehlo.convert %3691 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3693 = stablehlo.reduce(%3692 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3694 = stablehlo.reshape %3693 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3695 = stablehlo.broadcast_in_dim %3694, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3696 = stablehlo.divide %3695, %142 : tensor<1x256x1xf64>
-    %3697 = stablehlo.broadcast_in_dim %3692, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3698 = stablehlo.broadcast_in_dim %3696, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3699 = stablehlo.subtract %3697, %3698 : tensor<1x256x1280xf64>
-    %3700 = stablehlo.multiply %3699, %3699 : tensor<1x256x1280xf64>
-    %3701 = stablehlo.reduce(%3700 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3702 = stablehlo.reshape %3701 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3703 = stablehlo.broadcast_in_dim %3702, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3704 = stablehlo.divide %3703, %142 : tensor<1x256x1xf64>
-    %3705 = stablehlo.convert %3704 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3706 = stablehlo.reduce(%3691 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3707 = stablehlo.reshape %3706 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3708 = stablehlo.broadcast_in_dim %3707, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3709 = stablehlo.divide %3708, %158 : tensor<1x256x1xf32>
-    %3710 = stablehlo.broadcast_in_dim %3705, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3711 = stablehlo.add %3710, %161 : tensor<1x256x1xf32>
-    %3712 = stablehlo.rsqrt %3711 : tensor<1x256x1xf32>
-    %3713 = stablehlo.broadcast_in_dim %3691, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3714 = stablehlo.broadcast_in_dim %3709, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3715 = stablehlo.subtract %3713, %3714 : tensor<1x256x1280xf32>
-    %3716 = stablehlo.broadcast_in_dim %3715, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3717 = stablehlo.broadcast_in_dim %3712, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3718 = stablehlo.multiply %3716, %3717 : tensor<1x256x1280xf32>
-    %3719 = stablehlo.convert %arg77 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3720 = stablehlo.broadcast_in_dim %3718, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3721 = stablehlo.broadcast_in_dim %3719, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3722 = stablehlo.multiply %3720, %3721 : tensor<1x256x1280xf32>
-    %3723 = stablehlo.convert %arg78 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3724 = stablehlo.broadcast_in_dim %3722, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3725 = stablehlo.broadcast_in_dim %3723, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3726 = stablehlo.add %3724, %3725 : tensor<1x256x1280xf32>
-    %3727 = stablehlo.convert %3726 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3728 = stablehlo.reshape %3727 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3729 = stablehlo.convert %3728 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3730 = stablehlo.dot_general %3729, %arg341, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3731 = stablehlo.broadcast_in_dim %3730, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3732 = stablehlo.multiply %3731, %127 : tensor<256x1280xf32>
-    %3733 = stablehlo.broadcast_in_dim %3732, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3734 = stablehlo.broadcast_in_dim %arg342, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3735 = stablehlo.add %3733, %3734 : tensor<256x1280xf32>
-    %3736 = stablehlo.convert %3735 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3737 = stablehlo.reshape %3736 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3738 = stablehlo.multiply %3737, %cst_4 : tensor<1x256x1280xbf16>
-    %3739 = stablehlo.multiply %3737, %190 : tensor<1x256x1280xbf16>
-    %3740 = stablehlo.convert %3739 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3741 = stablehlo.clamp %cst_5, %3740, %cst_6 : tensor<1x256x1280xf32>
-    %3742 = stablehlo.multiply %3741, %3741 : tensor<1x256x1280xf32>
-    %3743 = stablehlo.multiply %cst_7, %3742 : tensor<1x256x1280xf32>
-    %3744 = stablehlo.add %3743, %cst_8 : tensor<1x256x1280xf32>
-    %3745 = stablehlo.multiply %3744, %3742 : tensor<1x256x1280xf32>
-    %3746 = stablehlo.add %3745, %cst_9 : tensor<1x256x1280xf32>
-    %3747 = stablehlo.multiply %3746, %3742 : tensor<1x256x1280xf32>
-    %3748 = stablehlo.add %3747, %cst_10 : tensor<1x256x1280xf32>
-    %3749 = stablehlo.multiply %3748, %3742 : tensor<1x256x1280xf32>
-    %3750 = stablehlo.add %3749, %cst_11 : tensor<1x256x1280xf32>
-    %3751 = stablehlo.multiply %3750, %3742 : tensor<1x256x1280xf32>
-    %3752 = stablehlo.add %3751, %cst_12 : tensor<1x256x1280xf32>
-    %3753 = stablehlo.multiply %3752, %3742 : tensor<1x256x1280xf32>
-    %3754 = stablehlo.add %3753, %cst_13 : tensor<1x256x1280xf32>
-    %3755 = stablehlo.multiply %cst_14, %3742 : tensor<1x256x1280xf32>
-    %3756 = stablehlo.add %3755, %cst_15 : tensor<1x256x1280xf32>
-    %3757 = stablehlo.multiply %3756, %3742 : tensor<1x256x1280xf32>
-    %3758 = stablehlo.add %3757, %cst_16 : tensor<1x256x1280xf32>
-    %3759 = stablehlo.multiply %3758, %3742 : tensor<1x256x1280xf32>
-    %3760 = stablehlo.add %3759, %cst_17 : tensor<1x256x1280xf32>
-    %3761 = stablehlo.multiply %3760, %3742 : tensor<1x256x1280xf32>
-    %3762 = stablehlo.add %3761, %cst_18 : tensor<1x256x1280xf32>
-    %3763 = stablehlo.multiply %3741, %3754 : tensor<1x256x1280xf32>
-    %3764 = stablehlo.divide %3763, %3762 : tensor<1x256x1280xf32>
-    %3765 = stablehlo.clamp %cst_19, %3764, %cst_20 : tensor<1x256x1280xf32>
-    %3766 = stablehlo.convert %3765 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3767 = stablehlo.add %3766, %cst_2 : tensor<1x256x1280xbf16>
-    %3768 = stablehlo.multiply %3767, %3738 : tensor<1x256x1280xbf16>
-    %3769 = stablehlo.reshape %3768 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3770 = stablehlo.convert %3769 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3771 = stablehlo.dot_general %3770, %arg343, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3772 = stablehlo.broadcast_in_dim %3771, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3773 = stablehlo.multiply %3772, %127 : tensor<256x1280xf32>
-    %3774 = stablehlo.broadcast_in_dim %3773, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3775 = stablehlo.broadcast_in_dim %arg344, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3776 = stablehlo.add %3774, %3775 : tensor<256x1280xf32>
-    %3777 = stablehlo.convert %3776 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3778 = stablehlo.reshape %3777 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3779 = stablehlo.add %3778, %3690 : tensor<1x256x1280xbf16>
-    %3780 = stablehlo.convert %3779 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3781 = stablehlo.convert %3780 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3782 = stablehlo.reduce(%3781 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3783 = stablehlo.reshape %3782 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3784 = stablehlo.broadcast_in_dim %3783, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3785 = stablehlo.divide %3784, %142 : tensor<1x256x1xf64>
-    %3786 = stablehlo.broadcast_in_dim %3781, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3787 = stablehlo.broadcast_in_dim %3785, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3788 = stablehlo.subtract %3786, %3787 : tensor<1x256x1280xf64>
-    %3789 = stablehlo.multiply %3788, %3788 : tensor<1x256x1280xf64>
-    %3790 = stablehlo.reduce(%3789 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3791 = stablehlo.reshape %3790 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3792 = stablehlo.broadcast_in_dim %3791, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3793 = stablehlo.divide %3792, %142 : tensor<1x256x1xf64>
-    %3794 = stablehlo.convert %3793 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3795 = stablehlo.reduce(%3780 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3796 = stablehlo.reshape %3795 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3797 = stablehlo.broadcast_in_dim %3796, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3798 = stablehlo.divide %3797, %158 : tensor<1x256x1xf32>
-    %3799 = stablehlo.broadcast_in_dim %3794, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3800 = stablehlo.add %3799, %161 : tensor<1x256x1xf32>
-    %3801 = stablehlo.rsqrt %3800 : tensor<1x256x1xf32>
-    %3802 = stablehlo.broadcast_in_dim %3780, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3803 = stablehlo.broadcast_in_dim %3798, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3804 = stablehlo.subtract %3802, %3803 : tensor<1x256x1280xf32>
-    %3805 = stablehlo.broadcast_in_dim %3804, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3806 = stablehlo.broadcast_in_dim %3801, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3807 = stablehlo.multiply %3805, %3806 : tensor<1x256x1280xf32>
-    %3808 = stablehlo.convert %arg79 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3809 = stablehlo.broadcast_in_dim %3807, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3810 = stablehlo.broadcast_in_dim %3808, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3811 = stablehlo.multiply %3809, %3810 : tensor<1x256x1280xf32>
-    %3812 = stablehlo.convert %arg80 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3813 = stablehlo.broadcast_in_dim %3811, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3814 = stablehlo.broadcast_in_dim %3812, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3815 = stablehlo.add %3813, %3814 : tensor<1x256x1280xf32>
-    %3816 = stablehlo.convert %3815 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3817 = stablehlo.reshape %3816 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3818 = stablehlo.convert %3817 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3819 = stablehlo.dot_general %3818, %arg345, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3820 = stablehlo.broadcast_in_dim %3819, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3821 = stablehlo.multiply %3820, %273 : tensor<256x256xf32>
-    %3822 = stablehlo.broadcast_in_dim %3821, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3823 = stablehlo.broadcast_in_dim %arg346, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3824 = stablehlo.add %3822, %3823 : tensor<256x256xf32>
-    %3825 = stablehlo.convert %3824 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3826 = stablehlo.reshape %3825 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3827 = stablehlo.dot_general %3818, %arg347, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %3828 = stablehlo.broadcast_in_dim %3827, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3829 = stablehlo.multiply %3828, %273 : tensor<256x256xf32>
-    %3830 = stablehlo.broadcast_in_dim %3829, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %3831 = stablehlo.broadcast_in_dim %arg348, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %3832 = stablehlo.add %3830, %3831 : tensor<256x256xf32>
-    %3833 = stablehlo.convert %3832 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %3834 = stablehlo.reshape %3833 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %3835 = stablehlo.dot_general %3818, %arg349, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3836 = stablehlo.broadcast_in_dim %3835, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3837 = stablehlo.multiply %3836, %127 : tensor<256x1280xf32>
-    %3838 = stablehlo.broadcast_in_dim %3837, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3839 = stablehlo.broadcast_in_dim %arg350, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3840 = stablehlo.add %3838, %3839 : tensor<256x1280xf32>
-    %3841 = stablehlo.convert %3840 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3842 = stablehlo.reshape %3841 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3843 = stablehlo.reshape %3826 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3844 = stablehlo.transpose %3843, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3845 = stablehlo.reshape %3834 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %3846 = stablehlo.transpose %3845, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %3847 = stablehlo.reshape %3842 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %3848 = stablehlo.transpose %3847, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3849 = stablehlo.transpose %3846, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %3850 = stablehlo.reshape %3844 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %3851 = stablehlo.reshape %3849 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3852 = stablehlo.broadcast_in_dim %3851, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %3853 = stablehlo.dot_general %3850, %3852, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %3854 = stablehlo.reshape %3853 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3855 = stablehlo.broadcast_in_dim %3854, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %3856 = stablehlo.divide %3855, %309 : tensor<1x8x256x256xbf16>
-    %3857 = stablehlo.convert %3856 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %3858 = stablehlo.reduce(%3857 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3859 = stablehlo.reshape %3858 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3860 = stablehlo.broadcast_in_dim %3857, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3861 = stablehlo.broadcast_in_dim %3859, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3862 = stablehlo.subtract %3860, %3861 : tensor<1x8x256x256xf32>
-    %3863 = stablehlo.exponential %3862 : tensor<1x8x256x256xf32>
-    %3864 = stablehlo.reduce(%3863 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %3865 = stablehlo.reshape %3864 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %3866 = stablehlo.broadcast_in_dim %3863, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %3867 = stablehlo.broadcast_in_dim %3865, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %3868 = stablehlo.divide %3866, %3867 : tensor<1x8x256x256xf32>
-    %3869 = stablehlo.convert %3868 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %3870 = stablehlo.reshape %3869 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %3871 = stablehlo.reshape %3848 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3872 = stablehlo.broadcast_in_dim %3871, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3873 = stablehlo.dot_general %3870, %3872, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %3874 = stablehlo.reshape %3873 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %3875 = stablehlo.transpose %3874, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %3876 = stablehlo.reshape %3875 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %3877 = stablehlo.reshape %3876 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3878 = stablehlo.convert %3877 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3879 = stablehlo.dot_general %3878, %arg351, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3880 = stablehlo.broadcast_in_dim %3879, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3881 = stablehlo.multiply %3880, %127 : tensor<256x1280xf32>
-    %3882 = stablehlo.broadcast_in_dim %3881, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3883 = stablehlo.broadcast_in_dim %arg352, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3884 = stablehlo.add %3882, %3883 : tensor<256x1280xf32>
-    %3885 = stablehlo.convert %3884 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3886 = stablehlo.reshape %3885 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3887 = stablehlo.add %3886, %3779 : tensor<1x256x1280xbf16>
-    %3888 = stablehlo.convert %3887 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3889 = stablehlo.convert %3888 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3890 = stablehlo.reduce(%3889 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3891 = stablehlo.reshape %3890 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3892 = stablehlo.broadcast_in_dim %3891, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3893 = stablehlo.divide %3892, %142 : tensor<1x256x1xf64>
-    %3894 = stablehlo.broadcast_in_dim %3889, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3895 = stablehlo.broadcast_in_dim %3893, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3896 = stablehlo.subtract %3894, %3895 : tensor<1x256x1280xf64>
-    %3897 = stablehlo.multiply %3896, %3896 : tensor<1x256x1280xf64>
-    %3898 = stablehlo.reduce(%3897 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3899 = stablehlo.reshape %3898 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3900 = stablehlo.broadcast_in_dim %3899, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3901 = stablehlo.divide %3900, %142 : tensor<1x256x1xf64>
-    %3902 = stablehlo.convert %3901 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3903 = stablehlo.reduce(%3888 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3904 = stablehlo.reshape %3903 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3905 = stablehlo.broadcast_in_dim %3904, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3906 = stablehlo.divide %3905, %158 : tensor<1x256x1xf32>
-    %3907 = stablehlo.broadcast_in_dim %3902, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3908 = stablehlo.add %3907, %161 : tensor<1x256x1xf32>
-    %3909 = stablehlo.rsqrt %3908 : tensor<1x256x1xf32>
-    %3910 = stablehlo.broadcast_in_dim %3888, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3911 = stablehlo.broadcast_in_dim %3906, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3912 = stablehlo.subtract %3910, %3911 : tensor<1x256x1280xf32>
-    %3913 = stablehlo.broadcast_in_dim %3912, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3914 = stablehlo.broadcast_in_dim %3909, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %3915 = stablehlo.multiply %3913, %3914 : tensor<1x256x1280xf32>
-    %3916 = stablehlo.convert %arg81 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3917 = stablehlo.broadcast_in_dim %3915, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3918 = stablehlo.broadcast_in_dim %3916, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3919 = stablehlo.multiply %3917, %3918 : tensor<1x256x1280xf32>
-    %3920 = stablehlo.convert %arg82 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %3921 = stablehlo.broadcast_in_dim %3919, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %3922 = stablehlo.broadcast_in_dim %3920, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %3923 = stablehlo.add %3921, %3922 : tensor<1x256x1280xf32>
-    %3924 = stablehlo.convert %3923 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3925 = stablehlo.reshape %3924 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3926 = stablehlo.convert %3925 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3927 = stablehlo.dot_general %3926, %arg353, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3928 = stablehlo.broadcast_in_dim %3927, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3929 = stablehlo.multiply %3928, %127 : tensor<256x1280xf32>
-    %3930 = stablehlo.broadcast_in_dim %3929, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3931 = stablehlo.broadcast_in_dim %arg354, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3932 = stablehlo.add %3930, %3931 : tensor<256x1280xf32>
-    %3933 = stablehlo.convert %3932 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3934 = stablehlo.reshape %3933 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3935 = stablehlo.multiply %3934, %cst_4 : tensor<1x256x1280xbf16>
-    %3936 = stablehlo.multiply %3934, %190 : tensor<1x256x1280xbf16>
-    %3937 = stablehlo.convert %3936 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3938 = stablehlo.clamp %cst_5, %3937, %cst_6 : tensor<1x256x1280xf32>
-    %3939 = stablehlo.multiply %3938, %3938 : tensor<1x256x1280xf32>
-    %3940 = stablehlo.multiply %cst_7, %3939 : tensor<1x256x1280xf32>
-    %3941 = stablehlo.add %3940, %cst_8 : tensor<1x256x1280xf32>
-    %3942 = stablehlo.multiply %3941, %3939 : tensor<1x256x1280xf32>
-    %3943 = stablehlo.add %3942, %cst_9 : tensor<1x256x1280xf32>
-    %3944 = stablehlo.multiply %3943, %3939 : tensor<1x256x1280xf32>
-    %3945 = stablehlo.add %3944, %cst_10 : tensor<1x256x1280xf32>
-    %3946 = stablehlo.multiply %3945, %3939 : tensor<1x256x1280xf32>
-    %3947 = stablehlo.add %3946, %cst_11 : tensor<1x256x1280xf32>
-    %3948 = stablehlo.multiply %3947, %3939 : tensor<1x256x1280xf32>
-    %3949 = stablehlo.add %3948, %cst_12 : tensor<1x256x1280xf32>
-    %3950 = stablehlo.multiply %3949, %3939 : tensor<1x256x1280xf32>
-    %3951 = stablehlo.add %3950, %cst_13 : tensor<1x256x1280xf32>
-    %3952 = stablehlo.multiply %cst_14, %3939 : tensor<1x256x1280xf32>
-    %3953 = stablehlo.add %3952, %cst_15 : tensor<1x256x1280xf32>
-    %3954 = stablehlo.multiply %3953, %3939 : tensor<1x256x1280xf32>
-    %3955 = stablehlo.add %3954, %cst_16 : tensor<1x256x1280xf32>
-    %3956 = stablehlo.multiply %3955, %3939 : tensor<1x256x1280xf32>
-    %3957 = stablehlo.add %3956, %cst_17 : tensor<1x256x1280xf32>
-    %3958 = stablehlo.multiply %3957, %3939 : tensor<1x256x1280xf32>
-    %3959 = stablehlo.add %3958, %cst_18 : tensor<1x256x1280xf32>
-    %3960 = stablehlo.multiply %3938, %3951 : tensor<1x256x1280xf32>
-    %3961 = stablehlo.divide %3960, %3959 : tensor<1x256x1280xf32>
-    %3962 = stablehlo.clamp %cst_19, %3961, %cst_20 : tensor<1x256x1280xf32>
-    %3963 = stablehlo.convert %3962 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %3964 = stablehlo.add %3963, %cst_2 : tensor<1x256x1280xbf16>
-    %3965 = stablehlo.multiply %3964, %3935 : tensor<1x256x1280xbf16>
-    %3966 = stablehlo.reshape %3965 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %3967 = stablehlo.convert %3966 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %3968 = stablehlo.dot_general %3967, %arg355, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %3969 = stablehlo.broadcast_in_dim %3968, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3970 = stablehlo.multiply %3969, %127 : tensor<256x1280xf32>
-    %3971 = stablehlo.broadcast_in_dim %3970, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %3972 = stablehlo.broadcast_in_dim %arg356, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %3973 = stablehlo.add %3971, %3972 : tensor<256x1280xf32>
-    %3974 = stablehlo.convert %3973 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %3975 = stablehlo.reshape %3974 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %3976 = stablehlo.add %3975, %3887 : tensor<1x256x1280xbf16>
-    %3977 = stablehlo.convert %3976 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %3978 = stablehlo.convert %3977 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %3979 = stablehlo.reduce(%3978 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3980 = stablehlo.reshape %3979 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3981 = stablehlo.broadcast_in_dim %3980, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3982 = stablehlo.divide %3981, %142 : tensor<1x256x1xf64>
-    %3983 = stablehlo.broadcast_in_dim %3978, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %3984 = stablehlo.broadcast_in_dim %3982, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %3985 = stablehlo.subtract %3983, %3984 : tensor<1x256x1280xf64>
-    %3986 = stablehlo.multiply %3985, %3985 : tensor<1x256x1280xf64>
-    %3987 = stablehlo.reduce(%3986 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %3988 = stablehlo.reshape %3987 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %3989 = stablehlo.broadcast_in_dim %3988, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %3990 = stablehlo.divide %3989, %142 : tensor<1x256x1xf64>
-    %3991 = stablehlo.convert %3990 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %3992 = stablehlo.reduce(%3977 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %3993 = stablehlo.reshape %3992 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %3994 = stablehlo.broadcast_in_dim %3993, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3995 = stablehlo.divide %3994, %158 : tensor<1x256x1xf32>
-    %3996 = stablehlo.broadcast_in_dim %3991, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %3997 = stablehlo.add %3996, %161 : tensor<1x256x1xf32>
-    %3998 = stablehlo.rsqrt %3997 : tensor<1x256x1xf32>
-    %3999 = stablehlo.broadcast_in_dim %3977, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4000 = stablehlo.broadcast_in_dim %3995, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4001 = stablehlo.subtract %3999, %4000 : tensor<1x256x1280xf32>
-    %4002 = stablehlo.broadcast_in_dim %4001, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4003 = stablehlo.broadcast_in_dim %3998, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4004 = stablehlo.multiply %4002, %4003 : tensor<1x256x1280xf32>
-    %4005 = stablehlo.convert %arg83 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4006 = stablehlo.broadcast_in_dim %4004, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4007 = stablehlo.broadcast_in_dim %4005, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4008 = stablehlo.multiply %4006, %4007 : tensor<1x256x1280xf32>
-    %4009 = stablehlo.convert %arg84 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4010 = stablehlo.broadcast_in_dim %4008, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4011 = stablehlo.broadcast_in_dim %4009, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4012 = stablehlo.add %4010, %4011 : tensor<1x256x1280xf32>
-    %4013 = stablehlo.convert %4012 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4014 = stablehlo.reshape %4013 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4015 = stablehlo.convert %4014 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4016 = stablehlo.dot_general %4015, %arg357, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4017 = stablehlo.broadcast_in_dim %4016, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4018 = stablehlo.multiply %4017, %273 : tensor<256x256xf32>
-    %4019 = stablehlo.broadcast_in_dim %4018, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4020 = stablehlo.broadcast_in_dim %arg358, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4021 = stablehlo.add %4019, %4020 : tensor<256x256xf32>
-    %4022 = stablehlo.convert %4021 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4023 = stablehlo.reshape %4022 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4024 = stablehlo.dot_general %4015, %arg359, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4025 = stablehlo.broadcast_in_dim %4024, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4026 = stablehlo.multiply %4025, %273 : tensor<256x256xf32>
-    %4027 = stablehlo.broadcast_in_dim %4026, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4028 = stablehlo.broadcast_in_dim %arg360, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4029 = stablehlo.add %4027, %4028 : tensor<256x256xf32>
-    %4030 = stablehlo.convert %4029 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4031 = stablehlo.reshape %4030 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4032 = stablehlo.dot_general %4015, %arg361, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4033 = stablehlo.broadcast_in_dim %4032, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4034 = stablehlo.multiply %4033, %127 : tensor<256x1280xf32>
-    %4035 = stablehlo.broadcast_in_dim %4034, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4036 = stablehlo.broadcast_in_dim %arg362, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4037 = stablehlo.add %4035, %4036 : tensor<256x1280xf32>
-    %4038 = stablehlo.convert %4037 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4039 = stablehlo.reshape %4038 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4040 = stablehlo.reshape %4023 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4041 = stablehlo.transpose %4040, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4042 = stablehlo.reshape %4031 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4043 = stablehlo.transpose %4042, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4044 = stablehlo.reshape %4039 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %4045 = stablehlo.transpose %4044, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4046 = stablehlo.transpose %4043, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %4047 = stablehlo.reshape %4041 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %4048 = stablehlo.reshape %4046 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4049 = stablehlo.broadcast_in_dim %4048, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4050 = stablehlo.dot_general %4047, %4049, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %4051 = stablehlo.reshape %4050 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4052 = stablehlo.broadcast_in_dim %4051, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4053 = stablehlo.divide %4052, %309 : tensor<1x8x256x256xbf16>
-    %4054 = stablehlo.convert %4053 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %4055 = stablehlo.reduce(%4054 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4056 = stablehlo.reshape %4055 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4057 = stablehlo.broadcast_in_dim %4054, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4058 = stablehlo.broadcast_in_dim %4056, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4059 = stablehlo.subtract %4057, %4058 : tensor<1x8x256x256xf32>
-    %4060 = stablehlo.exponential %4059 : tensor<1x8x256x256xf32>
-    %4061 = stablehlo.reduce(%4060 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4062 = stablehlo.reshape %4061 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4063 = stablehlo.broadcast_in_dim %4060, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4064 = stablehlo.broadcast_in_dim %4062, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4065 = stablehlo.divide %4063, %4064 : tensor<1x8x256x256xf32>
-    %4066 = stablehlo.convert %4065 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %4067 = stablehlo.reshape %4066 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %4068 = stablehlo.reshape %4045 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4069 = stablehlo.broadcast_in_dim %4068, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4070 = stablehlo.dot_general %4067, %4069, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4071 = stablehlo.reshape %4070 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4072 = stablehlo.transpose %4071, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %4073 = stablehlo.reshape %4072 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %4074 = stablehlo.reshape %4073 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4075 = stablehlo.convert %4074 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4076 = stablehlo.dot_general %4075, %arg363, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4077 = stablehlo.broadcast_in_dim %4076, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4078 = stablehlo.multiply %4077, %127 : tensor<256x1280xf32>
-    %4079 = stablehlo.broadcast_in_dim %4078, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4080 = stablehlo.broadcast_in_dim %arg364, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4081 = stablehlo.add %4079, %4080 : tensor<256x1280xf32>
-    %4082 = stablehlo.convert %4081 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4083 = stablehlo.reshape %4082 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4084 = stablehlo.add %4083, %3976 : tensor<1x256x1280xbf16>
-    %4085 = stablehlo.convert %4084 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4086 = stablehlo.convert %4085 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4087 = stablehlo.reduce(%4086 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4088 = stablehlo.reshape %4087 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4089 = stablehlo.broadcast_in_dim %4088, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4090 = stablehlo.divide %4089, %142 : tensor<1x256x1xf64>
-    %4091 = stablehlo.broadcast_in_dim %4086, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4092 = stablehlo.broadcast_in_dim %4090, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4093 = stablehlo.subtract %4091, %4092 : tensor<1x256x1280xf64>
-    %4094 = stablehlo.multiply %4093, %4093 : tensor<1x256x1280xf64>
-    %4095 = stablehlo.reduce(%4094 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4096 = stablehlo.reshape %4095 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4097 = stablehlo.broadcast_in_dim %4096, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4098 = stablehlo.divide %4097, %142 : tensor<1x256x1xf64>
-    %4099 = stablehlo.convert %4098 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4100 = stablehlo.reduce(%4085 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4101 = stablehlo.reshape %4100 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4102 = stablehlo.broadcast_in_dim %4101, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4103 = stablehlo.divide %4102, %158 : tensor<1x256x1xf32>
-    %4104 = stablehlo.broadcast_in_dim %4099, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4105 = stablehlo.add %4104, %161 : tensor<1x256x1xf32>
-    %4106 = stablehlo.rsqrt %4105 : tensor<1x256x1xf32>
-    %4107 = stablehlo.broadcast_in_dim %4085, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4108 = stablehlo.broadcast_in_dim %4103, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4109 = stablehlo.subtract %4107, %4108 : tensor<1x256x1280xf32>
-    %4110 = stablehlo.broadcast_in_dim %4109, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4111 = stablehlo.broadcast_in_dim %4106, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4112 = stablehlo.multiply %4110, %4111 : tensor<1x256x1280xf32>
-    %4113 = stablehlo.convert %arg85 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4114 = stablehlo.broadcast_in_dim %4112, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4115 = stablehlo.broadcast_in_dim %4113, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4116 = stablehlo.multiply %4114, %4115 : tensor<1x256x1280xf32>
-    %4117 = stablehlo.convert %arg86 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4118 = stablehlo.broadcast_in_dim %4116, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4119 = stablehlo.broadcast_in_dim %4117, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4120 = stablehlo.add %4118, %4119 : tensor<1x256x1280xf32>
-    %4121 = stablehlo.convert %4120 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4122 = stablehlo.reshape %4121 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4123 = stablehlo.convert %4122 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4124 = stablehlo.dot_general %4123, %arg365, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4125 = stablehlo.broadcast_in_dim %4124, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4126 = stablehlo.multiply %4125, %127 : tensor<256x1280xf32>
-    %4127 = stablehlo.broadcast_in_dim %4126, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4128 = stablehlo.broadcast_in_dim %arg366, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4129 = stablehlo.add %4127, %4128 : tensor<256x1280xf32>
-    %4130 = stablehlo.convert %4129 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4131 = stablehlo.reshape %4130 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4132 = stablehlo.multiply %4131, %cst_4 : tensor<1x256x1280xbf16>
-    %4133 = stablehlo.multiply %4131, %190 : tensor<1x256x1280xbf16>
-    %4134 = stablehlo.convert %4133 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4135 = stablehlo.clamp %cst_5, %4134, %cst_6 : tensor<1x256x1280xf32>
-    %4136 = stablehlo.multiply %4135, %4135 : tensor<1x256x1280xf32>
-    %4137 = stablehlo.multiply %cst_7, %4136 : tensor<1x256x1280xf32>
-    %4138 = stablehlo.add %4137, %cst_8 : tensor<1x256x1280xf32>
-    %4139 = stablehlo.multiply %4138, %4136 : tensor<1x256x1280xf32>
-    %4140 = stablehlo.add %4139, %cst_9 : tensor<1x256x1280xf32>
-    %4141 = stablehlo.multiply %4140, %4136 : tensor<1x256x1280xf32>
-    %4142 = stablehlo.add %4141, %cst_10 : tensor<1x256x1280xf32>
-    %4143 = stablehlo.multiply %4142, %4136 : tensor<1x256x1280xf32>
-    %4144 = stablehlo.add %4143, %cst_11 : tensor<1x256x1280xf32>
-    %4145 = stablehlo.multiply %4144, %4136 : tensor<1x256x1280xf32>
-    %4146 = stablehlo.add %4145, %cst_12 : tensor<1x256x1280xf32>
-    %4147 = stablehlo.multiply %4146, %4136 : tensor<1x256x1280xf32>
-    %4148 = stablehlo.add %4147, %cst_13 : tensor<1x256x1280xf32>
-    %4149 = stablehlo.multiply %cst_14, %4136 : tensor<1x256x1280xf32>
-    %4150 = stablehlo.add %4149, %cst_15 : tensor<1x256x1280xf32>
-    %4151 = stablehlo.multiply %4150, %4136 : tensor<1x256x1280xf32>
-    %4152 = stablehlo.add %4151, %cst_16 : tensor<1x256x1280xf32>
-    %4153 = stablehlo.multiply %4152, %4136 : tensor<1x256x1280xf32>
-    %4154 = stablehlo.add %4153, %cst_17 : tensor<1x256x1280xf32>
-    %4155 = stablehlo.multiply %4154, %4136 : tensor<1x256x1280xf32>
-    %4156 = stablehlo.add %4155, %cst_18 : tensor<1x256x1280xf32>
-    %4157 = stablehlo.multiply %4135, %4148 : tensor<1x256x1280xf32>
-    %4158 = stablehlo.divide %4157, %4156 : tensor<1x256x1280xf32>
-    %4159 = stablehlo.clamp %cst_19, %4158, %cst_20 : tensor<1x256x1280xf32>
-    %4160 = stablehlo.convert %4159 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4161 = stablehlo.add %4160, %cst_2 : tensor<1x256x1280xbf16>
-    %4162 = stablehlo.multiply %4161, %4132 : tensor<1x256x1280xbf16>
-    %4163 = stablehlo.reshape %4162 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4164 = stablehlo.convert %4163 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4165 = stablehlo.dot_general %4164, %arg367, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4166 = stablehlo.broadcast_in_dim %4165, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4167 = stablehlo.multiply %4166, %127 : tensor<256x1280xf32>
-    %4168 = stablehlo.broadcast_in_dim %4167, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4169 = stablehlo.broadcast_in_dim %arg368, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4170 = stablehlo.add %4168, %4169 : tensor<256x1280xf32>
-    %4171 = stablehlo.convert %4170 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4172 = stablehlo.reshape %4171 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4173 = stablehlo.add %4172, %4084 : tensor<1x256x1280xbf16>
-    %4174 = stablehlo.convert %4173 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4175 = stablehlo.convert %4174 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4176 = stablehlo.reduce(%4175 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4177 = stablehlo.reshape %4176 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4178 = stablehlo.broadcast_in_dim %4177, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4179 = stablehlo.divide %4178, %142 : tensor<1x256x1xf64>
-    %4180 = stablehlo.broadcast_in_dim %4175, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4181 = stablehlo.broadcast_in_dim %4179, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4182 = stablehlo.subtract %4180, %4181 : tensor<1x256x1280xf64>
-    %4183 = stablehlo.multiply %4182, %4182 : tensor<1x256x1280xf64>
-    %4184 = stablehlo.reduce(%4183 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4185 = stablehlo.reshape %4184 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4186 = stablehlo.broadcast_in_dim %4185, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4187 = stablehlo.divide %4186, %142 : tensor<1x256x1xf64>
-    %4188 = stablehlo.convert %4187 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4189 = stablehlo.reduce(%4174 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4190 = stablehlo.reshape %4189 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4191 = stablehlo.broadcast_in_dim %4190, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4192 = stablehlo.divide %4191, %158 : tensor<1x256x1xf32>
-    %4193 = stablehlo.broadcast_in_dim %4188, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4194 = stablehlo.add %4193, %161 : tensor<1x256x1xf32>
-    %4195 = stablehlo.rsqrt %4194 : tensor<1x256x1xf32>
-    %4196 = stablehlo.broadcast_in_dim %4174, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4197 = stablehlo.broadcast_in_dim %4192, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4198 = stablehlo.subtract %4196, %4197 : tensor<1x256x1280xf32>
-    %4199 = stablehlo.broadcast_in_dim %4198, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4200 = stablehlo.broadcast_in_dim %4195, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4201 = stablehlo.multiply %4199, %4200 : tensor<1x256x1280xf32>
-    %4202 = stablehlo.convert %arg87 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4203 = stablehlo.broadcast_in_dim %4201, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4204 = stablehlo.broadcast_in_dim %4202, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4205 = stablehlo.multiply %4203, %4204 : tensor<1x256x1280xf32>
-    %4206 = stablehlo.convert %arg88 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4207 = stablehlo.broadcast_in_dim %4205, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4208 = stablehlo.broadcast_in_dim %4206, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4209 = stablehlo.add %4207, %4208 : tensor<1x256x1280xf32>
-    %4210 = stablehlo.convert %4209 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4211 = stablehlo.reshape %4210 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4212 = stablehlo.convert %4211 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4213 = stablehlo.dot_general %4212, %arg369, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4214 = stablehlo.broadcast_in_dim %4213, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4215 = stablehlo.multiply %4214, %273 : tensor<256x256xf32>
-    %4216 = stablehlo.broadcast_in_dim %4215, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4217 = stablehlo.broadcast_in_dim %arg370, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4218 = stablehlo.add %4216, %4217 : tensor<256x256xf32>
-    %4219 = stablehlo.convert %4218 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4220 = stablehlo.reshape %4219 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4221 = stablehlo.dot_general %4212, %arg371, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4222 = stablehlo.broadcast_in_dim %4221, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4223 = stablehlo.multiply %4222, %273 : tensor<256x256xf32>
-    %4224 = stablehlo.broadcast_in_dim %4223, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4225 = stablehlo.broadcast_in_dim %arg372, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4226 = stablehlo.add %4224, %4225 : tensor<256x256xf32>
-    %4227 = stablehlo.convert %4226 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4228 = stablehlo.reshape %4227 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4229 = stablehlo.dot_general %4212, %arg373, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4230 = stablehlo.broadcast_in_dim %4229, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4231 = stablehlo.multiply %4230, %127 : tensor<256x1280xf32>
-    %4232 = stablehlo.broadcast_in_dim %4231, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4233 = stablehlo.broadcast_in_dim %arg374, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4234 = stablehlo.add %4232, %4233 : tensor<256x1280xf32>
-    %4235 = stablehlo.convert %4234 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4236 = stablehlo.reshape %4235 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4237 = stablehlo.reshape %4220 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4238 = stablehlo.transpose %4237, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4239 = stablehlo.reshape %4228 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4240 = stablehlo.transpose %4239, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4241 = stablehlo.reshape %4236 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %4242 = stablehlo.transpose %4241, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4243 = stablehlo.transpose %4240, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %4244 = stablehlo.reshape %4238 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %4245 = stablehlo.reshape %4243 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4246 = stablehlo.broadcast_in_dim %4245, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4247 = stablehlo.dot_general %4244, %4246, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %4248 = stablehlo.reshape %4247 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4249 = stablehlo.broadcast_in_dim %4248, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4250 = stablehlo.divide %4249, %309 : tensor<1x8x256x256xbf16>
-    %4251 = stablehlo.convert %4250 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %4252 = stablehlo.reduce(%4251 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4253 = stablehlo.reshape %4252 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4254 = stablehlo.broadcast_in_dim %4251, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4255 = stablehlo.broadcast_in_dim %4253, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4256 = stablehlo.subtract %4254, %4255 : tensor<1x8x256x256xf32>
-    %4257 = stablehlo.exponential %4256 : tensor<1x8x256x256xf32>
-    %4258 = stablehlo.reduce(%4257 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4259 = stablehlo.reshape %4258 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4260 = stablehlo.broadcast_in_dim %4257, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4261 = stablehlo.broadcast_in_dim %4259, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4262 = stablehlo.divide %4260, %4261 : tensor<1x8x256x256xf32>
-    %4263 = stablehlo.convert %4262 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %4264 = stablehlo.reshape %4263 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %4265 = stablehlo.reshape %4242 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4266 = stablehlo.broadcast_in_dim %4265, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4267 = stablehlo.dot_general %4264, %4266, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4268 = stablehlo.reshape %4267 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4269 = stablehlo.transpose %4268, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %4270 = stablehlo.reshape %4269 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %4271 = stablehlo.reshape %4270 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4272 = stablehlo.convert %4271 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4273 = stablehlo.dot_general %4272, %arg375, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4274 = stablehlo.broadcast_in_dim %4273, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4275 = stablehlo.multiply %4274, %127 : tensor<256x1280xf32>
-    %4276 = stablehlo.broadcast_in_dim %4275, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4277 = stablehlo.broadcast_in_dim %arg376, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4278 = stablehlo.add %4276, %4277 : tensor<256x1280xf32>
-    %4279 = stablehlo.convert %4278 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4280 = stablehlo.reshape %4279 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4281 = stablehlo.add %4280, %4173 : tensor<1x256x1280xbf16>
-    %4282 = stablehlo.convert %4281 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4283 = stablehlo.convert %4282 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4284 = stablehlo.reduce(%4283 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4285 = stablehlo.reshape %4284 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4286 = stablehlo.broadcast_in_dim %4285, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4287 = stablehlo.divide %4286, %142 : tensor<1x256x1xf64>
-    %4288 = stablehlo.broadcast_in_dim %4283, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4289 = stablehlo.broadcast_in_dim %4287, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4290 = stablehlo.subtract %4288, %4289 : tensor<1x256x1280xf64>
-    %4291 = stablehlo.multiply %4290, %4290 : tensor<1x256x1280xf64>
-    %4292 = stablehlo.reduce(%4291 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4293 = stablehlo.reshape %4292 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4294 = stablehlo.broadcast_in_dim %4293, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4295 = stablehlo.divide %4294, %142 : tensor<1x256x1xf64>
-    %4296 = stablehlo.convert %4295 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4297 = stablehlo.reduce(%4282 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4298 = stablehlo.reshape %4297 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4299 = stablehlo.broadcast_in_dim %4298, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4300 = stablehlo.divide %4299, %158 : tensor<1x256x1xf32>
-    %4301 = stablehlo.broadcast_in_dim %4296, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4302 = stablehlo.add %4301, %161 : tensor<1x256x1xf32>
-    %4303 = stablehlo.rsqrt %4302 : tensor<1x256x1xf32>
-    %4304 = stablehlo.broadcast_in_dim %4282, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4305 = stablehlo.broadcast_in_dim %4300, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4306 = stablehlo.subtract %4304, %4305 : tensor<1x256x1280xf32>
-    %4307 = stablehlo.broadcast_in_dim %4306, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4308 = stablehlo.broadcast_in_dim %4303, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4309 = stablehlo.multiply %4307, %4308 : tensor<1x256x1280xf32>
-    %4310 = stablehlo.convert %arg89 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4311 = stablehlo.broadcast_in_dim %4309, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4312 = stablehlo.broadcast_in_dim %4310, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4313 = stablehlo.multiply %4311, %4312 : tensor<1x256x1280xf32>
-    %4314 = stablehlo.convert %arg90 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4315 = stablehlo.broadcast_in_dim %4313, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4316 = stablehlo.broadcast_in_dim %4314, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4317 = stablehlo.add %4315, %4316 : tensor<1x256x1280xf32>
-    %4318 = stablehlo.convert %4317 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4319 = stablehlo.reshape %4318 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4320 = stablehlo.convert %4319 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4321 = stablehlo.dot_general %4320, %arg377, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4322 = stablehlo.broadcast_in_dim %4321, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4323 = stablehlo.multiply %4322, %127 : tensor<256x1280xf32>
-    %4324 = stablehlo.broadcast_in_dim %4323, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4325 = stablehlo.broadcast_in_dim %arg378, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4326 = stablehlo.add %4324, %4325 : tensor<256x1280xf32>
-    %4327 = stablehlo.convert %4326 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4328 = stablehlo.reshape %4327 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4329 = stablehlo.multiply %4328, %cst_4 : tensor<1x256x1280xbf16>
-    %4330 = stablehlo.multiply %4328, %190 : tensor<1x256x1280xbf16>
-    %4331 = stablehlo.convert %4330 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4332 = stablehlo.clamp %cst_5, %4331, %cst_6 : tensor<1x256x1280xf32>
-    %4333 = stablehlo.multiply %4332, %4332 : tensor<1x256x1280xf32>
-    %4334 = stablehlo.multiply %cst_7, %4333 : tensor<1x256x1280xf32>
-    %4335 = stablehlo.add %4334, %cst_8 : tensor<1x256x1280xf32>
-    %4336 = stablehlo.multiply %4335, %4333 : tensor<1x256x1280xf32>
-    %4337 = stablehlo.add %4336, %cst_9 : tensor<1x256x1280xf32>
-    %4338 = stablehlo.multiply %4337, %4333 : tensor<1x256x1280xf32>
-    %4339 = stablehlo.add %4338, %cst_10 : tensor<1x256x1280xf32>
-    %4340 = stablehlo.multiply %4339, %4333 : tensor<1x256x1280xf32>
-    %4341 = stablehlo.add %4340, %cst_11 : tensor<1x256x1280xf32>
-    %4342 = stablehlo.multiply %4341, %4333 : tensor<1x256x1280xf32>
-    %4343 = stablehlo.add %4342, %cst_12 : tensor<1x256x1280xf32>
-    %4344 = stablehlo.multiply %4343, %4333 : tensor<1x256x1280xf32>
-    %4345 = stablehlo.add %4344, %cst_13 : tensor<1x256x1280xf32>
-    %4346 = stablehlo.multiply %cst_14, %4333 : tensor<1x256x1280xf32>
-    %4347 = stablehlo.add %4346, %cst_15 : tensor<1x256x1280xf32>
-    %4348 = stablehlo.multiply %4347, %4333 : tensor<1x256x1280xf32>
-    %4349 = stablehlo.add %4348, %cst_16 : tensor<1x256x1280xf32>
-    %4350 = stablehlo.multiply %4349, %4333 : tensor<1x256x1280xf32>
-    %4351 = stablehlo.add %4350, %cst_17 : tensor<1x256x1280xf32>
-    %4352 = stablehlo.multiply %4351, %4333 : tensor<1x256x1280xf32>
-    %4353 = stablehlo.add %4352, %cst_18 : tensor<1x256x1280xf32>
-    %4354 = stablehlo.multiply %4332, %4345 : tensor<1x256x1280xf32>
-    %4355 = stablehlo.divide %4354, %4353 : tensor<1x256x1280xf32>
-    %4356 = stablehlo.clamp %cst_19, %4355, %cst_20 : tensor<1x256x1280xf32>
-    %4357 = stablehlo.convert %4356 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4358 = stablehlo.add %4357, %cst_2 : tensor<1x256x1280xbf16>
-    %4359 = stablehlo.multiply %4358, %4329 : tensor<1x256x1280xbf16>
-    %4360 = stablehlo.reshape %4359 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4361 = stablehlo.convert %4360 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4362 = stablehlo.dot_general %4361, %arg379, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4363 = stablehlo.broadcast_in_dim %4362, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4364 = stablehlo.multiply %4363, %127 : tensor<256x1280xf32>
-    %4365 = stablehlo.broadcast_in_dim %4364, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4366 = stablehlo.broadcast_in_dim %arg380, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4367 = stablehlo.add %4365, %4366 : tensor<256x1280xf32>
-    %4368 = stablehlo.convert %4367 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4369 = stablehlo.reshape %4368 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4370 = stablehlo.add %4369, %4281 : tensor<1x256x1280xbf16>
-    %4371 = stablehlo.convert %4370 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4372 = stablehlo.convert %4371 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4373 = stablehlo.reduce(%4372 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4374 = stablehlo.reshape %4373 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4375 = stablehlo.broadcast_in_dim %4374, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4376 = stablehlo.divide %4375, %142 : tensor<1x256x1xf64>
-    %4377 = stablehlo.broadcast_in_dim %4372, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4378 = stablehlo.broadcast_in_dim %4376, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4379 = stablehlo.subtract %4377, %4378 : tensor<1x256x1280xf64>
-    %4380 = stablehlo.multiply %4379, %4379 : tensor<1x256x1280xf64>
-    %4381 = stablehlo.reduce(%4380 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4382 = stablehlo.reshape %4381 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4383 = stablehlo.broadcast_in_dim %4382, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4384 = stablehlo.divide %4383, %142 : tensor<1x256x1xf64>
-    %4385 = stablehlo.convert %4384 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4386 = stablehlo.reduce(%4371 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4387 = stablehlo.reshape %4386 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4388 = stablehlo.broadcast_in_dim %4387, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4389 = stablehlo.divide %4388, %158 : tensor<1x256x1xf32>
-    %4390 = stablehlo.broadcast_in_dim %4385, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4391 = stablehlo.add %4390, %161 : tensor<1x256x1xf32>
-    %4392 = stablehlo.rsqrt %4391 : tensor<1x256x1xf32>
-    %4393 = stablehlo.broadcast_in_dim %4371, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4394 = stablehlo.broadcast_in_dim %4389, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4395 = stablehlo.subtract %4393, %4394 : tensor<1x256x1280xf32>
-    %4396 = stablehlo.broadcast_in_dim %4395, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4397 = stablehlo.broadcast_in_dim %4392, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4398 = stablehlo.multiply %4396, %4397 : tensor<1x256x1280xf32>
-    %4399 = stablehlo.convert %arg91 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4400 = stablehlo.broadcast_in_dim %4398, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4401 = stablehlo.broadcast_in_dim %4399, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4402 = stablehlo.multiply %4400, %4401 : tensor<1x256x1280xf32>
-    %4403 = stablehlo.convert %arg92 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4404 = stablehlo.broadcast_in_dim %4402, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4405 = stablehlo.broadcast_in_dim %4403, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4406 = stablehlo.add %4404, %4405 : tensor<1x256x1280xf32>
-    %4407 = stablehlo.convert %4406 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4408 = stablehlo.reshape %4407 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4409 = stablehlo.convert %4408 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4410 = stablehlo.dot_general %4409, %arg381, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4411 = stablehlo.broadcast_in_dim %4410, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4412 = stablehlo.multiply %4411, %273 : tensor<256x256xf32>
-    %4413 = stablehlo.broadcast_in_dim %4412, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4414 = stablehlo.broadcast_in_dim %arg382, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4415 = stablehlo.add %4413, %4414 : tensor<256x256xf32>
-    %4416 = stablehlo.convert %4415 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4417 = stablehlo.reshape %4416 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4418 = stablehlo.dot_general %4409, %arg383, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4419 = stablehlo.broadcast_in_dim %4418, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4420 = stablehlo.multiply %4419, %273 : tensor<256x256xf32>
-    %4421 = stablehlo.broadcast_in_dim %4420, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4422 = stablehlo.broadcast_in_dim %arg384, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4423 = stablehlo.add %4421, %4422 : tensor<256x256xf32>
-    %4424 = stablehlo.convert %4423 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4425 = stablehlo.reshape %4424 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4426 = stablehlo.dot_general %4409, %arg385, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4427 = stablehlo.broadcast_in_dim %4426, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4428 = stablehlo.multiply %4427, %127 : tensor<256x1280xf32>
-    %4429 = stablehlo.broadcast_in_dim %4428, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4430 = stablehlo.broadcast_in_dim %arg386, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4431 = stablehlo.add %4429, %4430 : tensor<256x1280xf32>
-    %4432 = stablehlo.convert %4431 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4433 = stablehlo.reshape %4432 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4434 = stablehlo.reshape %4417 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4435 = stablehlo.transpose %4434, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4436 = stablehlo.reshape %4425 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4437 = stablehlo.transpose %4436, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4438 = stablehlo.reshape %4433 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %4439 = stablehlo.transpose %4438, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4440 = stablehlo.transpose %4437, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %4441 = stablehlo.reshape %4435 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %4442 = stablehlo.reshape %4440 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4443 = stablehlo.broadcast_in_dim %4442, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4444 = stablehlo.dot_general %4441, %4443, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %4445 = stablehlo.reshape %4444 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4446 = stablehlo.broadcast_in_dim %4445, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4447 = stablehlo.divide %4446, %309 : tensor<1x8x256x256xbf16>
-    %4448 = stablehlo.convert %4447 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %4449 = stablehlo.reduce(%4448 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4450 = stablehlo.reshape %4449 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4451 = stablehlo.broadcast_in_dim %4448, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4452 = stablehlo.broadcast_in_dim %4450, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4453 = stablehlo.subtract %4451, %4452 : tensor<1x8x256x256xf32>
-    %4454 = stablehlo.exponential %4453 : tensor<1x8x256x256xf32>
-    %4455 = stablehlo.reduce(%4454 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4456 = stablehlo.reshape %4455 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4457 = stablehlo.broadcast_in_dim %4454, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4458 = stablehlo.broadcast_in_dim %4456, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4459 = stablehlo.divide %4457, %4458 : tensor<1x8x256x256xf32>
-    %4460 = stablehlo.convert %4459 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %4461 = stablehlo.reshape %4460 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %4462 = stablehlo.reshape %4439 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4463 = stablehlo.broadcast_in_dim %4462, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4464 = stablehlo.dot_general %4461, %4463, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4465 = stablehlo.reshape %4464 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4466 = stablehlo.transpose %4465, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %4467 = stablehlo.reshape %4466 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %4468 = stablehlo.reshape %4467 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4469 = stablehlo.convert %4468 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4470 = stablehlo.dot_general %4469, %arg387, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4471 = stablehlo.broadcast_in_dim %4470, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4472 = stablehlo.multiply %4471, %127 : tensor<256x1280xf32>
-    %4473 = stablehlo.broadcast_in_dim %4472, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4474 = stablehlo.broadcast_in_dim %arg388, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4475 = stablehlo.add %4473, %4474 : tensor<256x1280xf32>
-    %4476 = stablehlo.convert %4475 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4477 = stablehlo.reshape %4476 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4478 = stablehlo.add %4477, %4370 : tensor<1x256x1280xbf16>
-    %4479 = stablehlo.convert %4478 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4480 = stablehlo.convert %4479 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4481 = stablehlo.reduce(%4480 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4482 = stablehlo.reshape %4481 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4483 = stablehlo.broadcast_in_dim %4482, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4484 = stablehlo.divide %4483, %142 : tensor<1x256x1xf64>
-    %4485 = stablehlo.broadcast_in_dim %4480, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4486 = stablehlo.broadcast_in_dim %4484, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4487 = stablehlo.subtract %4485, %4486 : tensor<1x256x1280xf64>
-    %4488 = stablehlo.multiply %4487, %4487 : tensor<1x256x1280xf64>
-    %4489 = stablehlo.reduce(%4488 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4490 = stablehlo.reshape %4489 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4491 = stablehlo.broadcast_in_dim %4490, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4492 = stablehlo.divide %4491, %142 : tensor<1x256x1xf64>
-    %4493 = stablehlo.convert %4492 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4494 = stablehlo.reduce(%4479 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4495 = stablehlo.reshape %4494 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4496 = stablehlo.broadcast_in_dim %4495, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4497 = stablehlo.divide %4496, %158 : tensor<1x256x1xf32>
-    %4498 = stablehlo.broadcast_in_dim %4493, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4499 = stablehlo.add %4498, %161 : tensor<1x256x1xf32>
-    %4500 = stablehlo.rsqrt %4499 : tensor<1x256x1xf32>
-    %4501 = stablehlo.broadcast_in_dim %4479, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4502 = stablehlo.broadcast_in_dim %4497, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4503 = stablehlo.subtract %4501, %4502 : tensor<1x256x1280xf32>
-    %4504 = stablehlo.broadcast_in_dim %4503, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4505 = stablehlo.broadcast_in_dim %4500, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4506 = stablehlo.multiply %4504, %4505 : tensor<1x256x1280xf32>
-    %4507 = stablehlo.convert %arg93 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4508 = stablehlo.broadcast_in_dim %4506, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4509 = stablehlo.broadcast_in_dim %4507, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4510 = stablehlo.multiply %4508, %4509 : tensor<1x256x1280xf32>
-    %4511 = stablehlo.convert %arg94 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4512 = stablehlo.broadcast_in_dim %4510, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4513 = stablehlo.broadcast_in_dim %4511, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4514 = stablehlo.add %4512, %4513 : tensor<1x256x1280xf32>
-    %4515 = stablehlo.convert %4514 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4516 = stablehlo.reshape %4515 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4517 = stablehlo.convert %4516 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4518 = stablehlo.dot_general %4517, %arg389, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4519 = stablehlo.broadcast_in_dim %4518, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4520 = stablehlo.multiply %4519, %127 : tensor<256x1280xf32>
-    %4521 = stablehlo.broadcast_in_dim %4520, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4522 = stablehlo.broadcast_in_dim %arg390, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4523 = stablehlo.add %4521, %4522 : tensor<256x1280xf32>
-    %4524 = stablehlo.convert %4523 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4525 = stablehlo.reshape %4524 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4526 = stablehlo.multiply %4525, %cst_4 : tensor<1x256x1280xbf16>
-    %4527 = stablehlo.multiply %4525, %190 : tensor<1x256x1280xbf16>
-    %4528 = stablehlo.convert %4527 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4529 = stablehlo.clamp %cst_5, %4528, %cst_6 : tensor<1x256x1280xf32>
-    %4530 = stablehlo.multiply %4529, %4529 : tensor<1x256x1280xf32>
-    %4531 = stablehlo.multiply %cst_7, %4530 : tensor<1x256x1280xf32>
-    %4532 = stablehlo.add %4531, %cst_8 : tensor<1x256x1280xf32>
-    %4533 = stablehlo.multiply %4532, %4530 : tensor<1x256x1280xf32>
-    %4534 = stablehlo.add %4533, %cst_9 : tensor<1x256x1280xf32>
-    %4535 = stablehlo.multiply %4534, %4530 : tensor<1x256x1280xf32>
-    %4536 = stablehlo.add %4535, %cst_10 : tensor<1x256x1280xf32>
-    %4537 = stablehlo.multiply %4536, %4530 : tensor<1x256x1280xf32>
-    %4538 = stablehlo.add %4537, %cst_11 : tensor<1x256x1280xf32>
-    %4539 = stablehlo.multiply %4538, %4530 : tensor<1x256x1280xf32>
-    %4540 = stablehlo.add %4539, %cst_12 : tensor<1x256x1280xf32>
-    %4541 = stablehlo.multiply %4540, %4530 : tensor<1x256x1280xf32>
-    %4542 = stablehlo.add %4541, %cst_13 : tensor<1x256x1280xf32>
-    %4543 = stablehlo.multiply %cst_14, %4530 : tensor<1x256x1280xf32>
-    %4544 = stablehlo.add %4543, %cst_15 : tensor<1x256x1280xf32>
-    %4545 = stablehlo.multiply %4544, %4530 : tensor<1x256x1280xf32>
-    %4546 = stablehlo.add %4545, %cst_16 : tensor<1x256x1280xf32>
-    %4547 = stablehlo.multiply %4546, %4530 : tensor<1x256x1280xf32>
-    %4548 = stablehlo.add %4547, %cst_17 : tensor<1x256x1280xf32>
-    %4549 = stablehlo.multiply %4548, %4530 : tensor<1x256x1280xf32>
-    %4550 = stablehlo.add %4549, %cst_18 : tensor<1x256x1280xf32>
-    %4551 = stablehlo.multiply %4529, %4542 : tensor<1x256x1280xf32>
-    %4552 = stablehlo.divide %4551, %4550 : tensor<1x256x1280xf32>
-    %4553 = stablehlo.clamp %cst_19, %4552, %cst_20 : tensor<1x256x1280xf32>
-    %4554 = stablehlo.convert %4553 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4555 = stablehlo.add %4554, %cst_2 : tensor<1x256x1280xbf16>
-    %4556 = stablehlo.multiply %4555, %4526 : tensor<1x256x1280xbf16>
-    %4557 = stablehlo.reshape %4556 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4558 = stablehlo.convert %4557 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4559 = stablehlo.dot_general %4558, %arg391, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4560 = stablehlo.broadcast_in_dim %4559, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4561 = stablehlo.multiply %4560, %127 : tensor<256x1280xf32>
-    %4562 = stablehlo.broadcast_in_dim %4561, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4563 = stablehlo.broadcast_in_dim %arg392, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4564 = stablehlo.add %4562, %4563 : tensor<256x1280xf32>
-    %4565 = stablehlo.convert %4564 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4566 = stablehlo.reshape %4565 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4567 = stablehlo.add %4566, %4478 : tensor<1x256x1280xbf16>
-    %4568 = stablehlo.convert %4567 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4569 = stablehlo.convert %4568 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4570 = stablehlo.reduce(%4569 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4571 = stablehlo.reshape %4570 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4572 = stablehlo.broadcast_in_dim %4571, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4573 = stablehlo.divide %4572, %142 : tensor<1x256x1xf64>
-    %4574 = stablehlo.broadcast_in_dim %4569, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4575 = stablehlo.broadcast_in_dim %4573, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4576 = stablehlo.subtract %4574, %4575 : tensor<1x256x1280xf64>
-    %4577 = stablehlo.multiply %4576, %4576 : tensor<1x256x1280xf64>
-    %4578 = stablehlo.reduce(%4577 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4579 = stablehlo.reshape %4578 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4580 = stablehlo.broadcast_in_dim %4579, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4581 = stablehlo.divide %4580, %142 : tensor<1x256x1xf64>
-    %4582 = stablehlo.convert %4581 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4583 = stablehlo.reduce(%4568 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4584 = stablehlo.reshape %4583 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4585 = stablehlo.broadcast_in_dim %4584, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4586 = stablehlo.divide %4585, %158 : tensor<1x256x1xf32>
-    %4587 = stablehlo.broadcast_in_dim %4582, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4588 = stablehlo.add %4587, %161 : tensor<1x256x1xf32>
-    %4589 = stablehlo.rsqrt %4588 : tensor<1x256x1xf32>
-    %4590 = stablehlo.broadcast_in_dim %4568, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4591 = stablehlo.broadcast_in_dim %4586, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4592 = stablehlo.subtract %4590, %4591 : tensor<1x256x1280xf32>
-    %4593 = stablehlo.broadcast_in_dim %4592, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4594 = stablehlo.broadcast_in_dim %4589, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4595 = stablehlo.multiply %4593, %4594 : tensor<1x256x1280xf32>
-    %4596 = stablehlo.convert %arg95 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4597 = stablehlo.broadcast_in_dim %4595, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4598 = stablehlo.broadcast_in_dim %4596, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4599 = stablehlo.multiply %4597, %4598 : tensor<1x256x1280xf32>
-    %4600 = stablehlo.convert %arg96 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4601 = stablehlo.broadcast_in_dim %4599, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4602 = stablehlo.broadcast_in_dim %4600, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4603 = stablehlo.add %4601, %4602 : tensor<1x256x1280xf32>
-    %4604 = stablehlo.convert %4603 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4605 = stablehlo.reshape %4604 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4606 = stablehlo.convert %4605 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4607 = stablehlo.dot_general %4606, %arg393, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4608 = stablehlo.broadcast_in_dim %4607, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4609 = stablehlo.multiply %4608, %273 : tensor<256x256xf32>
-    %4610 = stablehlo.broadcast_in_dim %4609, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4611 = stablehlo.broadcast_in_dim %arg394, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4612 = stablehlo.add %4610, %4611 : tensor<256x256xf32>
-    %4613 = stablehlo.convert %4612 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4614 = stablehlo.reshape %4613 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4615 = stablehlo.dot_general %4606, %arg395, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4616 = stablehlo.broadcast_in_dim %4615, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4617 = stablehlo.multiply %4616, %273 : tensor<256x256xf32>
-    %4618 = stablehlo.broadcast_in_dim %4617, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4619 = stablehlo.broadcast_in_dim %arg396, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4620 = stablehlo.add %4618, %4619 : tensor<256x256xf32>
-    %4621 = stablehlo.convert %4620 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4622 = stablehlo.reshape %4621 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4623 = stablehlo.dot_general %4606, %arg397, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4624 = stablehlo.broadcast_in_dim %4623, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4625 = stablehlo.multiply %4624, %127 : tensor<256x1280xf32>
-    %4626 = stablehlo.broadcast_in_dim %4625, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4627 = stablehlo.broadcast_in_dim %arg398, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4628 = stablehlo.add %4626, %4627 : tensor<256x1280xf32>
-    %4629 = stablehlo.convert %4628 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4630 = stablehlo.reshape %4629 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4631 = stablehlo.reshape %4614 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4632 = stablehlo.transpose %4631, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4633 = stablehlo.reshape %4622 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4634 = stablehlo.transpose %4633, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4635 = stablehlo.reshape %4630 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %4636 = stablehlo.transpose %4635, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4637 = stablehlo.transpose %4634, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %4638 = stablehlo.reshape %4632 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %4639 = stablehlo.reshape %4637 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4640 = stablehlo.broadcast_in_dim %4639, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4641 = stablehlo.dot_general %4638, %4640, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %4642 = stablehlo.reshape %4641 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4643 = stablehlo.broadcast_in_dim %4642, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4644 = stablehlo.divide %4643, %309 : tensor<1x8x256x256xbf16>
-    %4645 = stablehlo.convert %4644 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %4646 = stablehlo.reduce(%4645 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4647 = stablehlo.reshape %4646 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4648 = stablehlo.broadcast_in_dim %4645, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4649 = stablehlo.broadcast_in_dim %4647, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4650 = stablehlo.subtract %4648, %4649 : tensor<1x8x256x256xf32>
-    %4651 = stablehlo.exponential %4650 : tensor<1x8x256x256xf32>
-    %4652 = stablehlo.reduce(%4651 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4653 = stablehlo.reshape %4652 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4654 = stablehlo.broadcast_in_dim %4651, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4655 = stablehlo.broadcast_in_dim %4653, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4656 = stablehlo.divide %4654, %4655 : tensor<1x8x256x256xf32>
-    %4657 = stablehlo.convert %4656 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %4658 = stablehlo.reshape %4657 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %4659 = stablehlo.reshape %4636 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4660 = stablehlo.broadcast_in_dim %4659, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4661 = stablehlo.dot_general %4658, %4660, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4662 = stablehlo.reshape %4661 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4663 = stablehlo.transpose %4662, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %4664 = stablehlo.reshape %4663 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %4665 = stablehlo.reshape %4664 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4666 = stablehlo.convert %4665 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4667 = stablehlo.dot_general %4666, %arg399, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4668 = stablehlo.broadcast_in_dim %4667, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4669 = stablehlo.multiply %4668, %127 : tensor<256x1280xf32>
-    %4670 = stablehlo.broadcast_in_dim %4669, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4671 = stablehlo.broadcast_in_dim %arg400, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4672 = stablehlo.add %4670, %4671 : tensor<256x1280xf32>
-    %4673 = stablehlo.convert %4672 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4674 = stablehlo.reshape %4673 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4675 = stablehlo.add %4674, %4567 : tensor<1x256x1280xbf16>
-    %4676 = stablehlo.convert %4675 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4677 = stablehlo.convert %4676 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4678 = stablehlo.reduce(%4677 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4679 = stablehlo.reshape %4678 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4680 = stablehlo.broadcast_in_dim %4679, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4681 = stablehlo.divide %4680, %142 : tensor<1x256x1xf64>
-    %4682 = stablehlo.broadcast_in_dim %4677, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4683 = stablehlo.broadcast_in_dim %4681, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4684 = stablehlo.subtract %4682, %4683 : tensor<1x256x1280xf64>
-    %4685 = stablehlo.multiply %4684, %4684 : tensor<1x256x1280xf64>
-    %4686 = stablehlo.reduce(%4685 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4687 = stablehlo.reshape %4686 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4688 = stablehlo.broadcast_in_dim %4687, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4689 = stablehlo.divide %4688, %142 : tensor<1x256x1xf64>
-    %4690 = stablehlo.convert %4689 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4691 = stablehlo.reduce(%4676 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4692 = stablehlo.reshape %4691 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4693 = stablehlo.broadcast_in_dim %4692, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4694 = stablehlo.divide %4693, %158 : tensor<1x256x1xf32>
-    %4695 = stablehlo.broadcast_in_dim %4690, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4696 = stablehlo.add %4695, %161 : tensor<1x256x1xf32>
-    %4697 = stablehlo.rsqrt %4696 : tensor<1x256x1xf32>
-    %4698 = stablehlo.broadcast_in_dim %4676, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4699 = stablehlo.broadcast_in_dim %4694, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4700 = stablehlo.subtract %4698, %4699 : tensor<1x256x1280xf32>
-    %4701 = stablehlo.broadcast_in_dim %4700, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4702 = stablehlo.broadcast_in_dim %4697, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4703 = stablehlo.multiply %4701, %4702 : tensor<1x256x1280xf32>
-    %4704 = stablehlo.convert %arg97 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4705 = stablehlo.broadcast_in_dim %4703, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4706 = stablehlo.broadcast_in_dim %4704, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4707 = stablehlo.multiply %4705, %4706 : tensor<1x256x1280xf32>
-    %4708 = stablehlo.convert %arg98 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4709 = stablehlo.broadcast_in_dim %4707, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4710 = stablehlo.broadcast_in_dim %4708, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4711 = stablehlo.add %4709, %4710 : tensor<1x256x1280xf32>
-    %4712 = stablehlo.convert %4711 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4713 = stablehlo.reshape %4712 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4714 = stablehlo.convert %4713 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4715 = stablehlo.dot_general %4714, %arg401, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4716 = stablehlo.broadcast_in_dim %4715, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4717 = stablehlo.multiply %4716, %127 : tensor<256x1280xf32>
-    %4718 = stablehlo.broadcast_in_dim %4717, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4719 = stablehlo.broadcast_in_dim %arg402, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4720 = stablehlo.add %4718, %4719 : tensor<256x1280xf32>
-    %4721 = stablehlo.convert %4720 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4722 = stablehlo.reshape %4721 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4723 = stablehlo.multiply %4722, %cst_4 : tensor<1x256x1280xbf16>
-    %4724 = stablehlo.multiply %4722, %190 : tensor<1x256x1280xbf16>
-    %4725 = stablehlo.convert %4724 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4726 = stablehlo.clamp %cst_5, %4725, %cst_6 : tensor<1x256x1280xf32>
-    %4727 = stablehlo.multiply %4726, %4726 : tensor<1x256x1280xf32>
-    %4728 = stablehlo.multiply %cst_7, %4727 : tensor<1x256x1280xf32>
-    %4729 = stablehlo.add %4728, %cst_8 : tensor<1x256x1280xf32>
-    %4730 = stablehlo.multiply %4729, %4727 : tensor<1x256x1280xf32>
-    %4731 = stablehlo.add %4730, %cst_9 : tensor<1x256x1280xf32>
-    %4732 = stablehlo.multiply %4731, %4727 : tensor<1x256x1280xf32>
-    %4733 = stablehlo.add %4732, %cst_10 : tensor<1x256x1280xf32>
-    %4734 = stablehlo.multiply %4733, %4727 : tensor<1x256x1280xf32>
-    %4735 = stablehlo.add %4734, %cst_11 : tensor<1x256x1280xf32>
-    %4736 = stablehlo.multiply %4735, %4727 : tensor<1x256x1280xf32>
-    %4737 = stablehlo.add %4736, %cst_12 : tensor<1x256x1280xf32>
-    %4738 = stablehlo.multiply %4737, %4727 : tensor<1x256x1280xf32>
-    %4739 = stablehlo.add %4738, %cst_13 : tensor<1x256x1280xf32>
-    %4740 = stablehlo.multiply %cst_14, %4727 : tensor<1x256x1280xf32>
-    %4741 = stablehlo.add %4740, %cst_15 : tensor<1x256x1280xf32>
-    %4742 = stablehlo.multiply %4741, %4727 : tensor<1x256x1280xf32>
-    %4743 = stablehlo.add %4742, %cst_16 : tensor<1x256x1280xf32>
-    %4744 = stablehlo.multiply %4743, %4727 : tensor<1x256x1280xf32>
-    %4745 = stablehlo.add %4744, %cst_17 : tensor<1x256x1280xf32>
-    %4746 = stablehlo.multiply %4745, %4727 : tensor<1x256x1280xf32>
-    %4747 = stablehlo.add %4746, %cst_18 : tensor<1x256x1280xf32>
-    %4748 = stablehlo.multiply %4726, %4739 : tensor<1x256x1280xf32>
-    %4749 = stablehlo.divide %4748, %4747 : tensor<1x256x1280xf32>
-    %4750 = stablehlo.clamp %cst_19, %4749, %cst_20 : tensor<1x256x1280xf32>
-    %4751 = stablehlo.convert %4750 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4752 = stablehlo.add %4751, %cst_2 : tensor<1x256x1280xbf16>
-    %4753 = stablehlo.multiply %4752, %4723 : tensor<1x256x1280xbf16>
-    %4754 = stablehlo.reshape %4753 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4755 = stablehlo.convert %4754 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4756 = stablehlo.dot_general %4755, %arg403, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4757 = stablehlo.broadcast_in_dim %4756, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4758 = stablehlo.multiply %4757, %127 : tensor<256x1280xf32>
-    %4759 = stablehlo.broadcast_in_dim %4758, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4760 = stablehlo.broadcast_in_dim %arg404, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4761 = stablehlo.add %4759, %4760 : tensor<256x1280xf32>
-    %4762 = stablehlo.convert %4761 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4763 = stablehlo.reshape %4762 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4764 = stablehlo.add %4763, %4675 : tensor<1x256x1280xbf16>
-    %4765 = stablehlo.convert %4764 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4766 = stablehlo.convert %4765 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4767 = stablehlo.reduce(%4766 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4768 = stablehlo.reshape %4767 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4769 = stablehlo.broadcast_in_dim %4768, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4770 = stablehlo.divide %4769, %142 : tensor<1x256x1xf64>
-    %4771 = stablehlo.broadcast_in_dim %4766, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4772 = stablehlo.broadcast_in_dim %4770, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4773 = stablehlo.subtract %4771, %4772 : tensor<1x256x1280xf64>
-    %4774 = stablehlo.multiply %4773, %4773 : tensor<1x256x1280xf64>
-    %4775 = stablehlo.reduce(%4774 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4776 = stablehlo.reshape %4775 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4777 = stablehlo.broadcast_in_dim %4776, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4778 = stablehlo.divide %4777, %142 : tensor<1x256x1xf64>
-    %4779 = stablehlo.convert %4778 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4780 = stablehlo.reduce(%4765 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4781 = stablehlo.reshape %4780 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4782 = stablehlo.broadcast_in_dim %4781, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4783 = stablehlo.divide %4782, %158 : tensor<1x256x1xf32>
-    %4784 = stablehlo.broadcast_in_dim %4779, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4785 = stablehlo.add %4784, %161 : tensor<1x256x1xf32>
-    %4786 = stablehlo.rsqrt %4785 : tensor<1x256x1xf32>
-    %4787 = stablehlo.broadcast_in_dim %4765, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4788 = stablehlo.broadcast_in_dim %4783, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4789 = stablehlo.subtract %4787, %4788 : tensor<1x256x1280xf32>
-    %4790 = stablehlo.broadcast_in_dim %4789, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4791 = stablehlo.broadcast_in_dim %4786, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4792 = stablehlo.multiply %4790, %4791 : tensor<1x256x1280xf32>
-    %4793 = stablehlo.convert %arg99 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4794 = stablehlo.broadcast_in_dim %4792, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4795 = stablehlo.broadcast_in_dim %4793, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4796 = stablehlo.multiply %4794, %4795 : tensor<1x256x1280xf32>
-    %4797 = stablehlo.convert %arg100 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4798 = stablehlo.broadcast_in_dim %4796, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4799 = stablehlo.broadcast_in_dim %4797, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4800 = stablehlo.add %4798, %4799 : tensor<1x256x1280xf32>
-    %4801 = stablehlo.convert %4800 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4802 = stablehlo.reshape %4801 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4803 = stablehlo.convert %4802 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4804 = stablehlo.dot_general %4803, %arg405, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4805 = stablehlo.broadcast_in_dim %4804, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4806 = stablehlo.multiply %4805, %273 : tensor<256x256xf32>
-    %4807 = stablehlo.broadcast_in_dim %4806, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4808 = stablehlo.broadcast_in_dim %arg406, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4809 = stablehlo.add %4807, %4808 : tensor<256x256xf32>
-    %4810 = stablehlo.convert %4809 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4811 = stablehlo.reshape %4810 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4812 = stablehlo.dot_general %4803, %arg407, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %4813 = stablehlo.broadcast_in_dim %4812, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4814 = stablehlo.multiply %4813, %273 : tensor<256x256xf32>
-    %4815 = stablehlo.broadcast_in_dim %4814, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %4816 = stablehlo.broadcast_in_dim %arg408, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %4817 = stablehlo.add %4815, %4816 : tensor<256x256xf32>
-    %4818 = stablehlo.convert %4817 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %4819 = stablehlo.reshape %4818 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %4820 = stablehlo.dot_general %4803, %arg409, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4821 = stablehlo.broadcast_in_dim %4820, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4822 = stablehlo.multiply %4821, %127 : tensor<256x1280xf32>
-    %4823 = stablehlo.broadcast_in_dim %4822, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4824 = stablehlo.broadcast_in_dim %arg410, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4825 = stablehlo.add %4823, %4824 : tensor<256x1280xf32>
-    %4826 = stablehlo.convert %4825 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4827 = stablehlo.reshape %4826 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4828 = stablehlo.reshape %4811 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4829 = stablehlo.transpose %4828, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4830 = stablehlo.reshape %4819 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %4831 = stablehlo.transpose %4830, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %4832 = stablehlo.reshape %4827 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %4833 = stablehlo.transpose %4832, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4834 = stablehlo.transpose %4831, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %4835 = stablehlo.reshape %4829 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %4836 = stablehlo.reshape %4834 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4837 = stablehlo.broadcast_in_dim %4836, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %4838 = stablehlo.dot_general %4835, %4837, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %4839 = stablehlo.reshape %4838 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4840 = stablehlo.broadcast_in_dim %4839, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %4841 = stablehlo.divide %4840, %309 : tensor<1x8x256x256xbf16>
-    %4842 = stablehlo.convert %4841 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %4843 = stablehlo.reduce(%4842 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4844 = stablehlo.reshape %4843 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4845 = stablehlo.broadcast_in_dim %4842, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4846 = stablehlo.broadcast_in_dim %4844, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4847 = stablehlo.subtract %4845, %4846 : tensor<1x8x256x256xf32>
-    %4848 = stablehlo.exponential %4847 : tensor<1x8x256x256xf32>
-    %4849 = stablehlo.reduce(%4848 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %4850 = stablehlo.reshape %4849 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %4851 = stablehlo.broadcast_in_dim %4848, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %4852 = stablehlo.broadcast_in_dim %4850, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %4853 = stablehlo.divide %4851, %4852 : tensor<1x8x256x256xf32>
-    %4854 = stablehlo.convert %4853 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %4855 = stablehlo.reshape %4854 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %4856 = stablehlo.reshape %4833 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4857 = stablehlo.broadcast_in_dim %4856, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4858 = stablehlo.dot_general %4855, %4857, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %4859 = stablehlo.reshape %4858 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %4860 = stablehlo.transpose %4859, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %4861 = stablehlo.reshape %4860 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %4862 = stablehlo.reshape %4861 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4863 = stablehlo.convert %4862 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4864 = stablehlo.dot_general %4863, %arg411, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4865 = stablehlo.broadcast_in_dim %4864, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4866 = stablehlo.multiply %4865, %127 : tensor<256x1280xf32>
-    %4867 = stablehlo.broadcast_in_dim %4866, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4868 = stablehlo.broadcast_in_dim %arg412, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4869 = stablehlo.add %4867, %4868 : tensor<256x1280xf32>
-    %4870 = stablehlo.convert %4869 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4871 = stablehlo.reshape %4870 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4872 = stablehlo.add %4871, %4764 : tensor<1x256x1280xbf16>
-    %4873 = stablehlo.convert %4872 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4874 = stablehlo.convert %4873 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4875 = stablehlo.reduce(%4874 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4876 = stablehlo.reshape %4875 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4877 = stablehlo.broadcast_in_dim %4876, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4878 = stablehlo.divide %4877, %142 : tensor<1x256x1xf64>
-    %4879 = stablehlo.broadcast_in_dim %4874, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4880 = stablehlo.broadcast_in_dim %4878, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4881 = stablehlo.subtract %4879, %4880 : tensor<1x256x1280xf64>
-    %4882 = stablehlo.multiply %4881, %4881 : tensor<1x256x1280xf64>
-    %4883 = stablehlo.reduce(%4882 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4884 = stablehlo.reshape %4883 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4885 = stablehlo.broadcast_in_dim %4884, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4886 = stablehlo.divide %4885, %142 : tensor<1x256x1xf64>
-    %4887 = stablehlo.convert %4886 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4888 = stablehlo.reduce(%4873 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4889 = stablehlo.reshape %4888 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4890 = stablehlo.broadcast_in_dim %4889, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4891 = stablehlo.divide %4890, %158 : tensor<1x256x1xf32>
-    %4892 = stablehlo.broadcast_in_dim %4887, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4893 = stablehlo.add %4892, %161 : tensor<1x256x1xf32>
-    %4894 = stablehlo.rsqrt %4893 : tensor<1x256x1xf32>
-    %4895 = stablehlo.broadcast_in_dim %4873, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4896 = stablehlo.broadcast_in_dim %4891, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4897 = stablehlo.subtract %4895, %4896 : tensor<1x256x1280xf32>
-    %4898 = stablehlo.broadcast_in_dim %4897, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4899 = stablehlo.broadcast_in_dim %4894, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4900 = stablehlo.multiply %4898, %4899 : tensor<1x256x1280xf32>
-    %4901 = stablehlo.convert %arg101 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4902 = stablehlo.broadcast_in_dim %4900, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4903 = stablehlo.broadcast_in_dim %4901, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4904 = stablehlo.multiply %4902, %4903 : tensor<1x256x1280xf32>
-    %4905 = stablehlo.convert %arg102 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4906 = stablehlo.broadcast_in_dim %4904, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4907 = stablehlo.broadcast_in_dim %4905, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4908 = stablehlo.add %4906, %4907 : tensor<1x256x1280xf32>
-    %4909 = stablehlo.convert %4908 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4910 = stablehlo.reshape %4909 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4911 = stablehlo.convert %4910 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4912 = stablehlo.dot_general %4911, %arg413, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4913 = stablehlo.broadcast_in_dim %4912, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4914 = stablehlo.multiply %4913, %127 : tensor<256x1280xf32>
-    %4915 = stablehlo.broadcast_in_dim %4914, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4916 = stablehlo.broadcast_in_dim %arg414, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4917 = stablehlo.add %4915, %4916 : tensor<256x1280xf32>
-    %4918 = stablehlo.convert %4917 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4919 = stablehlo.reshape %4918 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4920 = stablehlo.multiply %4919, %cst_4 : tensor<1x256x1280xbf16>
-    %4921 = stablehlo.multiply %4919, %190 : tensor<1x256x1280xbf16>
-    %4922 = stablehlo.convert %4921 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4923 = stablehlo.clamp %cst_5, %4922, %cst_6 : tensor<1x256x1280xf32>
-    %4924 = stablehlo.multiply %4923, %4923 : tensor<1x256x1280xf32>
-    %4925 = stablehlo.multiply %cst_7, %4924 : tensor<1x256x1280xf32>
-    %4926 = stablehlo.add %4925, %cst_8 : tensor<1x256x1280xf32>
-    %4927 = stablehlo.multiply %4926, %4924 : tensor<1x256x1280xf32>
-    %4928 = stablehlo.add %4927, %cst_9 : tensor<1x256x1280xf32>
-    %4929 = stablehlo.multiply %4928, %4924 : tensor<1x256x1280xf32>
-    %4930 = stablehlo.add %4929, %cst_10 : tensor<1x256x1280xf32>
-    %4931 = stablehlo.multiply %4930, %4924 : tensor<1x256x1280xf32>
-    %4932 = stablehlo.add %4931, %cst_11 : tensor<1x256x1280xf32>
-    %4933 = stablehlo.multiply %4932, %4924 : tensor<1x256x1280xf32>
-    %4934 = stablehlo.add %4933, %cst_12 : tensor<1x256x1280xf32>
-    %4935 = stablehlo.multiply %4934, %4924 : tensor<1x256x1280xf32>
-    %4936 = stablehlo.add %4935, %cst_13 : tensor<1x256x1280xf32>
-    %4937 = stablehlo.multiply %cst_14, %4924 : tensor<1x256x1280xf32>
-    %4938 = stablehlo.add %4937, %cst_15 : tensor<1x256x1280xf32>
-    %4939 = stablehlo.multiply %4938, %4924 : tensor<1x256x1280xf32>
-    %4940 = stablehlo.add %4939, %cst_16 : tensor<1x256x1280xf32>
-    %4941 = stablehlo.multiply %4940, %4924 : tensor<1x256x1280xf32>
-    %4942 = stablehlo.add %4941, %cst_17 : tensor<1x256x1280xf32>
-    %4943 = stablehlo.multiply %4942, %4924 : tensor<1x256x1280xf32>
-    %4944 = stablehlo.add %4943, %cst_18 : tensor<1x256x1280xf32>
-    %4945 = stablehlo.multiply %4923, %4936 : tensor<1x256x1280xf32>
-    %4946 = stablehlo.divide %4945, %4944 : tensor<1x256x1280xf32>
-    %4947 = stablehlo.clamp %cst_19, %4946, %cst_20 : tensor<1x256x1280xf32>
-    %4948 = stablehlo.convert %4947 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4949 = stablehlo.add %4948, %cst_2 : tensor<1x256x1280xbf16>
-    %4950 = stablehlo.multiply %4949, %4920 : tensor<1x256x1280xbf16>
-    %4951 = stablehlo.reshape %4950 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %4952 = stablehlo.convert %4951 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %4953 = stablehlo.dot_general %4952, %arg415, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %4954 = stablehlo.broadcast_in_dim %4953, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4955 = stablehlo.multiply %4954, %127 : tensor<256x1280xf32>
-    %4956 = stablehlo.broadcast_in_dim %4955, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %4957 = stablehlo.broadcast_in_dim %arg416, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %4958 = stablehlo.add %4956, %4957 : tensor<256x1280xf32>
-    %4959 = stablehlo.convert %4958 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %4960 = stablehlo.reshape %4959 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %4961 = stablehlo.add %4960, %4872 : tensor<1x256x1280xbf16>
-    %4962 = stablehlo.convert %4961 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %4963 = stablehlo.convert %4962 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %4964 = stablehlo.reduce(%4963 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4965 = stablehlo.reshape %4964 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4966 = stablehlo.broadcast_in_dim %4965, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4967 = stablehlo.divide %4966, %142 : tensor<1x256x1xf64>
-    %4968 = stablehlo.broadcast_in_dim %4963, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %4969 = stablehlo.broadcast_in_dim %4967, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %4970 = stablehlo.subtract %4968, %4969 : tensor<1x256x1280xf64>
-    %4971 = stablehlo.multiply %4970, %4970 : tensor<1x256x1280xf64>
-    %4972 = stablehlo.reduce(%4971 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %4973 = stablehlo.reshape %4972 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %4974 = stablehlo.broadcast_in_dim %4973, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %4975 = stablehlo.divide %4974, %142 : tensor<1x256x1xf64>
-    %4976 = stablehlo.convert %4975 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %4977 = stablehlo.reduce(%4962 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %4978 = stablehlo.reshape %4977 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %4979 = stablehlo.broadcast_in_dim %4978, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4980 = stablehlo.divide %4979, %158 : tensor<1x256x1xf32>
-    %4981 = stablehlo.broadcast_in_dim %4976, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %4982 = stablehlo.add %4981, %161 : tensor<1x256x1xf32>
-    %4983 = stablehlo.rsqrt %4982 : tensor<1x256x1xf32>
-    %4984 = stablehlo.broadcast_in_dim %4962, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4985 = stablehlo.broadcast_in_dim %4980, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4986 = stablehlo.subtract %4984, %4985 : tensor<1x256x1280xf32>
-    %4987 = stablehlo.broadcast_in_dim %4986, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4988 = stablehlo.broadcast_in_dim %4983, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %4989 = stablehlo.multiply %4987, %4988 : tensor<1x256x1280xf32>
-    %4990 = stablehlo.convert %arg103 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4991 = stablehlo.broadcast_in_dim %4989, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4992 = stablehlo.broadcast_in_dim %4990, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4993 = stablehlo.multiply %4991, %4992 : tensor<1x256x1280xf32>
-    %4994 = stablehlo.convert %arg104 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %4995 = stablehlo.broadcast_in_dim %4993, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %4996 = stablehlo.broadcast_in_dim %4994, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %4997 = stablehlo.add %4995, %4996 : tensor<1x256x1280xf32>
-    %4998 = stablehlo.convert %4997 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %4999 = stablehlo.reshape %4998 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5000 = stablehlo.convert %4999 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5001 = stablehlo.dot_general %5000, %arg417, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %5002 = stablehlo.broadcast_in_dim %5001, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5003 = stablehlo.multiply %5002, %273 : tensor<256x256xf32>
-    %5004 = stablehlo.broadcast_in_dim %5003, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5005 = stablehlo.broadcast_in_dim %arg418, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %5006 = stablehlo.add %5004, %5005 : tensor<256x256xf32>
-    %5007 = stablehlo.convert %5006 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %5008 = stablehlo.reshape %5007 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %5009 = stablehlo.dot_general %5000, %arg419, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %5010 = stablehlo.broadcast_in_dim %5009, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5011 = stablehlo.multiply %5010, %273 : tensor<256x256xf32>
-    %5012 = stablehlo.broadcast_in_dim %5011, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5013 = stablehlo.broadcast_in_dim %arg420, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %5014 = stablehlo.add %5012, %5013 : tensor<256x256xf32>
-    %5015 = stablehlo.convert %5014 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %5016 = stablehlo.reshape %5015 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %5017 = stablehlo.dot_general %5000, %arg421, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5018 = stablehlo.broadcast_in_dim %5017, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5019 = stablehlo.multiply %5018, %127 : tensor<256x1280xf32>
-    %5020 = stablehlo.broadcast_in_dim %5019, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5021 = stablehlo.broadcast_in_dim %arg422, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5022 = stablehlo.add %5020, %5021 : tensor<256x1280xf32>
-    %5023 = stablehlo.convert %5022 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5024 = stablehlo.reshape %5023 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5025 = stablehlo.reshape %5008 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %5026 = stablehlo.transpose %5025, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %5027 = stablehlo.reshape %5016 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %5028 = stablehlo.transpose %5027, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %5029 = stablehlo.reshape %5024 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %5030 = stablehlo.transpose %5029, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %5031 = stablehlo.transpose %5028, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %5032 = stablehlo.reshape %5026 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %5033 = stablehlo.reshape %5031 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %5034 = stablehlo.broadcast_in_dim %5033, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %5035 = stablehlo.dot_general %5032, %5034, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %5036 = stablehlo.reshape %5035 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %5037 = stablehlo.broadcast_in_dim %5036, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %5038 = stablehlo.divide %5037, %309 : tensor<1x8x256x256xbf16>
-    %5039 = stablehlo.convert %5038 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %5040 = stablehlo.reduce(%5039 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %5041 = stablehlo.reshape %5040 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %5042 = stablehlo.broadcast_in_dim %5039, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %5043 = stablehlo.broadcast_in_dim %5041, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %5044 = stablehlo.subtract %5042, %5043 : tensor<1x8x256x256xf32>
-    %5045 = stablehlo.exponential %5044 : tensor<1x8x256x256xf32>
-    %5046 = stablehlo.reduce(%5045 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %5047 = stablehlo.reshape %5046 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %5048 = stablehlo.broadcast_in_dim %5045, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %5049 = stablehlo.broadcast_in_dim %5047, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %5050 = stablehlo.divide %5048, %5049 : tensor<1x8x256x256xf32>
-    %5051 = stablehlo.convert %5050 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %5052 = stablehlo.reshape %5051 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %5053 = stablehlo.reshape %5030 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %5054 = stablehlo.broadcast_in_dim %5053, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %5055 = stablehlo.dot_general %5052, %5054, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %5056 = stablehlo.reshape %5055 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %5057 = stablehlo.transpose %5056, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %5058 = stablehlo.reshape %5057 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %5059 = stablehlo.reshape %5058 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5060 = stablehlo.convert %5059 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5061 = stablehlo.dot_general %5060, %arg423, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5062 = stablehlo.broadcast_in_dim %5061, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5063 = stablehlo.multiply %5062, %127 : tensor<256x1280xf32>
-    %5064 = stablehlo.broadcast_in_dim %5063, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5065 = stablehlo.broadcast_in_dim %arg424, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5066 = stablehlo.add %5064, %5065 : tensor<256x1280xf32>
-    %5067 = stablehlo.convert %5066 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5068 = stablehlo.reshape %5067 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5069 = stablehlo.add %5068, %4961 : tensor<1x256x1280xbf16>
-    %5070 = stablehlo.convert %5069 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %5071 = stablehlo.convert %5070 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %5072 = stablehlo.reduce(%5071 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5073 = stablehlo.reshape %5072 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5074 = stablehlo.broadcast_in_dim %5073, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5075 = stablehlo.divide %5074, %142 : tensor<1x256x1xf64>
-    %5076 = stablehlo.broadcast_in_dim %5071, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %5077 = stablehlo.broadcast_in_dim %5075, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %5078 = stablehlo.subtract %5076, %5077 : tensor<1x256x1280xf64>
-    %5079 = stablehlo.multiply %5078, %5078 : tensor<1x256x1280xf64>
-    %5080 = stablehlo.reduce(%5079 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5081 = stablehlo.reshape %5080 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5082 = stablehlo.broadcast_in_dim %5081, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5083 = stablehlo.divide %5082, %142 : tensor<1x256x1xf64>
-    %5084 = stablehlo.convert %5083 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %5085 = stablehlo.reduce(%5070 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %5086 = stablehlo.reshape %5085 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %5087 = stablehlo.broadcast_in_dim %5086, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5088 = stablehlo.divide %5087, %158 : tensor<1x256x1xf32>
-    %5089 = stablehlo.broadcast_in_dim %5084, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5090 = stablehlo.add %5089, %161 : tensor<1x256x1xf32>
-    %5091 = stablehlo.rsqrt %5090 : tensor<1x256x1xf32>
-    %5092 = stablehlo.broadcast_in_dim %5070, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5093 = stablehlo.broadcast_in_dim %5088, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5094 = stablehlo.subtract %5092, %5093 : tensor<1x256x1280xf32>
-    %5095 = stablehlo.broadcast_in_dim %5094, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5096 = stablehlo.broadcast_in_dim %5091, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5097 = stablehlo.multiply %5095, %5096 : tensor<1x256x1280xf32>
-    %5098 = stablehlo.convert %arg105 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5099 = stablehlo.broadcast_in_dim %5097, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5100 = stablehlo.broadcast_in_dim %5098, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5101 = stablehlo.multiply %5099, %5100 : tensor<1x256x1280xf32>
-    %5102 = stablehlo.convert %arg106 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5103 = stablehlo.broadcast_in_dim %5101, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5104 = stablehlo.broadcast_in_dim %5102, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5105 = stablehlo.add %5103, %5104 : tensor<1x256x1280xf32>
-    %5106 = stablehlo.convert %5105 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %5107 = stablehlo.reshape %5106 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5108 = stablehlo.convert %5107 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5109 = stablehlo.dot_general %5108, %arg425, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5110 = stablehlo.broadcast_in_dim %5109, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5111 = stablehlo.multiply %5110, %127 : tensor<256x1280xf32>
-    %5112 = stablehlo.broadcast_in_dim %5111, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5113 = stablehlo.broadcast_in_dim %arg426, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5114 = stablehlo.add %5112, %5113 : tensor<256x1280xf32>
-    %5115 = stablehlo.convert %5114 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5116 = stablehlo.reshape %5115 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5117 = stablehlo.multiply %5116, %cst_4 : tensor<1x256x1280xbf16>
-    %5118 = stablehlo.multiply %5116, %190 : tensor<1x256x1280xbf16>
-    %5119 = stablehlo.convert %5118 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %5120 = stablehlo.clamp %cst_5, %5119, %cst_6 : tensor<1x256x1280xf32>
-    %5121 = stablehlo.multiply %5120, %5120 : tensor<1x256x1280xf32>
-    %5122 = stablehlo.multiply %cst_7, %5121 : tensor<1x256x1280xf32>
-    %5123 = stablehlo.add %5122, %cst_8 : tensor<1x256x1280xf32>
-    %5124 = stablehlo.multiply %5123, %5121 : tensor<1x256x1280xf32>
-    %5125 = stablehlo.add %5124, %cst_9 : tensor<1x256x1280xf32>
-    %5126 = stablehlo.multiply %5125, %5121 : tensor<1x256x1280xf32>
-    %5127 = stablehlo.add %5126, %cst_10 : tensor<1x256x1280xf32>
-    %5128 = stablehlo.multiply %5127, %5121 : tensor<1x256x1280xf32>
-    %5129 = stablehlo.add %5128, %cst_11 : tensor<1x256x1280xf32>
-    %5130 = stablehlo.multiply %5129, %5121 : tensor<1x256x1280xf32>
-    %5131 = stablehlo.add %5130, %cst_12 : tensor<1x256x1280xf32>
-    %5132 = stablehlo.multiply %5131, %5121 : tensor<1x256x1280xf32>
-    %5133 = stablehlo.add %5132, %cst_13 : tensor<1x256x1280xf32>
-    %5134 = stablehlo.multiply %cst_14, %5121 : tensor<1x256x1280xf32>
-    %5135 = stablehlo.add %5134, %cst_15 : tensor<1x256x1280xf32>
-    %5136 = stablehlo.multiply %5135, %5121 : tensor<1x256x1280xf32>
-    %5137 = stablehlo.add %5136, %cst_16 : tensor<1x256x1280xf32>
-    %5138 = stablehlo.multiply %5137, %5121 : tensor<1x256x1280xf32>
-    %5139 = stablehlo.add %5138, %cst_17 : tensor<1x256x1280xf32>
-    %5140 = stablehlo.multiply %5139, %5121 : tensor<1x256x1280xf32>
-    %5141 = stablehlo.add %5140, %cst_18 : tensor<1x256x1280xf32>
-    %5142 = stablehlo.multiply %5120, %5133 : tensor<1x256x1280xf32>
-    %5143 = stablehlo.divide %5142, %5141 : tensor<1x256x1280xf32>
-    %5144 = stablehlo.clamp %cst_19, %5143, %cst_20 : tensor<1x256x1280xf32>
-    %5145 = stablehlo.convert %5144 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %5146 = stablehlo.add %5145, %cst_2 : tensor<1x256x1280xbf16>
-    %5147 = stablehlo.multiply %5146, %5117 : tensor<1x256x1280xbf16>
-    %5148 = stablehlo.reshape %5147 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5149 = stablehlo.convert %5148 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5150 = stablehlo.dot_general %5149, %arg427, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5151 = stablehlo.broadcast_in_dim %5150, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5152 = stablehlo.multiply %5151, %127 : tensor<256x1280xf32>
-    %5153 = stablehlo.broadcast_in_dim %5152, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5154 = stablehlo.broadcast_in_dim %arg428, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5155 = stablehlo.add %5153, %5154 : tensor<256x1280xf32>
-    %5156 = stablehlo.convert %5155 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5157 = stablehlo.reshape %5156 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5158 = stablehlo.add %5157, %5069 : tensor<1x256x1280xbf16>
-    %5159 = stablehlo.convert %5158 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %5160 = stablehlo.convert %5159 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %5161 = stablehlo.reduce(%5160 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5162 = stablehlo.reshape %5161 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5163 = stablehlo.broadcast_in_dim %5162, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5164 = stablehlo.divide %5163, %142 : tensor<1x256x1xf64>
-    %5165 = stablehlo.broadcast_in_dim %5160, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %5166 = stablehlo.broadcast_in_dim %5164, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %5167 = stablehlo.subtract %5165, %5166 : tensor<1x256x1280xf64>
-    %5168 = stablehlo.multiply %5167, %5167 : tensor<1x256x1280xf64>
-    %5169 = stablehlo.reduce(%5168 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5170 = stablehlo.reshape %5169 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5171 = stablehlo.broadcast_in_dim %5170, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5172 = stablehlo.divide %5171, %142 : tensor<1x256x1xf64>
-    %5173 = stablehlo.convert %5172 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %5174 = stablehlo.reduce(%5159 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %5175 = stablehlo.reshape %5174 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %5176 = stablehlo.broadcast_in_dim %5175, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5177 = stablehlo.divide %5176, %158 : tensor<1x256x1xf32>
-    %5178 = stablehlo.broadcast_in_dim %5173, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5179 = stablehlo.add %5178, %161 : tensor<1x256x1xf32>
-    %5180 = stablehlo.rsqrt %5179 : tensor<1x256x1xf32>
-    %5181 = stablehlo.broadcast_in_dim %5159, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5182 = stablehlo.broadcast_in_dim %5177, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5183 = stablehlo.subtract %5181, %5182 : tensor<1x256x1280xf32>
-    %5184 = stablehlo.broadcast_in_dim %5183, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5185 = stablehlo.broadcast_in_dim %5180, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5186 = stablehlo.multiply %5184, %5185 : tensor<1x256x1280xf32>
-    %5187 = stablehlo.convert %arg107 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5188 = stablehlo.broadcast_in_dim %5186, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5189 = stablehlo.broadcast_in_dim %5187, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5190 = stablehlo.multiply %5188, %5189 : tensor<1x256x1280xf32>
-    %5191 = stablehlo.convert %arg108 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5192 = stablehlo.broadcast_in_dim %5190, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5193 = stablehlo.broadcast_in_dim %5191, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5194 = stablehlo.add %5192, %5193 : tensor<1x256x1280xf32>
-    %5195 = stablehlo.convert %5194 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %5196 = stablehlo.reshape %5195 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5197 = stablehlo.convert %5196 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5198 = stablehlo.dot_general %5197, %arg429, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %5199 = stablehlo.broadcast_in_dim %5198, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5200 = stablehlo.multiply %5199, %273 : tensor<256x256xf32>
-    %5201 = stablehlo.broadcast_in_dim %5200, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5202 = stablehlo.broadcast_in_dim %arg430, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %5203 = stablehlo.add %5201, %5202 : tensor<256x256xf32>
-    %5204 = stablehlo.convert %5203 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %5205 = stablehlo.reshape %5204 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %5206 = stablehlo.dot_general %5197, %arg431, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %5207 = stablehlo.broadcast_in_dim %5206, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5208 = stablehlo.multiply %5207, %273 : tensor<256x256xf32>
-    %5209 = stablehlo.broadcast_in_dim %5208, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5210 = stablehlo.broadcast_in_dim %arg432, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %5211 = stablehlo.add %5209, %5210 : tensor<256x256xf32>
-    %5212 = stablehlo.convert %5211 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %5213 = stablehlo.reshape %5212 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %5214 = stablehlo.dot_general %5197, %arg433, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5215 = stablehlo.broadcast_in_dim %5214, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5216 = stablehlo.multiply %5215, %127 : tensor<256x1280xf32>
-    %5217 = stablehlo.broadcast_in_dim %5216, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5218 = stablehlo.broadcast_in_dim %arg434, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5219 = stablehlo.add %5217, %5218 : tensor<256x1280xf32>
-    %5220 = stablehlo.convert %5219 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5221 = stablehlo.reshape %5220 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5222 = stablehlo.reshape %5205 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %5223 = stablehlo.transpose %5222, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %5224 = stablehlo.reshape %5213 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %5225 = stablehlo.transpose %5224, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %5226 = stablehlo.reshape %5221 : (tensor<1x256x1280xbf16>) -> tensor<1x256x8x160xbf16>
-    %5227 = stablehlo.transpose %5226, dims = [0, 2, 1, 3] : (tensor<1x256x8x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %5228 = stablehlo.transpose %5225, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %5229 = stablehlo.reshape %5223 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %5230 = stablehlo.reshape %5228 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %5231 = stablehlo.broadcast_in_dim %5230, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %5232 = stablehlo.dot_general %5229, %5231, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %5233 = stablehlo.reshape %5232 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %5234 = stablehlo.broadcast_in_dim %5233, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %5235 = stablehlo.divide %5234, %309 : tensor<1x8x256x256xbf16>
-    %5236 = stablehlo.convert %5235 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %5237 = stablehlo.reduce(%5236 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %5238 = stablehlo.reshape %5237 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %5239 = stablehlo.broadcast_in_dim %5236, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %5240 = stablehlo.broadcast_in_dim %5238, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %5241 = stablehlo.subtract %5239, %5240 : tensor<1x8x256x256xf32>
-    %5242 = stablehlo.exponential %5241 : tensor<1x8x256x256xf32>
-    %5243 = stablehlo.reduce(%5242 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %5244 = stablehlo.reshape %5243 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %5245 = stablehlo.broadcast_in_dim %5242, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %5246 = stablehlo.broadcast_in_dim %5244, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %5247 = stablehlo.divide %5245, %5246 : tensor<1x8x256x256xf32>
-    %5248 = stablehlo.convert %5247 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %5249 = stablehlo.reshape %5248 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %5250 = stablehlo.reshape %5227 : (tensor<1x8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %5251 = stablehlo.broadcast_in_dim %5250, dims = [0, 1, 2] : (tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %5252 = stablehlo.dot_general %5249, %5251, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x160xbf16>) -> tensor<8x256x160xbf16>
-    %5253 = stablehlo.reshape %5252 : (tensor<8x256x160xbf16>) -> tensor<1x8x256x160xbf16>
-    %5254 = stablehlo.transpose %5253, dims = [0, 2, 1, 3] : (tensor<1x8x256x160xbf16>) -> tensor<1x256x8x160xbf16>
-    %5255 = stablehlo.reshape %5254 : (tensor<1x256x8x160xbf16>) -> tensor<1x256x1280xbf16>
-    %5256 = stablehlo.reshape %5255 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5257 = stablehlo.convert %5256 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5258 = stablehlo.dot_general %5257, %arg435, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5259 = stablehlo.broadcast_in_dim %5258, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5260 = stablehlo.multiply %5259, %127 : tensor<256x1280xf32>
-    %5261 = stablehlo.broadcast_in_dim %5260, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5262 = stablehlo.broadcast_in_dim %arg436, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5263 = stablehlo.add %5261, %5262 : tensor<256x1280xf32>
-    %5264 = stablehlo.convert %5263 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5265 = stablehlo.reshape %5264 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5266 = stablehlo.add %5265, %5158 : tensor<1x256x1280xbf16>
-    %5267 = stablehlo.convert %5266 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %5268 = stablehlo.convert %5267 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %5269 = stablehlo.reduce(%5268 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5270 = stablehlo.reshape %5269 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5271 = stablehlo.broadcast_in_dim %5270, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5272 = stablehlo.divide %5271, %142 : tensor<1x256x1xf64>
-    %5273 = stablehlo.broadcast_in_dim %5268, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %5274 = stablehlo.broadcast_in_dim %5272, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %5275 = stablehlo.subtract %5273, %5274 : tensor<1x256x1280xf64>
-    %5276 = stablehlo.multiply %5275, %5275 : tensor<1x256x1280xf64>
-    %5277 = stablehlo.reduce(%5276 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5278 = stablehlo.reshape %5277 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5279 = stablehlo.broadcast_in_dim %5278, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5280 = stablehlo.divide %5279, %142 : tensor<1x256x1xf64>
-    %5281 = stablehlo.convert %5280 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %5282 = stablehlo.reduce(%5267 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %5283 = stablehlo.reshape %5282 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %5284 = stablehlo.broadcast_in_dim %5283, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5285 = stablehlo.divide %5284, %158 : tensor<1x256x1xf32>
-    %5286 = stablehlo.broadcast_in_dim %5281, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5287 = stablehlo.add %5286, %161 : tensor<1x256x1xf32>
-    %5288 = stablehlo.rsqrt %5287 : tensor<1x256x1xf32>
-    %5289 = stablehlo.broadcast_in_dim %5267, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5290 = stablehlo.broadcast_in_dim %5285, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5291 = stablehlo.subtract %5289, %5290 : tensor<1x256x1280xf32>
-    %5292 = stablehlo.broadcast_in_dim %5291, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5293 = stablehlo.broadcast_in_dim %5288, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5294 = stablehlo.multiply %5292, %5293 : tensor<1x256x1280xf32>
-    %5295 = stablehlo.convert %arg109 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5296 = stablehlo.broadcast_in_dim %5294, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5297 = stablehlo.broadcast_in_dim %5295, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5298 = stablehlo.multiply %5296, %5297 : tensor<1x256x1280xf32>
-    %5299 = stablehlo.convert %arg110 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5300 = stablehlo.broadcast_in_dim %5298, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5301 = stablehlo.broadcast_in_dim %5299, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5302 = stablehlo.add %5300, %5301 : tensor<1x256x1280xf32>
-    %5303 = stablehlo.convert %5302 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %5304 = stablehlo.reshape %5303 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5305 = stablehlo.convert %5304 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5306 = stablehlo.dot_general %5305, %arg437, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5307 = stablehlo.broadcast_in_dim %5306, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5308 = stablehlo.multiply %5307, %127 : tensor<256x1280xf32>
-    %5309 = stablehlo.broadcast_in_dim %5308, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5310 = stablehlo.broadcast_in_dim %arg438, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5311 = stablehlo.add %5309, %5310 : tensor<256x1280xf32>
-    %5312 = stablehlo.convert %5311 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5313 = stablehlo.reshape %5312 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5314 = stablehlo.multiply %5313, %cst_4 : tensor<1x256x1280xbf16>
-    %5315 = stablehlo.multiply %5313, %190 : tensor<1x256x1280xbf16>
-    %5316 = stablehlo.convert %5315 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %5317 = stablehlo.clamp %cst_5, %5316, %cst_6 : tensor<1x256x1280xf32>
-    %5318 = stablehlo.multiply %5317, %5317 : tensor<1x256x1280xf32>
-    %5319 = stablehlo.multiply %cst_7, %5318 : tensor<1x256x1280xf32>
-    %5320 = stablehlo.add %5319, %cst_8 : tensor<1x256x1280xf32>
-    %5321 = stablehlo.multiply %5320, %5318 : tensor<1x256x1280xf32>
-    %5322 = stablehlo.add %5321, %cst_9 : tensor<1x256x1280xf32>
-    %5323 = stablehlo.multiply %5322, %5318 : tensor<1x256x1280xf32>
-    %5324 = stablehlo.add %5323, %cst_10 : tensor<1x256x1280xf32>
-    %5325 = stablehlo.multiply %5324, %5318 : tensor<1x256x1280xf32>
-    %5326 = stablehlo.add %5325, %cst_11 : tensor<1x256x1280xf32>
-    %5327 = stablehlo.multiply %5326, %5318 : tensor<1x256x1280xf32>
-    %5328 = stablehlo.add %5327, %cst_12 : tensor<1x256x1280xf32>
-    %5329 = stablehlo.multiply %5328, %5318 : tensor<1x256x1280xf32>
-    %5330 = stablehlo.add %5329, %cst_13 : tensor<1x256x1280xf32>
-    %5331 = stablehlo.multiply %cst_14, %5318 : tensor<1x256x1280xf32>
-    %5332 = stablehlo.add %5331, %cst_15 : tensor<1x256x1280xf32>
-    %5333 = stablehlo.multiply %5332, %5318 : tensor<1x256x1280xf32>
-    %5334 = stablehlo.add %5333, %cst_16 : tensor<1x256x1280xf32>
-    %5335 = stablehlo.multiply %5334, %5318 : tensor<1x256x1280xf32>
-    %5336 = stablehlo.add %5335, %cst_17 : tensor<1x256x1280xf32>
-    %5337 = stablehlo.multiply %5336, %5318 : tensor<1x256x1280xf32>
-    %5338 = stablehlo.add %5337, %cst_18 : tensor<1x256x1280xf32>
-    %5339 = stablehlo.multiply %5317, %5330 : tensor<1x256x1280xf32>
-    %5340 = stablehlo.divide %5339, %5338 : tensor<1x256x1280xf32>
-    %5341 = stablehlo.clamp %cst_19, %5340, %cst_20 : tensor<1x256x1280xf32>
-    %5342 = stablehlo.convert %5341 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %5343 = stablehlo.add %5342, %cst_2 : tensor<1x256x1280xbf16>
-    %5344 = stablehlo.multiply %5343, %5314 : tensor<1x256x1280xbf16>
-    %5345 = stablehlo.reshape %5344 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5346 = stablehlo.convert %5345 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5347 = stablehlo.dot_general %5346, %arg439, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x1280xf32>) -> tensor<256x1280xf32>
-    %5348 = stablehlo.broadcast_in_dim %5347, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5349 = stablehlo.multiply %5348, %127 : tensor<256x1280xf32>
-    %5350 = stablehlo.broadcast_in_dim %5349, dims = [0, 1] : (tensor<256x1280xf32>) -> tensor<256x1280xf32>
-    %5351 = stablehlo.broadcast_in_dim %arg440, dims = [1] : (tensor<1280xf32>) -> tensor<256x1280xf32>
-    %5352 = stablehlo.add %5350, %5351 : tensor<256x1280xf32>
-    %5353 = stablehlo.convert %5352 : (tensor<256x1280xf32>) -> tensor<256x1280xbf16>
-    %5354 = stablehlo.reshape %5353 : (tensor<256x1280xbf16>) -> tensor<1x256x1280xbf16>
-    %5355 = stablehlo.add %5354, %5266 : tensor<1x256x1280xbf16>
-    %5356 = stablehlo.convert %5355 : (tensor<1x256x1280xbf16>) -> tensor<1x256x1280xf32>
-    %5357 = stablehlo.convert %5356 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf64>
-    %5358 = stablehlo.reduce(%5357 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5359 = stablehlo.reshape %5358 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5360 = stablehlo.broadcast_in_dim %5359, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5361 = stablehlo.divide %5360, %142 : tensor<1x256x1xf64>
-    %5362 = stablehlo.broadcast_in_dim %5357, dims = [0, 1, 2] : (tensor<1x256x1280xf64>) -> tensor<1x256x1280xf64>
-    %5363 = stablehlo.broadcast_in_dim %5361, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1280xf64>
-    %5364 = stablehlo.subtract %5362, %5363 : tensor<1x256x1280xf64>
-    %5365 = stablehlo.multiply %5364, %5364 : tensor<1x256x1280xf64>
-    %5366 = stablehlo.reduce(%5365 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %5367 = stablehlo.reshape %5366 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %5368 = stablehlo.broadcast_in_dim %5367, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %5369 = stablehlo.divide %5368, %142 : tensor<1x256x1xf64>
-    %5370 = stablehlo.convert %5369 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %5371 = stablehlo.reduce(%5356 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x1280xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %5372 = stablehlo.reshape %5371 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %5373 = stablehlo.broadcast_in_dim %5372, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5374 = stablehlo.divide %5373, %158 : tensor<1x256x1xf32>
-    %5375 = stablehlo.broadcast_in_dim %5370, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %5376 = stablehlo.add %5375, %161 : tensor<1x256x1xf32>
-    %5377 = stablehlo.rsqrt %5376 : tensor<1x256x1xf32>
-    %5378 = stablehlo.broadcast_in_dim %5356, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5379 = stablehlo.broadcast_in_dim %5374, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5380 = stablehlo.subtract %5378, %5379 : tensor<1x256x1280xf32>
-    %5381 = stablehlo.broadcast_in_dim %5380, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5382 = stablehlo.broadcast_in_dim %5377, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1280xf32>
-    %5383 = stablehlo.multiply %5381, %5382 : tensor<1x256x1280xf32>
-    %5384 = stablehlo.convert %arg111 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5385 = stablehlo.broadcast_in_dim %5383, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5386 = stablehlo.broadcast_in_dim %5384, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5387 = stablehlo.multiply %5385, %5386 : tensor<1x256x1280xf32>
-    %5388 = stablehlo.convert %arg112 : (tensor<1280xbf16>) -> tensor<1280xf32>
-    %5389 = stablehlo.broadcast_in_dim %5387, dims = [0, 1, 2] : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xf32>
-    %5390 = stablehlo.broadcast_in_dim %5388, dims = [2] : (tensor<1280xf32>) -> tensor<1x256x1280xf32>
-    %5391 = stablehlo.add %5389, %5390 : tensor<1x256x1280xf32>
-    %5392 = stablehlo.convert %5391 : (tensor<1x256x1280xf32>) -> tensor<1x256x1280xbf16>
-    %5393 = stablehlo.reshape %5392 : (tensor<1x256x1280xbf16>) -> tensor<256x1280xbf16>
-    %5394 = stablehlo.convert %5393 : (tensor<256x1280xbf16>) -> tensor<256x1280xf32>
-    %5395 = stablehlo.dot_general %5394, %arg441, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x256xf32>) -> tensor<256x256xf32>
-    %5396 = stablehlo.broadcast_in_dim %5395, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5397 = stablehlo.multiply %5396, %273 : tensor<256x256xf32>
-    %5398 = stablehlo.broadcast_in_dim %5397, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %5399 = stablehlo.broadcast_in_dim %arg442, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %5400 = stablehlo.add %5398, %5399 : tensor<256x256xf32>
-    %5401 = stablehlo.convert %5400 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %5402 = stablehlo.reshape %5401 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %5403 = stablehlo.dot_general %5394, %arg443, contracting_dims = [1] x [0] : (tensor<256x1280xf32>, tensor<1280x768xf32>) -> tensor<256x768xf32>
-    %5404 = stablehlo.broadcast_in_dim %5403, dims = [0, 1] : (tensor<256x768xf32>) -> tensor<256x768xf32>
-    %5405 = stablehlo.broadcast_in_dim %68, dims = [] : (tensor<f32>) -> tensor<256x768xf32>
-    %5406 = stablehlo.multiply %5404, %5405 : tensor<256x768xf32>
-    %5407 = stablehlo.broadcast_in_dim %5406, dims = [0, 1] : (tensor<256x768xf32>) -> tensor<256x768xf32>
-    %5408 = stablehlo.broadcast_in_dim %arg444, dims = [1] : (tensor<768xf32>) -> tensor<256x768xf32>
-    %5409 = stablehlo.add %5407, %5408 : tensor<256x768xf32>
-    %5410 = stablehlo.convert %5409 : (tensor<256x768xf32>) -> tensor<256x768xbf16>
-    %5411 = stablehlo.reshape %5410 : (tensor<256x768xbf16>) -> tensor<1x256x768xbf16>
-    %5412 = stablehlo.reshape %5402 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %5413 = stablehlo.transpose %5412, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %5414 = stablehlo.reshape %5411 : (tensor<1x256x768xbf16>) -> tensor<1x256x8x96xbf16>
-    %5415 = stablehlo.transpose %5414, dims = [0, 2, 1, 3] : (tensor<1x256x8x96xbf16>) -> tensor<1x8x256x96xbf16>
-    %5416 = stablehlo.transpose %5413, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %5417 = stablehlo.reshape %5416 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %5418 = stablehlo.broadcast_in_dim %5417, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %5419 = stablehlo.dot_general %arg445, %5418, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x2048x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x2048x256xbf16>
-    %5420 = stablehlo.reshape %5419 : (tensor<8x2048x256xbf16>) -> tensor<1x8x2048x256xbf16>
-    %5421 = stablehlo.broadcast_in_dim %5420, dims = [0, 1, 2, 3] : (tensor<1x8x2048x256xbf16>) -> tensor<1x8x2048x256xbf16>
-    %5422 = stablehlo.broadcast_in_dim %96, dims = [] : (tensor<bf16>) -> tensor<1x8x2048x256xbf16>
-    %5423 = stablehlo.divide %5421, %5422 : tensor<1x8x2048x256xbf16>
-    %5424 = stablehlo.convert %5423 : (tensor<1x8x2048x256xbf16>) -> tensor<1x8x2048x256xf32>
-    %5425 = stablehlo.reduce(%5424 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x2048x256xf32>, tensor<f32>) -> tensor<1x8x2048xf32>
-    %5426 = stablehlo.reshape %5425 : (tensor<1x8x2048xf32>) -> tensor<1x8x2048x1xf32>
-    %5427 = stablehlo.broadcast_in_dim %5424, dims = [0, 1, 2, 3] : (tensor<1x8x2048x256xf32>) -> tensor<1x8x2048x256xf32>
-    %5428 = stablehlo.broadcast_in_dim %5426, dims = [0, 1, 2, 3] : (tensor<1x8x2048x1xf32>) -> tensor<1x8x2048x256xf32>
-    %5429 = stablehlo.subtract %5427, %5428 : tensor<1x8x2048x256xf32>
-    %5430 = stablehlo.exponential %5429 : tensor<1x8x2048x256xf32>
-    %5431 = stablehlo.reduce(%5430 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x2048x256xf32>, tensor<f32>) -> tensor<1x8x2048xf32>
-    %5432 = stablehlo.reshape %5431 : (tensor<1x8x2048xf32>) -> tensor<1x8x2048x1xf32>
-    %5433 = stablehlo.broadcast_in_dim %5430, dims = [0, 1, 2, 3] : (tensor<1x8x2048x256xf32>) -> tensor<1x8x2048x256xf32>
-    %5434 = stablehlo.broadcast_in_dim %5432, dims = [0, 1, 2, 3] : (tensor<1x8x2048x1xf32>) -> tensor<1x8x2048x256xf32>
-    %5435 = stablehlo.divide %5433, %5434 : tensor<1x8x2048x256xf32>
-    %5436 = stablehlo.convert %5435 : (tensor<1x8x2048x256xf32>) -> tensor<1x8x2048x256xbf16>
-    %5437 = stablehlo.reshape %5436 : (tensor<1x8x2048x256xbf16>) -> tensor<8x2048x256xbf16>
-    %5438 = stablehlo.reshape %5415 : (tensor<1x8x256x96xbf16>) -> tensor<8x256x96xbf16>
-    %5439 = stablehlo.broadcast_in_dim %5438, dims = [0, 1, 2] : (tensor<8x256x96xbf16>) -> tensor<8x256x96xbf16>
-    %5440 = stablehlo.dot_general %5437, %5439, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x2048x256xbf16>, tensor<8x256x96xbf16>) -> tensor<8x2048x96xbf16>
-    %5441 = stablehlo.reshape %5440 : (tensor<8x2048x96xbf16>) -> tensor<1x8x2048x96xbf16>
-    %5442 = stablehlo.transpose %5441, dims = [0, 2, 1, 3] : (tensor<1x8x2048x96xbf16>) -> tensor<1x2048x8x96xbf16>
-    %5443 = stablehlo.reshape %5442 : (tensor<1x2048x8x96xbf16>) -> tensor<1x2048x768xbf16>
-    %5444 = stablehlo.reshape %5443 : (tensor<1x2048x768xbf16>) -> tensor<2048x768xbf16>
-    %5445 = stablehlo.convert %5444 : (tensor<2048x768xbf16>) -> tensor<2048x768xf32>
-    %5446 = stablehlo.dot_general %5445, %arg446, contracting_dims = [1] x [0] : (tensor<2048x768xf32>, tensor<768x768xf32>) -> tensor<2048x768xf32>
-    %5447 = stablehlo.broadcast_in_dim %5446, dims = [0, 1] : (tensor<2048x768xf32>) -> tensor<2048x768xf32>
-    %5448 = stablehlo.broadcast_in_dim %68, dims = [] : (tensor<f32>) -> tensor<2048x768xf32>
-    %5449 = stablehlo.multiply %5447, %5448 : tensor<2048x768xf32>
-    %5450 = stablehlo.broadcast_in_dim %5449, dims = [0, 1] : (tensor<2048x768xf32>) -> tensor<2048x768xf32>
-    %5451 = stablehlo.broadcast_in_dim %arg447, dims = [1] : (tensor<768xf32>) -> tensor<2048x768xf32>
-    %5452 = stablehlo.add %5450, %5451 : tensor<2048x768xf32>
-    %5453 = stablehlo.convert %5452 : (tensor<2048x768xf32>) -> tensor<2048x768xbf16>
-    %5454 = stablehlo.reshape %5453 : (tensor<2048x768xbf16>) -> tensor<1x2048x768xbf16>
-    %5455 = stablehlo.convert %5454 : (tensor<1x2048x768xbf16>) -> tensor<1x2048x768xf32>
-    %5456 = stablehlo.convert %5455 : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf64>
-    %5457 = stablehlo.reduce(%5456 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x2048x768xf64>, tensor<f64>) -> tensor<1x2048xf64>
-    %5458 = stablehlo.reshape %5457 : (tensor<1x2048xf64>) -> tensor<1x2048x1xf64>
-    %5459 = stablehlo.broadcast_in_dim %5458, dims = [0, 1, 2] : (tensor<1x2048x1xf64>) -> tensor<1x2048x1xf64>
-    %5460 = stablehlo.divide %5459, %25 : tensor<1x2048x1xf64>
-    %5461 = stablehlo.broadcast_in_dim %5456, dims = [0, 1, 2] : (tensor<1x2048x768xf64>) -> tensor<1x2048x768xf64>
-    %5462 = stablehlo.broadcast_in_dim %5460, dims = [0, 1, 2] : (tensor<1x2048x1xf64>) -> tensor<1x2048x768xf64>
-    %5463 = stablehlo.subtract %5461, %5462 : tensor<1x2048x768xf64>
-    %5464 = stablehlo.multiply %5463, %5463 : tensor<1x2048x768xf64>
-    %5465 = stablehlo.reduce(%5464 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x2048x768xf64>, tensor<f64>) -> tensor<1x2048xf64>
-    %5466 = stablehlo.reshape %5465 : (tensor<1x2048xf64>) -> tensor<1x2048x1xf64>
-    %5467 = stablehlo.broadcast_in_dim %5466, dims = [0, 1, 2] : (tensor<1x2048x1xf64>) -> tensor<1x2048x1xf64>
-    %5468 = stablehlo.divide %5467, %25 : tensor<1x2048x1xf64>
-    %5469 = stablehlo.convert %5468 : (tensor<1x2048x1xf64>) -> tensor<1x2048x1xf32>
-    %5470 = stablehlo.reduce(%5455 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x2048x768xf32>, tensor<f32>) -> tensor<1x2048xf32>
-    %5471 = stablehlo.reshape %5470 : (tensor<1x2048xf32>) -> tensor<1x2048x1xf32>
-    %5472 = stablehlo.broadcast_in_dim %5471, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x1xf32>
-    %5473 = stablehlo.divide %5472, %41 : tensor<1x2048x1xf32>
-    %5474 = stablehlo.broadcast_in_dim %5469, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x1xf32>
-    %5475 = stablehlo.add %5474, %46 : tensor<1x2048x1xf32>
-    %5476 = stablehlo.rsqrt %5475 : tensor<1x2048x1xf32>
-    %5477 = stablehlo.broadcast_in_dim %5455, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %5478 = stablehlo.broadcast_in_dim %5473, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x768xf32>
-    %5479 = stablehlo.subtract %5477, %5478 : tensor<1x2048x768xf32>
-    %5480 = stablehlo.broadcast_in_dim %5479, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %5481 = stablehlo.broadcast_in_dim %5476, dims = [0, 1, 2] : (tensor<1x2048x1xf32>) -> tensor<1x2048x768xf32>
-    %5482 = stablehlo.multiply %5480, %5481 : tensor<1x2048x768xf32>
-    %5483 = stablehlo.convert %arg113 : (tensor<768xbf16>) -> tensor<768xf32>
-    %5484 = stablehlo.broadcast_in_dim %5482, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %5485 = stablehlo.broadcast_in_dim %5483, dims = [2] : (tensor<768xf32>) -> tensor<1x2048x768xf32>
-    %5486 = stablehlo.multiply %5484, %5485 : tensor<1x2048x768xf32>
-    %5487 = stablehlo.convert %arg114 : (tensor<768xbf16>) -> tensor<768xf32>
-    %5488 = stablehlo.broadcast_in_dim %5486, dims = [0, 1, 2] : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xf32>
-    %5489 = stablehlo.broadcast_in_dim %5487, dims = [2] : (tensor<768xf32>) -> tensor<1x2048x768xf32>
-    %5490 = stablehlo.add %5488, %5489 : tensor<1x2048x768xf32>
-    %5491 = stablehlo.convert %5490 : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xbf16>
-    %5492 = stablehlo.reshape %5491 : (tensor<1x2048x768xbf16>) -> tensor<2048x768xbf16>
-    %5493 = stablehlo.convert %5492 : (tensor<2048x768xbf16>) -> tensor<2048x768xf32>
-    %5494 = stablehlo.dot_general %5493, %arg448, contracting_dims = [1] x [0] : (tensor<2048x768xf32>, tensor<768x768xf32>) -> tensor<2048x768xf32>
-    %5495 = stablehlo.broadcast_in_dim %5494, dims = [0, 1] : (tensor<2048x768xf32>) -> tensor<2048x768xf32>
-    %5496 = stablehlo.multiply %5495, %5448 : tensor<2048x768xf32>
-    %5497 = stablehlo.broadcast_in_dim %5496, dims = [0, 1] : (tensor<2048x768xf32>) -> tensor<2048x768xf32>
-    %5498 = stablehlo.broadcast_in_dim %arg449, dims = [1] : (tensor<768xf32>) -> tensor<2048x768xf32>
-    %5499 = stablehlo.add %5497, %5498 : tensor<2048x768xf32>
-    %5500 = stablehlo.convert %5499 : (tensor<2048x768xf32>) -> tensor<2048x768xbf16>
-    %5501 = stablehlo.reshape %5500 : (tensor<2048x768xbf16>) -> tensor<1x2048x768xbf16>
-    %5502 = stablehlo.multiply %5501, %cst_23 : tensor<1x2048x768xbf16>
-    %5503 = stablehlo.rsqrt %cst_22 : tensor<1x2048x768xbf16>
-    %5504 = stablehlo.multiply %5501, %5503 : tensor<1x2048x768xbf16>
-    %5505 = stablehlo.convert %5504 : (tensor<1x2048x768xbf16>) -> tensor<1x2048x768xf32>
-    %5506 = stablehlo.clamp %cst_24, %5505, %cst_25 : tensor<1x2048x768xf32>
-    %5507 = stablehlo.multiply %5506, %5506 : tensor<1x2048x768xf32>
-    %5508 = stablehlo.multiply %cst_26, %5507 : tensor<1x2048x768xf32>
-    %5509 = stablehlo.add %5508, %cst_27 : tensor<1x2048x768xf32>
-    %5510 = stablehlo.multiply %5509, %5507 : tensor<1x2048x768xf32>
-    %5511 = stablehlo.add %5510, %cst_28 : tensor<1x2048x768xf32>
-    %5512 = stablehlo.multiply %5511, %5507 : tensor<1x2048x768xf32>
-    %5513 = stablehlo.add %5512, %cst_29 : tensor<1x2048x768xf32>
-    %5514 = stablehlo.multiply %5513, %5507 : tensor<1x2048x768xf32>
-    %5515 = stablehlo.add %5514, %cst_30 : tensor<1x2048x768xf32>
-    %5516 = stablehlo.multiply %5515, %5507 : tensor<1x2048x768xf32>
-    %5517 = stablehlo.add %5516, %cst_31 : tensor<1x2048x768xf32>
-    %5518 = stablehlo.multiply %5517, %5507 : tensor<1x2048x768xf32>
-    %5519 = stablehlo.add %5518, %cst_32 : tensor<1x2048x768xf32>
-    %5520 = stablehlo.multiply %cst_33, %5507 : tensor<1x2048x768xf32>
-    %5521 = stablehlo.add %5520, %cst_34 : tensor<1x2048x768xf32>
-    %5522 = stablehlo.multiply %5521, %5507 : tensor<1x2048x768xf32>
-    %5523 = stablehlo.add %5522, %cst_35 : tensor<1x2048x768xf32>
-    %5524 = stablehlo.multiply %5523, %5507 : tensor<1x2048x768xf32>
-    %5525 = stablehlo.add %5524, %cst_36 : tensor<1x2048x768xf32>
-    %5526 = stablehlo.multiply %5525, %5507 : tensor<1x2048x768xf32>
-    %5527 = stablehlo.add %5526, %cst_37 : tensor<1x2048x768xf32>
-    %5528 = stablehlo.multiply %5506, %5519 : tensor<1x2048x768xf32>
-    %5529 = stablehlo.divide %5528, %5527 : tensor<1x2048x768xf32>
-    %5530 = stablehlo.clamp %cst_38, %5529, %cst_39 : tensor<1x2048x768xf32>
-    %5531 = stablehlo.convert %5530 : (tensor<1x2048x768xf32>) -> tensor<1x2048x768xbf16>
-    %5532 = stablehlo.add %5531, %cst_21 : tensor<1x2048x768xbf16>
-    %5533 = stablehlo.multiply %5532, %5502 : tensor<1x2048x768xbf16>
-    %5534 = stablehlo.reshape %5533 : (tensor<1x2048x768xbf16>) -> tensor<2048x768xbf16>
-    %5535 = stablehlo.convert %5534 : (tensor<2048x768xbf16>) -> tensor<2048x768xf32>
-    %5536 = stablehlo.dot_general %5535, %arg450, contracting_dims = [1] x [0] : (tensor<2048x768xf32>, tensor<768x768xf32>) -> tensor<2048x768xf32>
-    %5537 = stablehlo.broadcast_in_dim %5536, dims = [0, 1] : (tensor<2048x768xf32>) -> tensor<2048x768xf32>
-    %5538 = stablehlo.multiply %5537, %5448 : tensor<2048x768xf32>
-    %5539 = stablehlo.broadcast_in_dim %5538, dims = [0, 1] : (tensor<2048x768xf32>) -> tensor<2048x768xf32>
-    %5540 = stablehlo.broadcast_in_dim %arg451, dims = [1] : (tensor<768xf32>) -> tensor<2048x768xf32>
-    %5541 = stablehlo.add %5539, %5540 : tensor<2048x768xf32>
-    %5542 = stablehlo.convert %5541 : (tensor<2048x768xf32>) -> tensor<2048x768xbf16>
-    %5543 = stablehlo.reshape %5542 : (tensor<2048x768xbf16>) -> tensor<1x2048x768xbf16>
-    %5544 = stablehlo.add %5543, %5454 : tensor<1x2048x768xbf16>
-    %5545 = stablehlo.reshape %5544 : (tensor<1x2048x768xbf16>) -> tensor<2048x768xbf16>
-    %5546 = stablehlo.dot_general %5545, %arg452, contracting_dims = [1] x [0] : (tensor<2048x768xbf16>, tensor<768x262xbf16>) -> tensor<2048x262xbf16>
-    %5547 = stablehlo.broadcast_in_dim %5546, dims = [0, 1] : (tensor<2048x262xbf16>) -> tensor<2048x262xbf16>
-    %5548 = stablehlo.broadcast_in_dim %arg115, dims = [1] : (tensor<262xbf16>) -> tensor<2048x262xbf16>
-    %5549 = stablehlo.add %5547, %5548 : tensor<2048x262xbf16>
-    %5550 = stablehlo.reshape %5549 : (tensor<2048x262xbf16>) -> tensor<1x2048x262xbf16>
-    return %5550 : tensor<1x2048x262xbf16>
-  }
-}
diff --git a/mlir_tests/ResNet18.mlir b/mlir_tests/ResNet18.mlir
deleted file mode 100644
index f796cbbc..00000000
--- a/mlir_tests/ResNet18.mlir
+++ /dev/null
@@ -1,403 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x224x224xbf16>, %arg1: tensor<64x3x7x7xbf16>, %arg2: tensor<64x64x3x3xbf16>, %arg3: tensor<64x64x3x3xbf16>, %arg4: tensor<64x64x3x3xbf16>, %arg5: tensor<64x64x3x3xbf16>, %arg6: tensor<128x64x3x3xbf16>, %arg7: tensor<128x128x3x3xbf16>, %arg8: tensor<128x64x1x1xbf16>, %arg9: tensor<128x128x3x3xbf16>, %arg10: tensor<128x128x3x3xbf16>, %arg11: tensor<256x128x3x3xbf16>, %arg12: tensor<256x256x3x3xbf16>, %arg13: tensor<256x128x1x1xbf16>, %arg14: tensor<256x256x3x3xbf16>, %arg15: tensor<256x256x3x3xbf16>, %arg16: tensor<512x256x3x3xbf16>, %arg17: tensor<512x512x3x3xbf16>, %arg18: tensor<512x256x1x1xbf16>, %arg19: tensor<512x512x3x3xbf16>, %arg20: tensor<512x512x3x3xbf16>, %arg21: tensor<64x1x1xf32>, %arg22: tensor<64x1x1xf32>, %arg23: tensor<64x1x1xbf16>, %arg24: tensor<64x1x1xbf16>, %arg25: tensor<64x1x1xf32>, %arg26: tensor<64x1x1xf32>, %arg27: tensor<64x1x1xbf16>, %arg28: tensor<64x1x1xbf16>, %arg29: tensor<64x1x1xf32>, %arg30: tensor<64x1x1xf32>, %arg31: tensor<64x1x1xbf16>, %arg32: tensor<64x1x1xbf16>, %arg33: tensor<64x1x1xf32>, %arg34: tensor<64x1x1xf32>, %arg35: tensor<64x1x1xbf16>, %arg36: tensor<64x1x1xbf16>, %arg37: tensor<64x1x1xf32>, %arg38: tensor<64x1x1xf32>, %arg39: tensor<64x1x1xbf16>, %arg40: tensor<64x1x1xbf16>, %arg41: tensor<128x1x1xf32>, %arg42: tensor<128x1x1xf32>, %arg43: tensor<128x1x1xbf16>, %arg44: tensor<128x1x1xbf16>, %arg45: tensor<128x1x1xf32>, %arg46: tensor<128x1x1xf32>, %arg47: tensor<128x1x1xbf16>, %arg48: tensor<128x1x1xbf16>, %arg49: tensor<128x1x1xf32>, %arg50: tensor<128x1x1xf32>, %arg51: tensor<128x1x1xbf16>, %arg52: tensor<128x1x1xbf16>, %arg53: tensor<128x1x1xf32>, %arg54: tensor<128x1x1xf32>, %arg55: tensor<128x1x1xbf16>, %arg56: tensor<128x1x1xbf16>, %arg57: tensor<128x1x1xf32>, %arg58: tensor<128x1x1xf32>, %arg59: tensor<128x1x1xbf16>, %arg60: tensor<128x1x1xbf16>, %arg61: tensor<256x1x1xf32>, %arg62: tensor<256x1x1xf32>, %arg63: tensor<256x1x1xbf16>, %arg64: tensor<256x1x1xbf16>, %arg65: tensor<256x1x1xf32>, %arg66: tensor<256x1x1xf32>, %arg67: tensor<256x1x1xbf16>, %arg68: tensor<256x1x1xbf16>, %arg69: tensor<256x1x1xf32>, %arg70: tensor<256x1x1xf32>, %arg71: tensor<256x1x1xbf16>, %arg72: tensor<256x1x1xbf16>, %arg73: tensor<256x1x1xf32>, %arg74: tensor<256x1x1xf32>, %arg75: tensor<256x1x1xbf16>, %arg76: tensor<256x1x1xbf16>, %arg77: tensor<256x1x1xf32>, %arg78: tensor<256x1x1xf32>, %arg79: tensor<256x1x1xbf16>, %arg80: tensor<256x1x1xbf16>, %arg81: tensor<512x1x1xf32>, %arg82: tensor<512x1x1xf32>, %arg83: tensor<512x1x1xbf16>, %arg84: tensor<512x1x1xbf16>, %arg85: tensor<512x1x1xf32>, %arg86: tensor<512x1x1xf32>, %arg87: tensor<512x1x1xbf16>, %arg88: tensor<512x1x1xbf16>, %arg89: tensor<512x1x1xf32>, %arg90: tensor<512x1x1xf32>, %arg91: tensor<512x1x1xbf16>, %arg92: tensor<512x1x1xbf16>, %arg93: tensor<512x1x1xf32>, %arg94: tensor<512x1x1xf32>, %arg95: tensor<512x1x1xbf16>, %arg96: tensor<512x1x1xbf16>, %arg97: tensor<512x1x1xf32>, %arg98: tensor<512x1x1xf32>, %arg99: tensor<512x1x1xbf16>, %arg100: tensor<512x1x1xbf16>, %arg101: tensor<512x1000xf32>, %arg102: tensor<1000xf32>) -> tensor<1x1000xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<1x64x112x112xbf16>
-    %cst_0 = stablehlo.constant dense<0xFF80> : tensor<bf16>
-    %cst_1 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x56x56xbf16>
-    %cst_2 = stablehlo.constant dense<0.000000e+00> : tensor<1x128x28x28xbf16>
-    %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x14x14xbf16>
-    %cst_4 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x7x7xbf16>
-    %cst_5 = stablehlo.constant dense<0.000000e+00> : tensor<bf16>
-    %cst_6 = arith.constant dense<49> : tensor<1xi64>
-    %cst_7 = arith.constant dense<1> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x224x224xbf16>, tensor<64x3x7x7xbf16>) -> tensor<1x64x112x112xbf16>
-    %1 = stablehlo.convert %0 : (tensor<1x64x112x112xbf16>) -> tensor<1x64x112x112xf32>
-    %2 = stablehlo.broadcast_in_dim %1, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %3 = stablehlo.broadcast_in_dim %arg21, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %4 = stablehlo.subtract %2, %3 : tensor<1x64x112x112xf32>
-    %5 = stablehlo.broadcast_in_dim %4, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %6 = stablehlo.broadcast_in_dim %arg22, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %7 = stablehlo.multiply %5, %6 : tensor<1x64x112x112xf32>
-    %8 = stablehlo.convert %arg23 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %9 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %10 = stablehlo.broadcast_in_dim %8, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %11 = stablehlo.multiply %9, %10 : tensor<1x64x112x112xf32>
-    %12 = stablehlo.convert %arg24 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %13 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %14 = stablehlo.broadcast_in_dim %12, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %15 = stablehlo.add %13, %14 : tensor<1x64x112x112xf32>
-    %16 = stablehlo.convert %15 : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xbf16>
-    %17 = stablehlo.maximum %16, %cst : tensor<1x64x112x112xbf16>
-    %18 = "stablehlo.reduce_window"(%17, %cst_0) <{padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = array<i64: 1, 1, 1, 1>, window_dimensions = array<i64: 1, 1, 3, 3>, window_strides = array<i64: 1, 1, 2, 2>}> ({
-    ^bb0(%arg103: tensor<bf16>, %arg104: tensor<bf16>):
-      %385 = stablehlo.maximum %arg103, %arg104 : tensor<bf16>
-      stablehlo.return %385 : tensor<bf16>
-    }) : (tensor<1x64x112x112xbf16>, tensor<bf16>) -> tensor<1x64x56x56xbf16>
-    %19 = stablehlo.convolution(%18, %arg2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %20 = stablehlo.convert %19 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %21 = stablehlo.broadcast_in_dim %20, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %22 = stablehlo.broadcast_in_dim %arg25, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %23 = stablehlo.subtract %21, %22 : tensor<1x64x56x56xf32>
-    %24 = stablehlo.broadcast_in_dim %23, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %25 = stablehlo.broadcast_in_dim %arg26, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %26 = stablehlo.multiply %24, %25 : tensor<1x64x56x56xf32>
-    %27 = stablehlo.convert %arg27 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %28 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %29 = stablehlo.broadcast_in_dim %27, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %30 = stablehlo.multiply %28, %29 : tensor<1x64x56x56xf32>
-    %31 = stablehlo.convert %arg28 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %32 = stablehlo.broadcast_in_dim %30, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %33 = stablehlo.broadcast_in_dim %31, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %34 = stablehlo.add %32, %33 : tensor<1x64x56x56xf32>
-    %35 = stablehlo.convert %34 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %36 = stablehlo.maximum %35, %cst_1 : tensor<1x64x56x56xbf16>
-    %37 = stablehlo.convolution(%36, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %38 = stablehlo.convert %37 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %39 = stablehlo.broadcast_in_dim %38, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %40 = stablehlo.broadcast_in_dim %arg29, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %41 = stablehlo.subtract %39, %40 : tensor<1x64x56x56xf32>
-    %42 = stablehlo.broadcast_in_dim %41, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %43 = stablehlo.broadcast_in_dim %arg30, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %44 = stablehlo.multiply %42, %43 : tensor<1x64x56x56xf32>
-    %45 = stablehlo.convert %arg31 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %47 = stablehlo.broadcast_in_dim %45, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %48 = stablehlo.multiply %46, %47 : tensor<1x64x56x56xf32>
-    %49 = stablehlo.convert %arg32 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %50 = stablehlo.broadcast_in_dim %48, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %51 = stablehlo.broadcast_in_dim %49, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %52 = stablehlo.add %50, %51 : tensor<1x64x56x56xf32>
-    %53 = stablehlo.convert %52 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %54 = stablehlo.add %53, %18 : tensor<1x64x56x56xbf16>
-    %55 = stablehlo.maximum %54, %cst_1 : tensor<1x64x56x56xbf16>
-    %56 = stablehlo.convolution(%55, %arg4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %57 = stablehlo.convert %56 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %58 = stablehlo.broadcast_in_dim %57, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %59 = stablehlo.broadcast_in_dim %arg33, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %60 = stablehlo.subtract %58, %59 : tensor<1x64x56x56xf32>
-    %61 = stablehlo.broadcast_in_dim %60, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %62 = stablehlo.broadcast_in_dim %arg34, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %63 = stablehlo.multiply %61, %62 : tensor<1x64x56x56xf32>
-    %64 = stablehlo.convert %arg35 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %65 = stablehlo.broadcast_in_dim %63, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %66 = stablehlo.broadcast_in_dim %64, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %67 = stablehlo.multiply %65, %66 : tensor<1x64x56x56xf32>
-    %68 = stablehlo.convert %arg36 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %69 = stablehlo.broadcast_in_dim %67, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %70 = stablehlo.broadcast_in_dim %68, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %71 = stablehlo.add %69, %70 : tensor<1x64x56x56xf32>
-    %72 = stablehlo.convert %71 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %73 = stablehlo.maximum %72, %cst_1 : tensor<1x64x56x56xbf16>
-    %74 = stablehlo.convolution(%73, %arg5) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %75 = stablehlo.convert %74 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %76 = stablehlo.broadcast_in_dim %75, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %77 = stablehlo.broadcast_in_dim %arg37, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %78 = stablehlo.subtract %76, %77 : tensor<1x64x56x56xf32>
-    %79 = stablehlo.broadcast_in_dim %78, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %80 = stablehlo.broadcast_in_dim %arg38, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %81 = stablehlo.multiply %79, %80 : tensor<1x64x56x56xf32>
-    %82 = stablehlo.convert %arg39 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %83 = stablehlo.broadcast_in_dim %81, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %84 = stablehlo.broadcast_in_dim %82, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %85 = stablehlo.multiply %83, %84 : tensor<1x64x56x56xf32>
-    %86 = stablehlo.convert %arg40 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %87 = stablehlo.broadcast_in_dim %85, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %88 = stablehlo.broadcast_in_dim %86, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %89 = stablehlo.add %87, %88 : tensor<1x64x56x56xf32>
-    %90 = stablehlo.convert %89 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %91 = stablehlo.add %90, %55 : tensor<1x64x56x56xbf16>
-    %92 = stablehlo.maximum %91, %cst_1 : tensor<1x64x56x56xbf16>
-    %93 = stablehlo.convolution(%92, %arg6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<128x64x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %94 = stablehlo.convert %93 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %95 = stablehlo.broadcast_in_dim %94, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %96 = stablehlo.broadcast_in_dim %arg41, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %97 = stablehlo.subtract %95, %96 : tensor<1x128x28x28xf32>
-    %98 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %99 = stablehlo.broadcast_in_dim %arg42, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %100 = stablehlo.multiply %98, %99 : tensor<1x128x28x28xf32>
-    %101 = stablehlo.convert %arg43 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %102 = stablehlo.broadcast_in_dim %100, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %103 = stablehlo.broadcast_in_dim %101, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %104 = stablehlo.multiply %102, %103 : tensor<1x128x28x28xf32>
-    %105 = stablehlo.convert %arg44 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %106 = stablehlo.broadcast_in_dim %104, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %107 = stablehlo.broadcast_in_dim %105, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %108 = stablehlo.add %106, %107 : tensor<1x128x28x28xf32>
-    %109 = stablehlo.convert %108 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %110 = stablehlo.maximum %109, %cst_2 : tensor<1x128x28x28xbf16>
-    %111 = stablehlo.convolution(%110, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %112 = stablehlo.convert %111 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %113 = stablehlo.broadcast_in_dim %112, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %114 = stablehlo.broadcast_in_dim %arg45, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %115 = stablehlo.subtract %113, %114 : tensor<1x128x28x28xf32>
-    %116 = stablehlo.broadcast_in_dim %115, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %117 = stablehlo.broadcast_in_dim %arg46, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %118 = stablehlo.multiply %116, %117 : tensor<1x128x28x28xf32>
-    %119 = stablehlo.convert %arg47 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %120 = stablehlo.broadcast_in_dim %118, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %121 = stablehlo.broadcast_in_dim %119, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %122 = stablehlo.multiply %120, %121 : tensor<1x128x28x28xf32>
-    %123 = stablehlo.convert %arg48 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %124 = stablehlo.broadcast_in_dim %122, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %125 = stablehlo.broadcast_in_dim %123, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %126 = stablehlo.add %124, %125 : tensor<1x128x28x28xf32>
-    %127 = stablehlo.convert %126 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %128 = stablehlo.convolution(%92, %arg8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<128x64x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %129 = stablehlo.convert %128 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %130 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %131 = stablehlo.broadcast_in_dim %arg49, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %132 = stablehlo.subtract %130, %131 : tensor<1x128x28x28xf32>
-    %133 = stablehlo.broadcast_in_dim %132, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %134 = stablehlo.broadcast_in_dim %arg50, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %135 = stablehlo.multiply %133, %134 : tensor<1x128x28x28xf32>
-    %136 = stablehlo.convert %arg51 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %137 = stablehlo.broadcast_in_dim %135, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %138 = stablehlo.broadcast_in_dim %136, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %139 = stablehlo.multiply %137, %138 : tensor<1x128x28x28xf32>
-    %140 = stablehlo.convert %arg52 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %141 = stablehlo.broadcast_in_dim %139, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %142 = stablehlo.broadcast_in_dim %140, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %143 = stablehlo.add %141, %142 : tensor<1x128x28x28xf32>
-    %144 = stablehlo.convert %143 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %145 = stablehlo.add %127, %144 : tensor<1x128x28x28xbf16>
-    %146 = stablehlo.maximum %145, %cst_2 : tensor<1x128x28x28xbf16>
-    %147 = stablehlo.convolution(%146, %arg9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %148 = stablehlo.convert %147 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %149 = stablehlo.broadcast_in_dim %148, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %150 = stablehlo.broadcast_in_dim %arg53, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %151 = stablehlo.subtract %149, %150 : tensor<1x128x28x28xf32>
-    %152 = stablehlo.broadcast_in_dim %151, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %153 = stablehlo.broadcast_in_dim %arg54, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %154 = stablehlo.multiply %152, %153 : tensor<1x128x28x28xf32>
-    %155 = stablehlo.convert %arg55 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %156 = stablehlo.broadcast_in_dim %154, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %157 = stablehlo.broadcast_in_dim %155, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %158 = stablehlo.multiply %156, %157 : tensor<1x128x28x28xf32>
-    %159 = stablehlo.convert %arg56 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %160 = stablehlo.broadcast_in_dim %158, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %161 = stablehlo.broadcast_in_dim %159, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %162 = stablehlo.add %160, %161 : tensor<1x128x28x28xf32>
-    %163 = stablehlo.convert %162 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %164 = stablehlo.maximum %163, %cst_2 : tensor<1x128x28x28xbf16>
-    %165 = stablehlo.convolution(%164, %arg10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %166 = stablehlo.convert %165 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %167 = stablehlo.broadcast_in_dim %166, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %168 = stablehlo.broadcast_in_dim %arg57, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %169 = stablehlo.subtract %167, %168 : tensor<1x128x28x28xf32>
-    %170 = stablehlo.broadcast_in_dim %169, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %171 = stablehlo.broadcast_in_dim %arg58, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %172 = stablehlo.multiply %170, %171 : tensor<1x128x28x28xf32>
-    %173 = stablehlo.convert %arg59 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %174 = stablehlo.broadcast_in_dim %172, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %175 = stablehlo.broadcast_in_dim %173, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %176 = stablehlo.multiply %174, %175 : tensor<1x128x28x28xf32>
-    %177 = stablehlo.convert %arg60 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %178 = stablehlo.broadcast_in_dim %176, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %179 = stablehlo.broadcast_in_dim %177, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %180 = stablehlo.add %178, %179 : tensor<1x128x28x28xf32>
-    %181 = stablehlo.convert %180 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %182 = stablehlo.add %181, %146 : tensor<1x128x28x28xbf16>
-    %183 = stablehlo.maximum %182, %cst_2 : tensor<1x128x28x28xbf16>
-    %184 = stablehlo.convolution(%183, %arg11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %185 = stablehlo.convert %184 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %186 = stablehlo.broadcast_in_dim %185, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %187 = stablehlo.broadcast_in_dim %arg61, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %188 = stablehlo.subtract %186, %187 : tensor<1x256x14x14xf32>
-    %189 = stablehlo.broadcast_in_dim %188, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %190 = stablehlo.broadcast_in_dim %arg62, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %191 = stablehlo.multiply %189, %190 : tensor<1x256x14x14xf32>
-    %192 = stablehlo.convert %arg63 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %193 = stablehlo.broadcast_in_dim %191, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %194 = stablehlo.broadcast_in_dim %192, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %195 = stablehlo.multiply %193, %194 : tensor<1x256x14x14xf32>
-    %196 = stablehlo.convert %arg64 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %197 = stablehlo.broadcast_in_dim %195, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %198 = stablehlo.broadcast_in_dim %196, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %199 = stablehlo.add %197, %198 : tensor<1x256x14x14xf32>
-    %200 = stablehlo.convert %199 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %201 = stablehlo.maximum %200, %cst_3 : tensor<1x256x14x14xbf16>
-    %202 = stablehlo.convolution(%201, %arg12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %203 = stablehlo.convert %202 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %204 = stablehlo.broadcast_in_dim %203, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %205 = stablehlo.broadcast_in_dim %arg65, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %206 = stablehlo.subtract %204, %205 : tensor<1x256x14x14xf32>
-    %207 = stablehlo.broadcast_in_dim %206, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %208 = stablehlo.broadcast_in_dim %arg66, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %209 = stablehlo.multiply %207, %208 : tensor<1x256x14x14xf32>
-    %210 = stablehlo.convert %arg67 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %211 = stablehlo.broadcast_in_dim %209, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %212 = stablehlo.broadcast_in_dim %210, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %213 = stablehlo.multiply %211, %212 : tensor<1x256x14x14xf32>
-    %214 = stablehlo.convert %arg68 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %215 = stablehlo.broadcast_in_dim %213, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %216 = stablehlo.broadcast_in_dim %214, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %217 = stablehlo.add %215, %216 : tensor<1x256x14x14xf32>
-    %218 = stablehlo.convert %217 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %219 = stablehlo.convolution(%183, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<256x128x1x1xbf16>) -> tensor<1x256x14x14xbf16>
-    %220 = stablehlo.convert %219 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %221 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %222 = stablehlo.broadcast_in_dim %arg69, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %223 = stablehlo.subtract %221, %222 : tensor<1x256x14x14xf32>
-    %224 = stablehlo.broadcast_in_dim %223, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %225 = stablehlo.broadcast_in_dim %arg70, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %226 = stablehlo.multiply %224, %225 : tensor<1x256x14x14xf32>
-    %227 = stablehlo.convert %arg71 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %228 = stablehlo.broadcast_in_dim %226, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %229 = stablehlo.broadcast_in_dim %227, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %230 = stablehlo.multiply %228, %229 : tensor<1x256x14x14xf32>
-    %231 = stablehlo.convert %arg72 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %232 = stablehlo.broadcast_in_dim %230, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %233 = stablehlo.broadcast_in_dim %231, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %234 = stablehlo.add %232, %233 : tensor<1x256x14x14xf32>
-    %235 = stablehlo.convert %234 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %236 = stablehlo.add %218, %235 : tensor<1x256x14x14xbf16>
-    %237 = stablehlo.maximum %236, %cst_3 : tensor<1x256x14x14xbf16>
-    %238 = stablehlo.convolution(%237, %arg14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %239 = stablehlo.convert %238 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %240 = stablehlo.broadcast_in_dim %239, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %241 = stablehlo.broadcast_in_dim %arg73, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %242 = stablehlo.subtract %240, %241 : tensor<1x256x14x14xf32>
-    %243 = stablehlo.broadcast_in_dim %242, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %244 = stablehlo.broadcast_in_dim %arg74, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %245 = stablehlo.multiply %243, %244 : tensor<1x256x14x14xf32>
-    %246 = stablehlo.convert %arg75 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %247 = stablehlo.broadcast_in_dim %245, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %248 = stablehlo.broadcast_in_dim %246, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %249 = stablehlo.multiply %247, %248 : tensor<1x256x14x14xf32>
-    %250 = stablehlo.convert %arg76 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %251 = stablehlo.broadcast_in_dim %249, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %252 = stablehlo.broadcast_in_dim %250, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %253 = stablehlo.add %251, %252 : tensor<1x256x14x14xf32>
-    %254 = stablehlo.convert %253 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %255 = stablehlo.maximum %254, %cst_3 : tensor<1x256x14x14xbf16>
-    %256 = stablehlo.convolution(%255, %arg15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %257 = stablehlo.convert %256 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %258 = stablehlo.broadcast_in_dim %257, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %259 = stablehlo.broadcast_in_dim %arg77, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %260 = stablehlo.subtract %258, %259 : tensor<1x256x14x14xf32>
-    %261 = stablehlo.broadcast_in_dim %260, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %262 = stablehlo.broadcast_in_dim %arg78, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %263 = stablehlo.multiply %261, %262 : tensor<1x256x14x14xf32>
-    %264 = stablehlo.convert %arg79 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %265 = stablehlo.broadcast_in_dim %263, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %266 = stablehlo.broadcast_in_dim %264, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %267 = stablehlo.multiply %265, %266 : tensor<1x256x14x14xf32>
-    %268 = stablehlo.convert %arg80 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %269 = stablehlo.broadcast_in_dim %267, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %270 = stablehlo.broadcast_in_dim %268, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %271 = stablehlo.add %269, %270 : tensor<1x256x14x14xf32>
-    %272 = stablehlo.convert %271 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %273 = stablehlo.add %272, %237 : tensor<1x256x14x14xbf16>
-    %274 = stablehlo.maximum %273, %cst_3 : tensor<1x256x14x14xbf16>
-    %275 = stablehlo.convolution(%274, %arg16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x7x7xbf16>
-    %276 = stablehlo.convert %275 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %277 = stablehlo.broadcast_in_dim %276, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %278 = stablehlo.broadcast_in_dim %arg81, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %279 = stablehlo.subtract %277, %278 : tensor<1x512x7x7xf32>
-    %280 = stablehlo.broadcast_in_dim %279, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %281 = stablehlo.broadcast_in_dim %arg82, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %282 = stablehlo.multiply %280, %281 : tensor<1x512x7x7xf32>
-    %283 = stablehlo.convert %arg83 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %284 = stablehlo.broadcast_in_dim %282, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %285 = stablehlo.broadcast_in_dim %283, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %286 = stablehlo.multiply %284, %285 : tensor<1x512x7x7xf32>
-    %287 = stablehlo.convert %arg84 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %288 = stablehlo.broadcast_in_dim %286, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %289 = stablehlo.broadcast_in_dim %287, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %290 = stablehlo.add %288, %289 : tensor<1x512x7x7xf32>
-    %291 = stablehlo.convert %290 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %292 = stablehlo.maximum %291, %cst_4 : tensor<1x512x7x7xbf16>
-    %293 = stablehlo.convolution(%292, %arg17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x7x7xbf16>
-    %294 = stablehlo.convert %293 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %295 = stablehlo.broadcast_in_dim %294, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %296 = stablehlo.broadcast_in_dim %arg85, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %297 = stablehlo.subtract %295, %296 : tensor<1x512x7x7xf32>
-    %298 = stablehlo.broadcast_in_dim %297, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %299 = stablehlo.broadcast_in_dim %arg86, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %300 = stablehlo.multiply %298, %299 : tensor<1x512x7x7xf32>
-    %301 = stablehlo.convert %arg87 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %302 = stablehlo.broadcast_in_dim %300, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %303 = stablehlo.broadcast_in_dim %301, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %304 = stablehlo.multiply %302, %303 : tensor<1x512x7x7xf32>
-    %305 = stablehlo.convert %arg88 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %306 = stablehlo.broadcast_in_dim %304, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %307 = stablehlo.broadcast_in_dim %305, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %308 = stablehlo.add %306, %307 : tensor<1x512x7x7xf32>
-    %309 = stablehlo.convert %308 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %310 = stablehlo.convolution(%274, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<512x256x1x1xbf16>) -> tensor<1x512x7x7xbf16>
-    %311 = stablehlo.convert %310 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %312 = stablehlo.broadcast_in_dim %311, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %313 = stablehlo.broadcast_in_dim %arg89, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %314 = stablehlo.subtract %312, %313 : tensor<1x512x7x7xf32>
-    %315 = stablehlo.broadcast_in_dim %314, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %316 = stablehlo.broadcast_in_dim %arg90, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %317 = stablehlo.multiply %315, %316 : tensor<1x512x7x7xf32>
-    %318 = stablehlo.convert %arg91 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %319 = stablehlo.broadcast_in_dim %317, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %320 = stablehlo.broadcast_in_dim %318, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %321 = stablehlo.multiply %319, %320 : tensor<1x512x7x7xf32>
-    %322 = stablehlo.convert %arg92 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %323 = stablehlo.broadcast_in_dim %321, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %324 = stablehlo.broadcast_in_dim %322, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %325 = stablehlo.add %323, %324 : tensor<1x512x7x7xf32>
-    %326 = stablehlo.convert %325 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %327 = stablehlo.add %309, %326 : tensor<1x512x7x7xbf16>
-    %328 = stablehlo.maximum %327, %cst_4 : tensor<1x512x7x7xbf16>
-    %329 = stablehlo.convolution(%328, %arg19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x7x7xbf16>
-    %330 = stablehlo.convert %329 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %331 = stablehlo.broadcast_in_dim %330, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %332 = stablehlo.broadcast_in_dim %arg93, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %333 = stablehlo.subtract %331, %332 : tensor<1x512x7x7xf32>
-    %334 = stablehlo.broadcast_in_dim %333, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %335 = stablehlo.broadcast_in_dim %arg94, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %336 = stablehlo.multiply %334, %335 : tensor<1x512x7x7xf32>
-    %337 = stablehlo.convert %arg95 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %338 = stablehlo.broadcast_in_dim %336, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %339 = stablehlo.broadcast_in_dim %337, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %340 = stablehlo.multiply %338, %339 : tensor<1x512x7x7xf32>
-    %341 = stablehlo.convert %arg96 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %342 = stablehlo.broadcast_in_dim %340, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %343 = stablehlo.broadcast_in_dim %341, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %344 = stablehlo.add %342, %343 : tensor<1x512x7x7xf32>
-    %345 = stablehlo.convert %344 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %346 = stablehlo.maximum %345, %cst_4 : tensor<1x512x7x7xbf16>
-    %347 = stablehlo.convolution(%346, %arg20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x7x7xbf16>
-    %348 = stablehlo.convert %347 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %349 = stablehlo.broadcast_in_dim %348, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %350 = stablehlo.broadcast_in_dim %arg97, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %351 = stablehlo.subtract %349, %350 : tensor<1x512x7x7xf32>
-    %352 = stablehlo.broadcast_in_dim %351, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %353 = stablehlo.broadcast_in_dim %arg98, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %354 = stablehlo.multiply %352, %353 : tensor<1x512x7x7xf32>
-    %355 = stablehlo.convert %arg99 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %356 = stablehlo.broadcast_in_dim %354, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %357 = stablehlo.broadcast_in_dim %355, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %358 = stablehlo.multiply %356, %357 : tensor<1x512x7x7xf32>
-    %359 = stablehlo.convert %arg100 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %360 = stablehlo.broadcast_in_dim %358, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %361 = stablehlo.broadcast_in_dim %359, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %362 = stablehlo.add %360, %361 : tensor<1x512x7x7xf32>
-    %363 = stablehlo.convert %362 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %364 = stablehlo.add %363, %328 : tensor<1x512x7x7xbf16>
-    %365 = stablehlo.maximum %364, %cst_4 : tensor<1x512x7x7xbf16>
-    %366 = stablehlo.reduce(%365 init: %cst_5) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x512x7x7xbf16>, tensor<bf16>) -> tensor<1x512xbf16>
-    %367 = stablehlo.reshape %366 : (tensor<1x512xbf16>) -> tensor<1x512x1x1xbf16>
-    %368 = stablehlo.convert %cst_6 : (tensor<1xi64>) -> tensor<1xbf16>
-    %369 = stablehlo.reshape %368 : (tensor<1xbf16>) -> tensor<bf16>
-    %370 = stablehlo.broadcast_in_dim %367, dims = [0, 1, 2, 3] : (tensor<1x512x1x1xbf16>) -> tensor<1x512x1x1xbf16>
-    %371 = stablehlo.broadcast_in_dim %369, dims = [] : (tensor<bf16>) -> tensor<1x512x1x1xbf16>
-    %372 = stablehlo.divide %370, %371 : tensor<1x512x1x1xbf16>
-    %373 = stablehlo.reshape %372 : (tensor<1x512x1x1xbf16>) -> tensor<1x512xbf16>
-    %374 = stablehlo.convert %373 : (tensor<1x512xbf16>) -> tensor<1x512xf32>
-    %375 = stablehlo.dot_general %374, %arg101, contracting_dims = [1] x [0] : (tensor<1x512xf32>, tensor<512x1000xf32>) -> tensor<1x1000xf32>
-    %376 = stablehlo.convert %cst_7 : (tensor<1xi64>) -> tensor<1xf32>
-    %377 = stablehlo.reshape %376 : (tensor<1xf32>) -> tensor<f32>
-    %378 = stablehlo.broadcast_in_dim %375, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %379 = stablehlo.broadcast_in_dim %377, dims = [] : (tensor<f32>) -> tensor<1x1000xf32>
-    %380 = stablehlo.multiply %378, %379 : tensor<1x1000xf32>
-    %381 = stablehlo.broadcast_in_dim %380, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %382 = stablehlo.broadcast_in_dim %arg102, dims = [1] : (tensor<1000xf32>) -> tensor<1x1000xf32>
-    %383 = stablehlo.add %381, %382 : tensor<1x1000xf32>
-    %384 = stablehlo.convert %383 : (tensor<1x1000xf32>) -> tensor<1x1000xbf16>
-    return %384 : tensor<1x1000xbf16>
-  }
-}
diff --git a/mlir_tests/ResNet50.mlir b/mlir_tests/ResNet50.mlir
deleted file mode 100644
index a87500a5..00000000
--- a/mlir_tests/ResNet50.mlir
+++ /dev/null
@@ -1,1011 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x224x224xbf16>, %arg1: tensor<64x3x7x7xbf16>, %arg2: tensor<64x64x1x1xbf16>, %arg3: tensor<64x64x3x3xbf16>, %arg4: tensor<256x64x1x1xbf16>, %arg5: tensor<256x64x1x1xbf16>, %arg6: tensor<64x256x1x1xbf16>, %arg7: tensor<64x64x3x3xbf16>, %arg8: tensor<256x64x1x1xbf16>, %arg9: tensor<64x256x1x1xbf16>, %arg10: tensor<64x64x3x3xbf16>, %arg11: tensor<256x64x1x1xbf16>, %arg12: tensor<128x256x1x1xbf16>, %arg13: tensor<128x128x3x3xbf16>, %arg14: tensor<512x128x1x1xbf16>, %arg15: tensor<512x256x1x1xbf16>, %arg16: tensor<128x512x1x1xbf16>, %arg17: tensor<128x128x3x3xbf16>, %arg18: tensor<512x128x1x1xbf16>, %arg19: tensor<128x512x1x1xbf16>, %arg20: tensor<128x128x3x3xbf16>, %arg21: tensor<512x128x1x1xbf16>, %arg22: tensor<128x512x1x1xbf16>, %arg23: tensor<128x128x3x3xbf16>, %arg24: tensor<512x128x1x1xbf16>, %arg25: tensor<256x512x1x1xbf16>, %arg26: tensor<256x256x3x3xbf16>, %arg27: tensor<1024x256x1x1xbf16>, %arg28: tensor<1024x512x1x1xbf16>, %arg29: tensor<256x1024x1x1xbf16>, %arg30: tensor<256x256x3x3xbf16>, %arg31: tensor<1024x256x1x1xbf16>, %arg32: tensor<256x1024x1x1xbf16>, %arg33: tensor<256x256x3x3xbf16>, %arg34: tensor<1024x256x1x1xbf16>, %arg35: tensor<256x1024x1x1xbf16>, %arg36: tensor<256x256x3x3xbf16>, %arg37: tensor<1024x256x1x1xbf16>, %arg38: tensor<256x1024x1x1xbf16>, %arg39: tensor<256x256x3x3xbf16>, %arg40: tensor<1024x256x1x1xbf16>, %arg41: tensor<256x1024x1x1xbf16>, %arg42: tensor<256x256x3x3xbf16>, %arg43: tensor<1024x256x1x1xbf16>, %arg44: tensor<512x1024x1x1xbf16>, %arg45: tensor<512x512x3x3xbf16>, %arg46: tensor<2048x512x1x1xbf16>, %arg47: tensor<2048x1024x1x1xbf16>, %arg48: tensor<512x2048x1x1xbf16>, %arg49: tensor<512x512x3x3xbf16>, %arg50: tensor<2048x512x1x1xbf16>, %arg51: tensor<512x2048x1x1xbf16>, %arg52: tensor<512x512x3x3xbf16>, %arg53: tensor<2048x512x1x1xbf16>, %arg54: tensor<64x1x1xf32>, %arg55: tensor<64x1x1xf32>, %arg56: tensor<64x1x1xbf16>, %arg57: tensor<64x1x1xbf16>, %arg58: tensor<64x1x1xf32>, %arg59: tensor<64x1x1xf32>, %arg60: tensor<64x1x1xbf16>, %arg61: tensor<64x1x1xbf16>, %arg62: tensor<64x1x1xf32>, %arg63: tensor<64x1x1xf32>, %arg64: tensor<64x1x1xbf16>, %arg65: tensor<64x1x1xbf16>, %arg66: tensor<256x1x1xf32>, %arg67: tensor<256x1x1xf32>, %arg68: tensor<256x1x1xbf16>, %arg69: tensor<256x1x1xbf16>, %arg70: tensor<256x1x1xf32>, %arg71: tensor<256x1x1xf32>, %arg72: tensor<256x1x1xbf16>, %arg73: tensor<256x1x1xbf16>, %arg74: tensor<64x1x1xf32>, %arg75: tensor<64x1x1xf32>, %arg76: tensor<64x1x1xbf16>, %arg77: tensor<64x1x1xbf16>, %arg78: tensor<64x1x1xf32>, %arg79: tensor<64x1x1xf32>, %arg80: tensor<64x1x1xbf16>, %arg81: tensor<64x1x1xbf16>, %arg82: tensor<256x1x1xf32>, %arg83: tensor<256x1x1xf32>, %arg84: tensor<256x1x1xbf16>, %arg85: tensor<256x1x1xbf16>, %arg86: tensor<64x1x1xf32>, %arg87: tensor<64x1x1xf32>, %arg88: tensor<64x1x1xbf16>, %arg89: tensor<64x1x1xbf16>, %arg90: tensor<64x1x1xf32>, %arg91: tensor<64x1x1xf32>, %arg92: tensor<64x1x1xbf16>, %arg93: tensor<64x1x1xbf16>, %arg94: tensor<256x1x1xf32>, %arg95: tensor<256x1x1xf32>, %arg96: tensor<256x1x1xbf16>, %arg97: tensor<256x1x1xbf16>, %arg98: tensor<128x1x1xf32>, %arg99: tensor<128x1x1xf32>, %arg100: tensor<128x1x1xbf16>, %arg101: tensor<128x1x1xbf16>, %arg102: tensor<128x1x1xf32>, %arg103: tensor<128x1x1xf32>, %arg104: tensor<128x1x1xbf16>, %arg105: tensor<128x1x1xbf16>, %arg106: tensor<512x1x1xf32>, %arg107: tensor<512x1x1xf32>, %arg108: tensor<512x1x1xbf16>, %arg109: tensor<512x1x1xbf16>, %arg110: tensor<512x1x1xf32>, %arg111: tensor<512x1x1xf32>, %arg112: tensor<512x1x1xbf16>, %arg113: tensor<512x1x1xbf16>, %arg114: tensor<128x1x1xf32>, %arg115: tensor<128x1x1xf32>, %arg116: tensor<128x1x1xbf16>, %arg117: tensor<128x1x1xbf16>, %arg118: tensor<128x1x1xf32>, %arg119: tensor<128x1x1xf32>, %arg120: tensor<128x1x1xbf16>, %arg121: tensor<128x1x1xbf16>, %arg122: tensor<512x1x1xf32>, %arg123: tensor<512x1x1xf32>, %arg124: tensor<512x1x1xbf16>, %arg125: tensor<512x1x1xbf16>, %arg126: tensor<128x1x1xf32>, %arg127: tensor<128x1x1xf32>, %arg128: tensor<128x1x1xbf16>, %arg129: tensor<128x1x1xbf16>, %arg130: tensor<128x1x1xf32>, %arg131: tensor<128x1x1xf32>, %arg132: tensor<128x1x1xbf16>, %arg133: tensor<128x1x1xbf16>, %arg134: tensor<512x1x1xf32>, %arg135: tensor<512x1x1xf32>, %arg136: tensor<512x1x1xbf16>, %arg137: tensor<512x1x1xbf16>, %arg138: tensor<128x1x1xf32>, %arg139: tensor<128x1x1xf32>, %arg140: tensor<128x1x1xbf16>, %arg141: tensor<128x1x1xbf16>, %arg142: tensor<128x1x1xf32>, %arg143: tensor<128x1x1xf32>, %arg144: tensor<128x1x1xbf16>, %arg145: tensor<128x1x1xbf16>, %arg146: tensor<512x1x1xf32>, %arg147: tensor<512x1x1xf32>, %arg148: tensor<512x1x1xbf16>, %arg149: tensor<512x1x1xbf16>, %arg150: tensor<256x1x1xf32>, %arg151: tensor<256x1x1xf32>, %arg152: tensor<256x1x1xbf16>, %arg153: tensor<256x1x1xbf16>, %arg154: tensor<256x1x1xf32>, %arg155: tensor<256x1x1xf32>, %arg156: tensor<256x1x1xbf16>, %arg157: tensor<256x1x1xbf16>, %arg158: tensor<1024x1x1xf32>, %arg159: tensor<1024x1x1xf32>, %arg160: tensor<1024x1x1xbf16>, %arg161: tensor<1024x1x1xbf16>, %arg162: tensor<1024x1x1xf32>, %arg163: tensor<1024x1x1xf32>, %arg164: tensor<1024x1x1xbf16>, %arg165: tensor<1024x1x1xbf16>, %arg166: tensor<256x1x1xf32>, %arg167: tensor<256x1x1xf32>, %arg168: tensor<256x1x1xbf16>, %arg169: tensor<256x1x1xbf16>, %arg170: tensor<256x1x1xf32>, %arg171: tensor<256x1x1xf32>, %arg172: tensor<256x1x1xbf16>, %arg173: tensor<256x1x1xbf16>, %arg174: tensor<1024x1x1xf32>, %arg175: tensor<1024x1x1xf32>, %arg176: tensor<1024x1x1xbf16>, %arg177: tensor<1024x1x1xbf16>, %arg178: tensor<256x1x1xf32>, %arg179: tensor<256x1x1xf32>, %arg180: tensor<256x1x1xbf16>, %arg181: tensor<256x1x1xbf16>, %arg182: tensor<256x1x1xf32>, %arg183: tensor<256x1x1xf32>, %arg184: tensor<256x1x1xbf16>, %arg185: tensor<256x1x1xbf16>, %arg186: tensor<1024x1x1xf32>, %arg187: tensor<1024x1x1xf32>, %arg188: tensor<1024x1x1xbf16>, %arg189: tensor<1024x1x1xbf16>, %arg190: tensor<256x1x1xf32>, %arg191: tensor<256x1x1xf32>, %arg192: tensor<256x1x1xbf16>, %arg193: tensor<256x1x1xbf16>, %arg194: tensor<256x1x1xf32>, %arg195: tensor<256x1x1xf32>, %arg196: tensor<256x1x1xbf16>, %arg197: tensor<256x1x1xbf16>, %arg198: tensor<1024x1x1xf32>, %arg199: tensor<1024x1x1xf32>, %arg200: tensor<1024x1x1xbf16>, %arg201: tensor<1024x1x1xbf16>, %arg202: tensor<256x1x1xf32>, %arg203: tensor<256x1x1xf32>, %arg204: tensor<256x1x1xbf16>, %arg205: tensor<256x1x1xbf16>, %arg206: tensor<256x1x1xf32>, %arg207: tensor<256x1x1xf32>, %arg208: tensor<256x1x1xbf16>, %arg209: tensor<256x1x1xbf16>, %arg210: tensor<1024x1x1xf32>, %arg211: tensor<1024x1x1xf32>, %arg212: tensor<1024x1x1xbf16>, %arg213: tensor<1024x1x1xbf16>, %arg214: tensor<256x1x1xf32>, %arg215: tensor<256x1x1xf32>, %arg216: tensor<256x1x1xbf16>, %arg217: tensor<256x1x1xbf16>, %arg218: tensor<256x1x1xf32>, %arg219: tensor<256x1x1xf32>, %arg220: tensor<256x1x1xbf16>, %arg221: tensor<256x1x1xbf16>, %arg222: tensor<1024x1x1xf32>, %arg223: tensor<1024x1x1xf32>, %arg224: tensor<1024x1x1xbf16>, %arg225: tensor<1024x1x1xbf16>, %arg226: tensor<512x1x1xf32>, %arg227: tensor<512x1x1xf32>, %arg228: tensor<512x1x1xbf16>, %arg229: tensor<512x1x1xbf16>, %arg230: tensor<512x1x1xf32>, %arg231: tensor<512x1x1xf32>, %arg232: tensor<512x1x1xbf16>, %arg233: tensor<512x1x1xbf16>, %arg234: tensor<2048x1x1xf32>, %arg235: tensor<2048x1x1xf32>, %arg236: tensor<2048x1x1xbf16>, %arg237: tensor<2048x1x1xbf16>, %arg238: tensor<2048x1x1xf32>, %arg239: tensor<2048x1x1xf32>, %arg240: tensor<2048x1x1xbf16>, %arg241: tensor<2048x1x1xbf16>, %arg242: tensor<512x1x1xf32>, %arg243: tensor<512x1x1xf32>, %arg244: tensor<512x1x1xbf16>, %arg245: tensor<512x1x1xbf16>, %arg246: tensor<512x1x1xf32>, %arg247: tensor<512x1x1xf32>, %arg248: tensor<512x1x1xbf16>, %arg249: tensor<512x1x1xbf16>, %arg250: tensor<2048x1x1xf32>, %arg251: tensor<2048x1x1xf32>, %arg252: tensor<2048x1x1xbf16>, %arg253: tensor<2048x1x1xbf16>, %arg254: tensor<512x1x1xf32>, %arg255: tensor<512x1x1xf32>, %arg256: tensor<512x1x1xbf16>, %arg257: tensor<512x1x1xbf16>, %arg258: tensor<512x1x1xf32>, %arg259: tensor<512x1x1xf32>, %arg260: tensor<512x1x1xbf16>, %arg261: tensor<512x1x1xbf16>, %arg262: tensor<2048x1x1xf32>, %arg263: tensor<2048x1x1xf32>, %arg264: tensor<2048x1x1xbf16>, %arg265: tensor<2048x1x1xbf16>, %arg266: tensor<2048x1000xf32>, %arg267: tensor<1000xf32>) -> tensor<1x1000xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<1x64x112x112xbf16>
-    %cst_0 = stablehlo.constant dense<0xFF80> : tensor<bf16>
-    %cst_1 = stablehlo.constant dense<0.000000e+00> : tensor<1x64x56x56xbf16>
-    %cst_2 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x56x56xbf16>
-    %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor<1x128x56x56xbf16>
-    %cst_4 = stablehlo.constant dense<0.000000e+00> : tensor<1x128x28x28xbf16>
-    %cst_5 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x28x28xbf16>
-    %cst_6 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x28x28xbf16>
-    %cst_7 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x14x14xbf16>
-    %cst_8 = stablehlo.constant dense<0.000000e+00> : tensor<1x1024x14x14xbf16>
-    %cst_9 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x14x14xbf16>
-    %cst_10 = stablehlo.constant dense<0.000000e+00> : tensor<1x512x7x7xbf16>
-    %cst_11 = stablehlo.constant dense<0.000000e+00> : tensor<1x2048x7x7xbf16>
-    %cst_12 = stablehlo.constant dense<0.000000e+00> : tensor<bf16>
-    %cst_13 = arith.constant dense<49> : tensor<1xi64>
-    %cst_14 = arith.constant dense<1> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x224x224xbf16>, tensor<64x3x7x7xbf16>) -> tensor<1x64x112x112xbf16>
-    %1 = stablehlo.convert %0 : (tensor<1x64x112x112xbf16>) -> tensor<1x64x112x112xf32>
-    %2 = stablehlo.broadcast_in_dim %1, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %3 = stablehlo.broadcast_in_dim %arg54, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %4 = stablehlo.subtract %2, %3 : tensor<1x64x112x112xf32>
-    %5 = stablehlo.broadcast_in_dim %4, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %6 = stablehlo.broadcast_in_dim %arg55, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %7 = stablehlo.multiply %5, %6 : tensor<1x64x112x112xf32>
-    %8 = stablehlo.convert %arg56 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %9 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %10 = stablehlo.broadcast_in_dim %8, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %11 = stablehlo.multiply %9, %10 : tensor<1x64x112x112xf32>
-    %12 = stablehlo.convert %arg57 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %13 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2, 3] : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
-    %14 = stablehlo.broadcast_in_dim %12, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x112x112xf32>
-    %15 = stablehlo.add %13, %14 : tensor<1x64x112x112xf32>
-    %16 = stablehlo.convert %15 : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xbf16>
-    %17 = stablehlo.maximum %16, %cst : tensor<1x64x112x112xbf16>
-    %18 = "stablehlo.reduce_window"(%17, %cst_0) <{padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = array<i64: 1, 1, 1, 1>, window_dimensions = array<i64: 1, 1, 3, 3>, window_strides = array<i64: 1, 1, 2, 2>}> ({
-    ^bb0(%arg268: tensor<bf16>, %arg269: tensor<bf16>):
-      %986 = stablehlo.maximum %arg268, %arg269 : tensor<bf16>
-      stablehlo.return %986 : tensor<bf16>
-    }) : (tensor<1x64x112x112xbf16>, tensor<bf16>) -> tensor<1x64x56x56xbf16>
-    %19 = stablehlo.convolution(%18, %arg2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x1x1xbf16>) -> tensor<1x64x56x56xbf16>
-    %20 = stablehlo.convert %19 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %21 = stablehlo.broadcast_in_dim %20, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %22 = stablehlo.broadcast_in_dim %arg58, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %23 = stablehlo.subtract %21, %22 : tensor<1x64x56x56xf32>
-    %24 = stablehlo.broadcast_in_dim %23, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %25 = stablehlo.broadcast_in_dim %arg59, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %26 = stablehlo.multiply %24, %25 : tensor<1x64x56x56xf32>
-    %27 = stablehlo.convert %arg60 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %28 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %29 = stablehlo.broadcast_in_dim %27, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %30 = stablehlo.multiply %28, %29 : tensor<1x64x56x56xf32>
-    %31 = stablehlo.convert %arg61 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %32 = stablehlo.broadcast_in_dim %30, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %33 = stablehlo.broadcast_in_dim %31, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %34 = stablehlo.add %32, %33 : tensor<1x64x56x56xf32>
-    %35 = stablehlo.convert %34 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %36 = stablehlo.maximum %35, %cst_1 : tensor<1x64x56x56xbf16>
-    %37 = stablehlo.convolution(%36, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %38 = stablehlo.convert %37 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %39 = stablehlo.broadcast_in_dim %38, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %40 = stablehlo.broadcast_in_dim %arg62, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %41 = stablehlo.subtract %39, %40 : tensor<1x64x56x56xf32>
-    %42 = stablehlo.broadcast_in_dim %41, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %43 = stablehlo.broadcast_in_dim %arg63, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %44 = stablehlo.multiply %42, %43 : tensor<1x64x56x56xf32>
-    %45 = stablehlo.convert %arg64 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %47 = stablehlo.broadcast_in_dim %45, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %48 = stablehlo.multiply %46, %47 : tensor<1x64x56x56xf32>
-    %49 = stablehlo.convert %arg65 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %50 = stablehlo.broadcast_in_dim %48, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %51 = stablehlo.broadcast_in_dim %49, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %52 = stablehlo.add %50, %51 : tensor<1x64x56x56xf32>
-    %53 = stablehlo.convert %52 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %54 = stablehlo.maximum %53, %cst_1 : tensor<1x64x56x56xbf16>
-    %55 = stablehlo.convolution(%54, %arg4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x56x56xbf16>
-    %56 = stablehlo.convert %55 : (tensor<1x256x56x56xbf16>) -> tensor<1x256x56x56xf32>
-    %57 = stablehlo.broadcast_in_dim %56, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %58 = stablehlo.broadcast_in_dim %arg66, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %59 = stablehlo.subtract %57, %58 : tensor<1x256x56x56xf32>
-    %60 = stablehlo.broadcast_in_dim %59, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %61 = stablehlo.broadcast_in_dim %arg67, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %62 = stablehlo.multiply %60, %61 : tensor<1x256x56x56xf32>
-    %63 = stablehlo.convert %arg68 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %64 = stablehlo.broadcast_in_dim %62, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %65 = stablehlo.broadcast_in_dim %63, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %66 = stablehlo.multiply %64, %65 : tensor<1x256x56x56xf32>
-    %67 = stablehlo.convert %arg69 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %68 = stablehlo.broadcast_in_dim %66, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %69 = stablehlo.broadcast_in_dim %67, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %70 = stablehlo.add %68, %69 : tensor<1x256x56x56xf32>
-    %71 = stablehlo.convert %70 : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xbf16>
-    %72 = stablehlo.convolution(%18, %arg5) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x56x56xbf16>
-    %73 = stablehlo.convert %72 : (tensor<1x256x56x56xbf16>) -> tensor<1x256x56x56xf32>
-    %74 = stablehlo.broadcast_in_dim %73, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %75 = stablehlo.broadcast_in_dim %arg70, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %76 = stablehlo.subtract %74, %75 : tensor<1x256x56x56xf32>
-    %77 = stablehlo.broadcast_in_dim %76, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %78 = stablehlo.broadcast_in_dim %arg71, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %79 = stablehlo.multiply %77, %78 : tensor<1x256x56x56xf32>
-    %80 = stablehlo.convert %arg72 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %81 = stablehlo.broadcast_in_dim %79, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %82 = stablehlo.broadcast_in_dim %80, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %83 = stablehlo.multiply %81, %82 : tensor<1x256x56x56xf32>
-    %84 = stablehlo.convert %arg73 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %85 = stablehlo.broadcast_in_dim %83, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %86 = stablehlo.broadcast_in_dim %84, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %87 = stablehlo.add %85, %86 : tensor<1x256x56x56xf32>
-    %88 = stablehlo.convert %87 : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xbf16>
-    %89 = stablehlo.add %71, %88 : tensor<1x256x56x56xbf16>
-    %90 = stablehlo.maximum %89, %cst_2 : tensor<1x256x56x56xbf16>
-    %91 = stablehlo.convolution(%90, %arg6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x56x56xbf16>, tensor<64x256x1x1xbf16>) -> tensor<1x64x56x56xbf16>
-    %92 = stablehlo.convert %91 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %93 = stablehlo.broadcast_in_dim %92, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %94 = stablehlo.broadcast_in_dim %arg74, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %95 = stablehlo.subtract %93, %94 : tensor<1x64x56x56xf32>
-    %96 = stablehlo.broadcast_in_dim %95, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %97 = stablehlo.broadcast_in_dim %arg75, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %98 = stablehlo.multiply %96, %97 : tensor<1x64x56x56xf32>
-    %99 = stablehlo.convert %arg76 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %100 = stablehlo.broadcast_in_dim %98, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %101 = stablehlo.broadcast_in_dim %99, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %102 = stablehlo.multiply %100, %101 : tensor<1x64x56x56xf32>
-    %103 = stablehlo.convert %arg77 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %104 = stablehlo.broadcast_in_dim %102, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %105 = stablehlo.broadcast_in_dim %103, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %106 = stablehlo.add %104, %105 : tensor<1x64x56x56xf32>
-    %107 = stablehlo.convert %106 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %108 = stablehlo.maximum %107, %cst_1 : tensor<1x64x56x56xbf16>
-    %109 = stablehlo.convolution(%108, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %110 = stablehlo.convert %109 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %111 = stablehlo.broadcast_in_dim %110, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %112 = stablehlo.broadcast_in_dim %arg78, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %113 = stablehlo.subtract %111, %112 : tensor<1x64x56x56xf32>
-    %114 = stablehlo.broadcast_in_dim %113, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %115 = stablehlo.broadcast_in_dim %arg79, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %116 = stablehlo.multiply %114, %115 : tensor<1x64x56x56xf32>
-    %117 = stablehlo.convert %arg80 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %118 = stablehlo.broadcast_in_dim %116, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %119 = stablehlo.broadcast_in_dim %117, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %120 = stablehlo.multiply %118, %119 : tensor<1x64x56x56xf32>
-    %121 = stablehlo.convert %arg81 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %122 = stablehlo.broadcast_in_dim %120, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %123 = stablehlo.broadcast_in_dim %121, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %124 = stablehlo.add %122, %123 : tensor<1x64x56x56xf32>
-    %125 = stablehlo.convert %124 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %126 = stablehlo.maximum %125, %cst_1 : tensor<1x64x56x56xbf16>
-    %127 = stablehlo.convolution(%126, %arg8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x56x56xbf16>
-    %128 = stablehlo.convert %127 : (tensor<1x256x56x56xbf16>) -> tensor<1x256x56x56xf32>
-    %129 = stablehlo.broadcast_in_dim %128, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %130 = stablehlo.broadcast_in_dim %arg82, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %131 = stablehlo.subtract %129, %130 : tensor<1x256x56x56xf32>
-    %132 = stablehlo.broadcast_in_dim %131, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %133 = stablehlo.broadcast_in_dim %arg83, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %134 = stablehlo.multiply %132, %133 : tensor<1x256x56x56xf32>
-    %135 = stablehlo.convert %arg84 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %136 = stablehlo.broadcast_in_dim %134, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %137 = stablehlo.broadcast_in_dim %135, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %138 = stablehlo.multiply %136, %137 : tensor<1x256x56x56xf32>
-    %139 = stablehlo.convert %arg85 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %140 = stablehlo.broadcast_in_dim %138, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %141 = stablehlo.broadcast_in_dim %139, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %142 = stablehlo.add %140, %141 : tensor<1x256x56x56xf32>
-    %143 = stablehlo.convert %142 : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xbf16>
-    %144 = stablehlo.add %143, %90 : tensor<1x256x56x56xbf16>
-    %145 = stablehlo.maximum %144, %cst_2 : tensor<1x256x56x56xbf16>
-    %146 = stablehlo.convolution(%145, %arg9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x56x56xbf16>, tensor<64x256x1x1xbf16>) -> tensor<1x64x56x56xbf16>
-    %147 = stablehlo.convert %146 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %148 = stablehlo.broadcast_in_dim %147, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %149 = stablehlo.broadcast_in_dim %arg86, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %150 = stablehlo.subtract %148, %149 : tensor<1x64x56x56xf32>
-    %151 = stablehlo.broadcast_in_dim %150, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %152 = stablehlo.broadcast_in_dim %arg87, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %153 = stablehlo.multiply %151, %152 : tensor<1x64x56x56xf32>
-    %154 = stablehlo.convert %arg88 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %155 = stablehlo.broadcast_in_dim %153, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %156 = stablehlo.broadcast_in_dim %154, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %157 = stablehlo.multiply %155, %156 : tensor<1x64x56x56xf32>
-    %158 = stablehlo.convert %arg89 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %159 = stablehlo.broadcast_in_dim %157, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %160 = stablehlo.broadcast_in_dim %158, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %161 = stablehlo.add %159, %160 : tensor<1x64x56x56xf32>
-    %162 = stablehlo.convert %161 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %163 = stablehlo.maximum %162, %cst_1 : tensor<1x64x56x56xbf16>
-    %164 = stablehlo.convolution(%163, %arg10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<64x64x3x3xbf16>) -> tensor<1x64x56x56xbf16>
-    %165 = stablehlo.convert %164 : (tensor<1x64x56x56xbf16>) -> tensor<1x64x56x56xf32>
-    %166 = stablehlo.broadcast_in_dim %165, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %167 = stablehlo.broadcast_in_dim %arg90, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %168 = stablehlo.subtract %166, %167 : tensor<1x64x56x56xf32>
-    %169 = stablehlo.broadcast_in_dim %168, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %170 = stablehlo.broadcast_in_dim %arg91, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %171 = stablehlo.multiply %169, %170 : tensor<1x64x56x56xf32>
-    %172 = stablehlo.convert %arg92 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %173 = stablehlo.broadcast_in_dim %171, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %174 = stablehlo.broadcast_in_dim %172, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %175 = stablehlo.multiply %173, %174 : tensor<1x64x56x56xf32>
-    %176 = stablehlo.convert %arg93 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %177 = stablehlo.broadcast_in_dim %175, dims = [0, 1, 2, 3] : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %178 = stablehlo.broadcast_in_dim %176, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x56x56xf32>
-    %179 = stablehlo.add %177, %178 : tensor<1x64x56x56xf32>
-    %180 = stablehlo.convert %179 : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xbf16>
-    %181 = stablehlo.maximum %180, %cst_1 : tensor<1x64x56x56xbf16>
-    %182 = stablehlo.convolution(%181, %arg11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x56x56xbf16>, tensor<256x64x1x1xbf16>) -> tensor<1x256x56x56xbf16>
-    %183 = stablehlo.convert %182 : (tensor<1x256x56x56xbf16>) -> tensor<1x256x56x56xf32>
-    %184 = stablehlo.broadcast_in_dim %183, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %185 = stablehlo.broadcast_in_dim %arg94, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %186 = stablehlo.subtract %184, %185 : tensor<1x256x56x56xf32>
-    %187 = stablehlo.broadcast_in_dim %186, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %188 = stablehlo.broadcast_in_dim %arg95, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %189 = stablehlo.multiply %187, %188 : tensor<1x256x56x56xf32>
-    %190 = stablehlo.convert %arg96 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %191 = stablehlo.broadcast_in_dim %189, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %192 = stablehlo.broadcast_in_dim %190, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %193 = stablehlo.multiply %191, %192 : tensor<1x256x56x56xf32>
-    %194 = stablehlo.convert %arg97 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %195 = stablehlo.broadcast_in_dim %193, dims = [0, 1, 2, 3] : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xf32>
-    %196 = stablehlo.broadcast_in_dim %194, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x56x56xf32>
-    %197 = stablehlo.add %195, %196 : tensor<1x256x56x56xf32>
-    %198 = stablehlo.convert %197 : (tensor<1x256x56x56xf32>) -> tensor<1x256x56x56xbf16>
-    %199 = stablehlo.add %198, %145 : tensor<1x256x56x56xbf16>
-    %200 = stablehlo.maximum %199, %cst_2 : tensor<1x256x56x56xbf16>
-    %201 = stablehlo.convolution(%200, %arg12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x56x56xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x56x56xbf16>
-    %202 = stablehlo.convert %201 : (tensor<1x128x56x56xbf16>) -> tensor<1x128x56x56xf32>
-    %203 = stablehlo.broadcast_in_dim %202, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %204 = stablehlo.broadcast_in_dim %arg98, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %205 = stablehlo.subtract %203, %204 : tensor<1x128x56x56xf32>
-    %206 = stablehlo.broadcast_in_dim %205, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %207 = stablehlo.broadcast_in_dim %arg99, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %208 = stablehlo.multiply %206, %207 : tensor<1x128x56x56xf32>
-    %209 = stablehlo.convert %arg100 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %210 = stablehlo.broadcast_in_dim %208, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %211 = stablehlo.broadcast_in_dim %209, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %212 = stablehlo.multiply %210, %211 : tensor<1x128x56x56xf32>
-    %213 = stablehlo.convert %arg101 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %214 = stablehlo.broadcast_in_dim %212, dims = [0, 1, 2, 3] : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xf32>
-    %215 = stablehlo.broadcast_in_dim %213, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x56x56xf32>
-    %216 = stablehlo.add %214, %215 : tensor<1x128x56x56xf32>
-    %217 = stablehlo.convert %216 : (tensor<1x128x56x56xf32>) -> tensor<1x128x56x56xbf16>
-    %218 = stablehlo.maximum %217, %cst_3 : tensor<1x128x56x56xbf16>
-    %219 = stablehlo.convolution(%218, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x56x56xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %220 = stablehlo.convert %219 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %221 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %222 = stablehlo.broadcast_in_dim %arg102, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %223 = stablehlo.subtract %221, %222 : tensor<1x128x28x28xf32>
-    %224 = stablehlo.broadcast_in_dim %223, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %225 = stablehlo.broadcast_in_dim %arg103, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %226 = stablehlo.multiply %224, %225 : tensor<1x128x28x28xf32>
-    %227 = stablehlo.convert %arg104 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %228 = stablehlo.broadcast_in_dim %226, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %229 = stablehlo.broadcast_in_dim %227, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %230 = stablehlo.multiply %228, %229 : tensor<1x128x28x28xf32>
-    %231 = stablehlo.convert %arg105 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %232 = stablehlo.broadcast_in_dim %230, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %233 = stablehlo.broadcast_in_dim %231, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %234 = stablehlo.add %232, %233 : tensor<1x128x28x28xf32>
-    %235 = stablehlo.convert %234 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %236 = stablehlo.maximum %235, %cst_4 : tensor<1x128x28x28xbf16>
-    %237 = stablehlo.convolution(%236, %arg14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %238 = stablehlo.convert %237 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %239 = stablehlo.broadcast_in_dim %238, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %240 = stablehlo.broadcast_in_dim %arg106, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %241 = stablehlo.subtract %239, %240 : tensor<1x512x28x28xf32>
-    %242 = stablehlo.broadcast_in_dim %241, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %243 = stablehlo.broadcast_in_dim %arg107, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %244 = stablehlo.multiply %242, %243 : tensor<1x512x28x28xf32>
-    %245 = stablehlo.convert %arg108 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %246 = stablehlo.broadcast_in_dim %244, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %247 = stablehlo.broadcast_in_dim %245, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %248 = stablehlo.multiply %246, %247 : tensor<1x512x28x28xf32>
-    %249 = stablehlo.convert %arg109 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %250 = stablehlo.broadcast_in_dim %248, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %251 = stablehlo.broadcast_in_dim %249, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %252 = stablehlo.add %250, %251 : tensor<1x512x28x28xf32>
-    %253 = stablehlo.convert %252 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %254 = stablehlo.convolution(%200, %arg15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x56x56xbf16>, tensor<512x256x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %255 = stablehlo.convert %254 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %256 = stablehlo.broadcast_in_dim %255, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %257 = stablehlo.broadcast_in_dim %arg110, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %258 = stablehlo.subtract %256, %257 : tensor<1x512x28x28xf32>
-    %259 = stablehlo.broadcast_in_dim %258, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %260 = stablehlo.broadcast_in_dim %arg111, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %261 = stablehlo.multiply %259, %260 : tensor<1x512x28x28xf32>
-    %262 = stablehlo.convert %arg112 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %263 = stablehlo.broadcast_in_dim %261, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %264 = stablehlo.broadcast_in_dim %262, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %265 = stablehlo.multiply %263, %264 : tensor<1x512x28x28xf32>
-    %266 = stablehlo.convert %arg113 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %267 = stablehlo.broadcast_in_dim %265, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %268 = stablehlo.broadcast_in_dim %266, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %269 = stablehlo.add %267, %268 : tensor<1x512x28x28xf32>
-    %270 = stablehlo.convert %269 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %271 = stablehlo.add %253, %270 : tensor<1x512x28x28xbf16>
-    %272 = stablehlo.maximum %271, %cst_5 : tensor<1x512x28x28xbf16>
-    %273 = stablehlo.convolution(%272, %arg16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<128x512x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %274 = stablehlo.convert %273 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %275 = stablehlo.broadcast_in_dim %274, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %276 = stablehlo.broadcast_in_dim %arg114, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %277 = stablehlo.subtract %275, %276 : tensor<1x128x28x28xf32>
-    %278 = stablehlo.broadcast_in_dim %277, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %279 = stablehlo.broadcast_in_dim %arg115, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %280 = stablehlo.multiply %278, %279 : tensor<1x128x28x28xf32>
-    %281 = stablehlo.convert %arg116 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %282 = stablehlo.broadcast_in_dim %280, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %283 = stablehlo.broadcast_in_dim %281, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %284 = stablehlo.multiply %282, %283 : tensor<1x128x28x28xf32>
-    %285 = stablehlo.convert %arg117 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %286 = stablehlo.broadcast_in_dim %284, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %287 = stablehlo.broadcast_in_dim %285, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %288 = stablehlo.add %286, %287 : tensor<1x128x28x28xf32>
-    %289 = stablehlo.convert %288 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %290 = stablehlo.maximum %289, %cst_4 : tensor<1x128x28x28xbf16>
-    %291 = stablehlo.convolution(%290, %arg17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %292 = stablehlo.convert %291 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %293 = stablehlo.broadcast_in_dim %292, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %294 = stablehlo.broadcast_in_dim %arg118, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %295 = stablehlo.subtract %293, %294 : tensor<1x128x28x28xf32>
-    %296 = stablehlo.broadcast_in_dim %295, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %297 = stablehlo.broadcast_in_dim %arg119, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %298 = stablehlo.multiply %296, %297 : tensor<1x128x28x28xf32>
-    %299 = stablehlo.convert %arg120 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %300 = stablehlo.broadcast_in_dim %298, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %301 = stablehlo.broadcast_in_dim %299, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %302 = stablehlo.multiply %300, %301 : tensor<1x128x28x28xf32>
-    %303 = stablehlo.convert %arg121 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %304 = stablehlo.broadcast_in_dim %302, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %305 = stablehlo.broadcast_in_dim %303, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %306 = stablehlo.add %304, %305 : tensor<1x128x28x28xf32>
-    %307 = stablehlo.convert %306 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %308 = stablehlo.maximum %307, %cst_4 : tensor<1x128x28x28xbf16>
-    %309 = stablehlo.convolution(%308, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %310 = stablehlo.convert %309 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %311 = stablehlo.broadcast_in_dim %310, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %312 = stablehlo.broadcast_in_dim %arg122, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %313 = stablehlo.subtract %311, %312 : tensor<1x512x28x28xf32>
-    %314 = stablehlo.broadcast_in_dim %313, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %315 = stablehlo.broadcast_in_dim %arg123, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %316 = stablehlo.multiply %314, %315 : tensor<1x512x28x28xf32>
-    %317 = stablehlo.convert %arg124 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %318 = stablehlo.broadcast_in_dim %316, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %319 = stablehlo.broadcast_in_dim %317, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %320 = stablehlo.multiply %318, %319 : tensor<1x512x28x28xf32>
-    %321 = stablehlo.convert %arg125 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %322 = stablehlo.broadcast_in_dim %320, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %323 = stablehlo.broadcast_in_dim %321, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %324 = stablehlo.add %322, %323 : tensor<1x512x28x28xf32>
-    %325 = stablehlo.convert %324 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %326 = stablehlo.add %325, %272 : tensor<1x512x28x28xbf16>
-    %327 = stablehlo.maximum %326, %cst_5 : tensor<1x512x28x28xbf16>
-    %328 = stablehlo.convolution(%327, %arg19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<128x512x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %329 = stablehlo.convert %328 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %330 = stablehlo.broadcast_in_dim %329, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %331 = stablehlo.broadcast_in_dim %arg126, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %332 = stablehlo.subtract %330, %331 : tensor<1x128x28x28xf32>
-    %333 = stablehlo.broadcast_in_dim %332, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %334 = stablehlo.broadcast_in_dim %arg127, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %335 = stablehlo.multiply %333, %334 : tensor<1x128x28x28xf32>
-    %336 = stablehlo.convert %arg128 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %337 = stablehlo.broadcast_in_dim %335, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %338 = stablehlo.broadcast_in_dim %336, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %339 = stablehlo.multiply %337, %338 : tensor<1x128x28x28xf32>
-    %340 = stablehlo.convert %arg129 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %341 = stablehlo.broadcast_in_dim %339, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %342 = stablehlo.broadcast_in_dim %340, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %343 = stablehlo.add %341, %342 : tensor<1x128x28x28xf32>
-    %344 = stablehlo.convert %343 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %345 = stablehlo.maximum %344, %cst_4 : tensor<1x128x28x28xbf16>
-    %346 = stablehlo.convolution(%345, %arg20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %347 = stablehlo.convert %346 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %348 = stablehlo.broadcast_in_dim %347, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %349 = stablehlo.broadcast_in_dim %arg130, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %350 = stablehlo.subtract %348, %349 : tensor<1x128x28x28xf32>
-    %351 = stablehlo.broadcast_in_dim %350, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %352 = stablehlo.broadcast_in_dim %arg131, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %353 = stablehlo.multiply %351, %352 : tensor<1x128x28x28xf32>
-    %354 = stablehlo.convert %arg132 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %355 = stablehlo.broadcast_in_dim %353, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %356 = stablehlo.broadcast_in_dim %354, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %357 = stablehlo.multiply %355, %356 : tensor<1x128x28x28xf32>
-    %358 = stablehlo.convert %arg133 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %359 = stablehlo.broadcast_in_dim %357, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %360 = stablehlo.broadcast_in_dim %358, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %361 = stablehlo.add %359, %360 : tensor<1x128x28x28xf32>
-    %362 = stablehlo.convert %361 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %363 = stablehlo.maximum %362, %cst_4 : tensor<1x128x28x28xbf16>
-    %364 = stablehlo.convolution(%363, %arg21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %365 = stablehlo.convert %364 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %366 = stablehlo.broadcast_in_dim %365, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %367 = stablehlo.broadcast_in_dim %arg134, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %368 = stablehlo.subtract %366, %367 : tensor<1x512x28x28xf32>
-    %369 = stablehlo.broadcast_in_dim %368, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %370 = stablehlo.broadcast_in_dim %arg135, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %371 = stablehlo.multiply %369, %370 : tensor<1x512x28x28xf32>
-    %372 = stablehlo.convert %arg136 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %373 = stablehlo.broadcast_in_dim %371, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %374 = stablehlo.broadcast_in_dim %372, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %375 = stablehlo.multiply %373, %374 : tensor<1x512x28x28xf32>
-    %376 = stablehlo.convert %arg137 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %377 = stablehlo.broadcast_in_dim %375, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %378 = stablehlo.broadcast_in_dim %376, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %379 = stablehlo.add %377, %378 : tensor<1x512x28x28xf32>
-    %380 = stablehlo.convert %379 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %381 = stablehlo.add %380, %327 : tensor<1x512x28x28xbf16>
-    %382 = stablehlo.maximum %381, %cst_5 : tensor<1x512x28x28xbf16>
-    %383 = stablehlo.convolution(%382, %arg22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<128x512x1x1xbf16>) -> tensor<1x128x28x28xbf16>
-    %384 = stablehlo.convert %383 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %385 = stablehlo.broadcast_in_dim %384, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %386 = stablehlo.broadcast_in_dim %arg138, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %387 = stablehlo.subtract %385, %386 : tensor<1x128x28x28xf32>
-    %388 = stablehlo.broadcast_in_dim %387, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %389 = stablehlo.broadcast_in_dim %arg139, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %390 = stablehlo.multiply %388, %389 : tensor<1x128x28x28xf32>
-    %391 = stablehlo.convert %arg140 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %392 = stablehlo.broadcast_in_dim %390, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %393 = stablehlo.broadcast_in_dim %391, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %394 = stablehlo.multiply %392, %393 : tensor<1x128x28x28xf32>
-    %395 = stablehlo.convert %arg141 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %396 = stablehlo.broadcast_in_dim %394, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %397 = stablehlo.broadcast_in_dim %395, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %398 = stablehlo.add %396, %397 : tensor<1x128x28x28xf32>
-    %399 = stablehlo.convert %398 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %400 = stablehlo.maximum %399, %cst_4 : tensor<1x128x28x28xbf16>
-    %401 = stablehlo.convolution(%400, %arg23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<128x128x3x3xbf16>) -> tensor<1x128x28x28xbf16>
-    %402 = stablehlo.convert %401 : (tensor<1x128x28x28xbf16>) -> tensor<1x128x28x28xf32>
-    %403 = stablehlo.broadcast_in_dim %402, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %404 = stablehlo.broadcast_in_dim %arg142, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %405 = stablehlo.subtract %403, %404 : tensor<1x128x28x28xf32>
-    %406 = stablehlo.broadcast_in_dim %405, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %407 = stablehlo.broadcast_in_dim %arg143, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %408 = stablehlo.multiply %406, %407 : tensor<1x128x28x28xf32>
-    %409 = stablehlo.convert %arg144 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %410 = stablehlo.broadcast_in_dim %408, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %411 = stablehlo.broadcast_in_dim %409, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %412 = stablehlo.multiply %410, %411 : tensor<1x128x28x28xf32>
-    %413 = stablehlo.convert %arg145 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %414 = stablehlo.broadcast_in_dim %412, dims = [0, 1, 2, 3] : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %415 = stablehlo.broadcast_in_dim %413, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x28x28xf32>
-    %416 = stablehlo.add %414, %415 : tensor<1x128x28x28xf32>
-    %417 = stablehlo.convert %416 : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xbf16>
-    %418 = stablehlo.maximum %417, %cst_4 : tensor<1x128x28x28xbf16>
-    %419 = stablehlo.convolution(%418, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x28x28xbf16>, tensor<512x128x1x1xbf16>) -> tensor<1x512x28x28xbf16>
-    %420 = stablehlo.convert %419 : (tensor<1x512x28x28xbf16>) -> tensor<1x512x28x28xf32>
-    %421 = stablehlo.broadcast_in_dim %420, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %422 = stablehlo.broadcast_in_dim %arg146, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %423 = stablehlo.subtract %421, %422 : tensor<1x512x28x28xf32>
-    %424 = stablehlo.broadcast_in_dim %423, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %425 = stablehlo.broadcast_in_dim %arg147, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %426 = stablehlo.multiply %424, %425 : tensor<1x512x28x28xf32>
-    %427 = stablehlo.convert %arg148 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %428 = stablehlo.broadcast_in_dim %426, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %429 = stablehlo.broadcast_in_dim %427, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %430 = stablehlo.multiply %428, %429 : tensor<1x512x28x28xf32>
-    %431 = stablehlo.convert %arg149 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %432 = stablehlo.broadcast_in_dim %430, dims = [0, 1, 2, 3] : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xf32>
-    %433 = stablehlo.broadcast_in_dim %431, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x28x28xf32>
-    %434 = stablehlo.add %432, %433 : tensor<1x512x28x28xf32>
-    %435 = stablehlo.convert %434 : (tensor<1x512x28x28xf32>) -> tensor<1x512x28x28xbf16>
-    %436 = stablehlo.add %435, %382 : tensor<1x512x28x28xbf16>
-    %437 = stablehlo.maximum %436, %cst_5 : tensor<1x512x28x28xbf16>
-    %438 = stablehlo.convolution(%437, %arg25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x28x28xbf16>
-    %439 = stablehlo.convert %438 : (tensor<1x256x28x28xbf16>) -> tensor<1x256x28x28xf32>
-    %440 = stablehlo.broadcast_in_dim %439, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %441 = stablehlo.broadcast_in_dim %arg150, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %442 = stablehlo.subtract %440, %441 : tensor<1x256x28x28xf32>
-    %443 = stablehlo.broadcast_in_dim %442, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %444 = stablehlo.broadcast_in_dim %arg151, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %445 = stablehlo.multiply %443, %444 : tensor<1x256x28x28xf32>
-    %446 = stablehlo.convert %arg152 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %447 = stablehlo.broadcast_in_dim %445, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %448 = stablehlo.broadcast_in_dim %446, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %449 = stablehlo.multiply %447, %448 : tensor<1x256x28x28xf32>
-    %450 = stablehlo.convert %arg153 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %451 = stablehlo.broadcast_in_dim %449, dims = [0, 1, 2, 3] : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xf32>
-    %452 = stablehlo.broadcast_in_dim %450, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x28x28xf32>
-    %453 = stablehlo.add %451, %452 : tensor<1x256x28x28xf32>
-    %454 = stablehlo.convert %453 : (tensor<1x256x28x28xf32>) -> tensor<1x256x28x28xbf16>
-    %455 = stablehlo.maximum %454, %cst_6 : tensor<1x256x28x28xbf16>
-    %456 = stablehlo.convolution(%455, %arg26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x28x28xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %457 = stablehlo.convert %456 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %458 = stablehlo.broadcast_in_dim %457, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %459 = stablehlo.broadcast_in_dim %arg154, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %460 = stablehlo.subtract %458, %459 : tensor<1x256x14x14xf32>
-    %461 = stablehlo.broadcast_in_dim %460, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %462 = stablehlo.broadcast_in_dim %arg155, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %463 = stablehlo.multiply %461, %462 : tensor<1x256x14x14xf32>
-    %464 = stablehlo.convert %arg156 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %465 = stablehlo.broadcast_in_dim %463, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %466 = stablehlo.broadcast_in_dim %464, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %467 = stablehlo.multiply %465, %466 : tensor<1x256x14x14xf32>
-    %468 = stablehlo.convert %arg157 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %469 = stablehlo.broadcast_in_dim %467, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %470 = stablehlo.broadcast_in_dim %468, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %471 = stablehlo.add %469, %470 : tensor<1x256x14x14xf32>
-    %472 = stablehlo.convert %471 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %473 = stablehlo.maximum %472, %cst_7 : tensor<1x256x14x14xbf16>
-    %474 = stablehlo.convolution(%473, %arg27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %475 = stablehlo.convert %474 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xf32>
-    %476 = stablehlo.broadcast_in_dim %475, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %477 = stablehlo.broadcast_in_dim %arg158, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %478 = stablehlo.subtract %476, %477 : tensor<1x1024x14x14xf32>
-    %479 = stablehlo.broadcast_in_dim %478, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %480 = stablehlo.broadcast_in_dim %arg159, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %481 = stablehlo.multiply %479, %480 : tensor<1x1024x14x14xf32>
-    %482 = stablehlo.convert %arg160 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %483 = stablehlo.broadcast_in_dim %481, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %484 = stablehlo.broadcast_in_dim %482, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %485 = stablehlo.multiply %483, %484 : tensor<1x1024x14x14xf32>
-    %486 = stablehlo.convert %arg161 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %487 = stablehlo.broadcast_in_dim %485, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %488 = stablehlo.broadcast_in_dim %486, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %489 = stablehlo.add %487, %488 : tensor<1x1024x14x14xf32>
-    %490 = stablehlo.convert %489 : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xbf16>
-    %491 = stablehlo.convolution(%437, %arg28) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x28x28xbf16>, tensor<1024x512x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %492 = stablehlo.convert %491 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xf32>
-    %493 = stablehlo.broadcast_in_dim %492, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %494 = stablehlo.broadcast_in_dim %arg162, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %495 = stablehlo.subtract %493, %494 : tensor<1x1024x14x14xf32>
-    %496 = stablehlo.broadcast_in_dim %495, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %497 = stablehlo.broadcast_in_dim %arg163, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %498 = stablehlo.multiply %496, %497 : tensor<1x1024x14x14xf32>
-    %499 = stablehlo.convert %arg164 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %500 = stablehlo.broadcast_in_dim %498, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %501 = stablehlo.broadcast_in_dim %499, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %502 = stablehlo.multiply %500, %501 : tensor<1x1024x14x14xf32>
-    %503 = stablehlo.convert %arg165 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %504 = stablehlo.broadcast_in_dim %502, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %505 = stablehlo.broadcast_in_dim %503, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %506 = stablehlo.add %504, %505 : tensor<1x1024x14x14xf32>
-    %507 = stablehlo.convert %506 : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xbf16>
-    %508 = stablehlo.add %490, %507 : tensor<1x1024x14x14xbf16>
-    %509 = stablehlo.maximum %508, %cst_8 : tensor<1x1024x14x14xbf16>
-    %510 = stablehlo.convolution(%509, %arg29) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x14x14xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x14x14xbf16>
-    %511 = stablehlo.convert %510 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %512 = stablehlo.broadcast_in_dim %511, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %513 = stablehlo.broadcast_in_dim %arg166, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %514 = stablehlo.subtract %512, %513 : tensor<1x256x14x14xf32>
-    %515 = stablehlo.broadcast_in_dim %514, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %516 = stablehlo.broadcast_in_dim %arg167, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %517 = stablehlo.multiply %515, %516 : tensor<1x256x14x14xf32>
-    %518 = stablehlo.convert %arg168 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %519 = stablehlo.broadcast_in_dim %517, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %520 = stablehlo.broadcast_in_dim %518, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %521 = stablehlo.multiply %519, %520 : tensor<1x256x14x14xf32>
-    %522 = stablehlo.convert %arg169 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %523 = stablehlo.broadcast_in_dim %521, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %524 = stablehlo.broadcast_in_dim %522, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %525 = stablehlo.add %523, %524 : tensor<1x256x14x14xf32>
-    %526 = stablehlo.convert %525 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %527 = stablehlo.maximum %526, %cst_7 : tensor<1x256x14x14xbf16>
-    %528 = stablehlo.convolution(%527, %arg30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %529 = stablehlo.convert %528 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %530 = stablehlo.broadcast_in_dim %529, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %531 = stablehlo.broadcast_in_dim %arg170, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %532 = stablehlo.subtract %530, %531 : tensor<1x256x14x14xf32>
-    %533 = stablehlo.broadcast_in_dim %532, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %534 = stablehlo.broadcast_in_dim %arg171, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %535 = stablehlo.multiply %533, %534 : tensor<1x256x14x14xf32>
-    %536 = stablehlo.convert %arg172 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %537 = stablehlo.broadcast_in_dim %535, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %538 = stablehlo.broadcast_in_dim %536, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %539 = stablehlo.multiply %537, %538 : tensor<1x256x14x14xf32>
-    %540 = stablehlo.convert %arg173 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %541 = stablehlo.broadcast_in_dim %539, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %542 = stablehlo.broadcast_in_dim %540, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %543 = stablehlo.add %541, %542 : tensor<1x256x14x14xf32>
-    %544 = stablehlo.convert %543 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %545 = stablehlo.maximum %544, %cst_7 : tensor<1x256x14x14xbf16>
-    %546 = stablehlo.convolution(%545, %arg31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %547 = stablehlo.convert %546 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xf32>
-    %548 = stablehlo.broadcast_in_dim %547, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %549 = stablehlo.broadcast_in_dim %arg174, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %550 = stablehlo.subtract %548, %549 : tensor<1x1024x14x14xf32>
-    %551 = stablehlo.broadcast_in_dim %550, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %552 = stablehlo.broadcast_in_dim %arg175, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %553 = stablehlo.multiply %551, %552 : tensor<1x1024x14x14xf32>
-    %554 = stablehlo.convert %arg176 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %555 = stablehlo.broadcast_in_dim %553, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %556 = stablehlo.broadcast_in_dim %554, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %557 = stablehlo.multiply %555, %556 : tensor<1x1024x14x14xf32>
-    %558 = stablehlo.convert %arg177 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %559 = stablehlo.broadcast_in_dim %557, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %560 = stablehlo.broadcast_in_dim %558, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %561 = stablehlo.add %559, %560 : tensor<1x1024x14x14xf32>
-    %562 = stablehlo.convert %561 : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xbf16>
-    %563 = stablehlo.add %562, %509 : tensor<1x1024x14x14xbf16>
-    %564 = stablehlo.maximum %563, %cst_8 : tensor<1x1024x14x14xbf16>
-    %565 = stablehlo.convolution(%564, %arg32) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x14x14xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x14x14xbf16>
-    %566 = stablehlo.convert %565 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %567 = stablehlo.broadcast_in_dim %566, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %568 = stablehlo.broadcast_in_dim %arg178, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %569 = stablehlo.subtract %567, %568 : tensor<1x256x14x14xf32>
-    %570 = stablehlo.broadcast_in_dim %569, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %571 = stablehlo.broadcast_in_dim %arg179, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %572 = stablehlo.multiply %570, %571 : tensor<1x256x14x14xf32>
-    %573 = stablehlo.convert %arg180 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %574 = stablehlo.broadcast_in_dim %572, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %575 = stablehlo.broadcast_in_dim %573, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %576 = stablehlo.multiply %574, %575 : tensor<1x256x14x14xf32>
-    %577 = stablehlo.convert %arg181 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %578 = stablehlo.broadcast_in_dim %576, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %579 = stablehlo.broadcast_in_dim %577, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %580 = stablehlo.add %578, %579 : tensor<1x256x14x14xf32>
-    %581 = stablehlo.convert %580 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %582 = stablehlo.maximum %581, %cst_7 : tensor<1x256x14x14xbf16>
-    %583 = stablehlo.convolution(%582, %arg33) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %584 = stablehlo.convert %583 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %585 = stablehlo.broadcast_in_dim %584, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %586 = stablehlo.broadcast_in_dim %arg182, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %587 = stablehlo.subtract %585, %586 : tensor<1x256x14x14xf32>
-    %588 = stablehlo.broadcast_in_dim %587, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %589 = stablehlo.broadcast_in_dim %arg183, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %590 = stablehlo.multiply %588, %589 : tensor<1x256x14x14xf32>
-    %591 = stablehlo.convert %arg184 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %592 = stablehlo.broadcast_in_dim %590, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %593 = stablehlo.broadcast_in_dim %591, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %594 = stablehlo.multiply %592, %593 : tensor<1x256x14x14xf32>
-    %595 = stablehlo.convert %arg185 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %596 = stablehlo.broadcast_in_dim %594, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %597 = stablehlo.broadcast_in_dim %595, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %598 = stablehlo.add %596, %597 : tensor<1x256x14x14xf32>
-    %599 = stablehlo.convert %598 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %600 = stablehlo.maximum %599, %cst_7 : tensor<1x256x14x14xbf16>
-    %601 = stablehlo.convolution(%600, %arg34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %602 = stablehlo.convert %601 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xf32>
-    %603 = stablehlo.broadcast_in_dim %602, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %604 = stablehlo.broadcast_in_dim %arg186, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %605 = stablehlo.subtract %603, %604 : tensor<1x1024x14x14xf32>
-    %606 = stablehlo.broadcast_in_dim %605, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %607 = stablehlo.broadcast_in_dim %arg187, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %608 = stablehlo.multiply %606, %607 : tensor<1x1024x14x14xf32>
-    %609 = stablehlo.convert %arg188 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %610 = stablehlo.broadcast_in_dim %608, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %611 = stablehlo.broadcast_in_dim %609, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %612 = stablehlo.multiply %610, %611 : tensor<1x1024x14x14xf32>
-    %613 = stablehlo.convert %arg189 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %614 = stablehlo.broadcast_in_dim %612, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %615 = stablehlo.broadcast_in_dim %613, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %616 = stablehlo.add %614, %615 : tensor<1x1024x14x14xf32>
-    %617 = stablehlo.convert %616 : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xbf16>
-    %618 = stablehlo.add %617, %564 : tensor<1x1024x14x14xbf16>
-    %619 = stablehlo.maximum %618, %cst_8 : tensor<1x1024x14x14xbf16>
-    %620 = stablehlo.convolution(%619, %arg35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x14x14xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x14x14xbf16>
-    %621 = stablehlo.convert %620 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %622 = stablehlo.broadcast_in_dim %621, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %623 = stablehlo.broadcast_in_dim %arg190, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %624 = stablehlo.subtract %622, %623 : tensor<1x256x14x14xf32>
-    %625 = stablehlo.broadcast_in_dim %624, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %626 = stablehlo.broadcast_in_dim %arg191, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %627 = stablehlo.multiply %625, %626 : tensor<1x256x14x14xf32>
-    %628 = stablehlo.convert %arg192 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %629 = stablehlo.broadcast_in_dim %627, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %630 = stablehlo.broadcast_in_dim %628, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %631 = stablehlo.multiply %629, %630 : tensor<1x256x14x14xf32>
-    %632 = stablehlo.convert %arg193 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %633 = stablehlo.broadcast_in_dim %631, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %634 = stablehlo.broadcast_in_dim %632, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %635 = stablehlo.add %633, %634 : tensor<1x256x14x14xf32>
-    %636 = stablehlo.convert %635 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %637 = stablehlo.maximum %636, %cst_7 : tensor<1x256x14x14xbf16>
-    %638 = stablehlo.convolution(%637, %arg36) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %639 = stablehlo.convert %638 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %640 = stablehlo.broadcast_in_dim %639, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %641 = stablehlo.broadcast_in_dim %arg194, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %642 = stablehlo.subtract %640, %641 : tensor<1x256x14x14xf32>
-    %643 = stablehlo.broadcast_in_dim %642, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %644 = stablehlo.broadcast_in_dim %arg195, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %645 = stablehlo.multiply %643, %644 : tensor<1x256x14x14xf32>
-    %646 = stablehlo.convert %arg196 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %647 = stablehlo.broadcast_in_dim %645, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %648 = stablehlo.broadcast_in_dim %646, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %649 = stablehlo.multiply %647, %648 : tensor<1x256x14x14xf32>
-    %650 = stablehlo.convert %arg197 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %651 = stablehlo.broadcast_in_dim %649, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %652 = stablehlo.broadcast_in_dim %650, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %653 = stablehlo.add %651, %652 : tensor<1x256x14x14xf32>
-    %654 = stablehlo.convert %653 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %655 = stablehlo.maximum %654, %cst_7 : tensor<1x256x14x14xbf16>
-    %656 = stablehlo.convolution(%655, %arg37) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %657 = stablehlo.convert %656 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xf32>
-    %658 = stablehlo.broadcast_in_dim %657, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %659 = stablehlo.broadcast_in_dim %arg198, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %660 = stablehlo.subtract %658, %659 : tensor<1x1024x14x14xf32>
-    %661 = stablehlo.broadcast_in_dim %660, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %662 = stablehlo.broadcast_in_dim %arg199, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %663 = stablehlo.multiply %661, %662 : tensor<1x1024x14x14xf32>
-    %664 = stablehlo.convert %arg200 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %665 = stablehlo.broadcast_in_dim %663, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %666 = stablehlo.broadcast_in_dim %664, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %667 = stablehlo.multiply %665, %666 : tensor<1x1024x14x14xf32>
-    %668 = stablehlo.convert %arg201 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %669 = stablehlo.broadcast_in_dim %667, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %670 = stablehlo.broadcast_in_dim %668, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %671 = stablehlo.add %669, %670 : tensor<1x1024x14x14xf32>
-    %672 = stablehlo.convert %671 : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xbf16>
-    %673 = stablehlo.add %672, %619 : tensor<1x1024x14x14xbf16>
-    %674 = stablehlo.maximum %673, %cst_8 : tensor<1x1024x14x14xbf16>
-    %675 = stablehlo.convolution(%674, %arg38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x14x14xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x14x14xbf16>
-    %676 = stablehlo.convert %675 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %677 = stablehlo.broadcast_in_dim %676, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %678 = stablehlo.broadcast_in_dim %arg202, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %679 = stablehlo.subtract %677, %678 : tensor<1x256x14x14xf32>
-    %680 = stablehlo.broadcast_in_dim %679, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %681 = stablehlo.broadcast_in_dim %arg203, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %682 = stablehlo.multiply %680, %681 : tensor<1x256x14x14xf32>
-    %683 = stablehlo.convert %arg204 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %684 = stablehlo.broadcast_in_dim %682, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %685 = stablehlo.broadcast_in_dim %683, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %686 = stablehlo.multiply %684, %685 : tensor<1x256x14x14xf32>
-    %687 = stablehlo.convert %arg205 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %688 = stablehlo.broadcast_in_dim %686, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %689 = stablehlo.broadcast_in_dim %687, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %690 = stablehlo.add %688, %689 : tensor<1x256x14x14xf32>
-    %691 = stablehlo.convert %690 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %692 = stablehlo.maximum %691, %cst_7 : tensor<1x256x14x14xbf16>
-    %693 = stablehlo.convolution(%692, %arg39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %694 = stablehlo.convert %693 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %695 = stablehlo.broadcast_in_dim %694, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %696 = stablehlo.broadcast_in_dim %arg206, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %697 = stablehlo.subtract %695, %696 : tensor<1x256x14x14xf32>
-    %698 = stablehlo.broadcast_in_dim %697, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %699 = stablehlo.broadcast_in_dim %arg207, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %700 = stablehlo.multiply %698, %699 : tensor<1x256x14x14xf32>
-    %701 = stablehlo.convert %arg208 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %702 = stablehlo.broadcast_in_dim %700, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %703 = stablehlo.broadcast_in_dim %701, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %704 = stablehlo.multiply %702, %703 : tensor<1x256x14x14xf32>
-    %705 = stablehlo.convert %arg209 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %706 = stablehlo.broadcast_in_dim %704, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %707 = stablehlo.broadcast_in_dim %705, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %708 = stablehlo.add %706, %707 : tensor<1x256x14x14xf32>
-    %709 = stablehlo.convert %708 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %710 = stablehlo.maximum %709, %cst_7 : tensor<1x256x14x14xbf16>
-    %711 = stablehlo.convolution(%710, %arg40) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %712 = stablehlo.convert %711 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xf32>
-    %713 = stablehlo.broadcast_in_dim %712, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %714 = stablehlo.broadcast_in_dim %arg210, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %715 = stablehlo.subtract %713, %714 : tensor<1x1024x14x14xf32>
-    %716 = stablehlo.broadcast_in_dim %715, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %717 = stablehlo.broadcast_in_dim %arg211, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %718 = stablehlo.multiply %716, %717 : tensor<1x1024x14x14xf32>
-    %719 = stablehlo.convert %arg212 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %720 = stablehlo.broadcast_in_dim %718, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %721 = stablehlo.broadcast_in_dim %719, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %722 = stablehlo.multiply %720, %721 : tensor<1x1024x14x14xf32>
-    %723 = stablehlo.convert %arg213 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %724 = stablehlo.broadcast_in_dim %722, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %725 = stablehlo.broadcast_in_dim %723, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %726 = stablehlo.add %724, %725 : tensor<1x1024x14x14xf32>
-    %727 = stablehlo.convert %726 : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xbf16>
-    %728 = stablehlo.add %727, %674 : tensor<1x1024x14x14xbf16>
-    %729 = stablehlo.maximum %728, %cst_8 : tensor<1x1024x14x14xbf16>
-    %730 = stablehlo.convolution(%729, %arg41) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x14x14xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x14x14xbf16>
-    %731 = stablehlo.convert %730 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %732 = stablehlo.broadcast_in_dim %731, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %733 = stablehlo.broadcast_in_dim %arg214, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %734 = stablehlo.subtract %732, %733 : tensor<1x256x14x14xf32>
-    %735 = stablehlo.broadcast_in_dim %734, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %736 = stablehlo.broadcast_in_dim %arg215, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %737 = stablehlo.multiply %735, %736 : tensor<1x256x14x14xf32>
-    %738 = stablehlo.convert %arg216 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %739 = stablehlo.broadcast_in_dim %737, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %740 = stablehlo.broadcast_in_dim %738, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %741 = stablehlo.multiply %739, %740 : tensor<1x256x14x14xf32>
-    %742 = stablehlo.convert %arg217 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %743 = stablehlo.broadcast_in_dim %741, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %744 = stablehlo.broadcast_in_dim %742, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %745 = stablehlo.add %743, %744 : tensor<1x256x14x14xf32>
-    %746 = stablehlo.convert %745 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %747 = stablehlo.maximum %746, %cst_7 : tensor<1x256x14x14xbf16>
-    %748 = stablehlo.convolution(%747, %arg42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<256x256x3x3xbf16>) -> tensor<1x256x14x14xbf16>
-    %749 = stablehlo.convert %748 : (tensor<1x256x14x14xbf16>) -> tensor<1x256x14x14xf32>
-    %750 = stablehlo.broadcast_in_dim %749, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %751 = stablehlo.broadcast_in_dim %arg218, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %752 = stablehlo.subtract %750, %751 : tensor<1x256x14x14xf32>
-    %753 = stablehlo.broadcast_in_dim %752, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %754 = stablehlo.broadcast_in_dim %arg219, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %755 = stablehlo.multiply %753, %754 : tensor<1x256x14x14xf32>
-    %756 = stablehlo.convert %arg220 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %757 = stablehlo.broadcast_in_dim %755, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %758 = stablehlo.broadcast_in_dim %756, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %759 = stablehlo.multiply %757, %758 : tensor<1x256x14x14xf32>
-    %760 = stablehlo.convert %arg221 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %761 = stablehlo.broadcast_in_dim %759, dims = [0, 1, 2, 3] : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
-    %762 = stablehlo.broadcast_in_dim %760, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x14x14xf32>
-    %763 = stablehlo.add %761, %762 : tensor<1x256x14x14xf32>
-    %764 = stablehlo.convert %763 : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xbf16>
-    %765 = stablehlo.maximum %764, %cst_7 : tensor<1x256x14x14xbf16>
-    %766 = stablehlo.convolution(%765, %arg43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x14x14xbf16>, tensor<1024x256x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %767 = stablehlo.convert %766 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xf32>
-    %768 = stablehlo.broadcast_in_dim %767, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %769 = stablehlo.broadcast_in_dim %arg222, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %770 = stablehlo.subtract %768, %769 : tensor<1x1024x14x14xf32>
-    %771 = stablehlo.broadcast_in_dim %770, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %772 = stablehlo.broadcast_in_dim %arg223, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %773 = stablehlo.multiply %771, %772 : tensor<1x1024x14x14xf32>
-    %774 = stablehlo.convert %arg224 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %775 = stablehlo.broadcast_in_dim %773, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %776 = stablehlo.broadcast_in_dim %774, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %777 = stablehlo.multiply %775, %776 : tensor<1x1024x14x14xf32>
-    %778 = stablehlo.convert %arg225 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %779 = stablehlo.broadcast_in_dim %777, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xf32>
-    %780 = stablehlo.broadcast_in_dim %778, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x14x14xf32>
-    %781 = stablehlo.add %779, %780 : tensor<1x1024x14x14xf32>
-    %782 = stablehlo.convert %781 : (tensor<1x1024x14x14xf32>) -> tensor<1x1024x14x14xbf16>
-    %783 = stablehlo.add %782, %729 : tensor<1x1024x14x14xbf16>
-    %784 = stablehlo.maximum %783, %cst_8 : tensor<1x1024x14x14xbf16>
-    %785 = stablehlo.convolution(%784, %arg44) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x14x14xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x14x14xbf16>
-    %786 = stablehlo.convert %785 : (tensor<1x512x14x14xbf16>) -> tensor<1x512x14x14xf32>
-    %787 = stablehlo.broadcast_in_dim %786, dims = [0, 1, 2, 3] : (tensor<1x512x14x14xf32>) -> tensor<1x512x14x14xf32>
-    %788 = stablehlo.broadcast_in_dim %arg226, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x14x14xf32>
-    %789 = stablehlo.subtract %787, %788 : tensor<1x512x14x14xf32>
-    %790 = stablehlo.broadcast_in_dim %789, dims = [0, 1, 2, 3] : (tensor<1x512x14x14xf32>) -> tensor<1x512x14x14xf32>
-    %791 = stablehlo.broadcast_in_dim %arg227, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x14x14xf32>
-    %792 = stablehlo.multiply %790, %791 : tensor<1x512x14x14xf32>
-    %793 = stablehlo.convert %arg228 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %794 = stablehlo.broadcast_in_dim %792, dims = [0, 1, 2, 3] : (tensor<1x512x14x14xf32>) -> tensor<1x512x14x14xf32>
-    %795 = stablehlo.broadcast_in_dim %793, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x14x14xf32>
-    %796 = stablehlo.multiply %794, %795 : tensor<1x512x14x14xf32>
-    %797 = stablehlo.convert %arg229 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %798 = stablehlo.broadcast_in_dim %796, dims = [0, 1, 2, 3] : (tensor<1x512x14x14xf32>) -> tensor<1x512x14x14xf32>
-    %799 = stablehlo.broadcast_in_dim %797, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x14x14xf32>
-    %800 = stablehlo.add %798, %799 : tensor<1x512x14x14xf32>
-    %801 = stablehlo.convert %800 : (tensor<1x512x14x14xf32>) -> tensor<1x512x14x14xbf16>
-    %802 = stablehlo.maximum %801, %cst_9 : tensor<1x512x14x14xbf16>
-    %803 = stablehlo.convolution(%802, %arg45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x14x14xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x7x7xbf16>
-    %804 = stablehlo.convert %803 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %805 = stablehlo.broadcast_in_dim %804, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %806 = stablehlo.broadcast_in_dim %arg230, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %807 = stablehlo.subtract %805, %806 : tensor<1x512x7x7xf32>
-    %808 = stablehlo.broadcast_in_dim %807, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %809 = stablehlo.broadcast_in_dim %arg231, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %810 = stablehlo.multiply %808, %809 : tensor<1x512x7x7xf32>
-    %811 = stablehlo.convert %arg232 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %812 = stablehlo.broadcast_in_dim %810, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %813 = stablehlo.broadcast_in_dim %811, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %814 = stablehlo.multiply %812, %813 : tensor<1x512x7x7xf32>
-    %815 = stablehlo.convert %arg233 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %816 = stablehlo.broadcast_in_dim %814, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %817 = stablehlo.broadcast_in_dim %815, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %818 = stablehlo.add %816, %817 : tensor<1x512x7x7xf32>
-    %819 = stablehlo.convert %818 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %820 = stablehlo.maximum %819, %cst_10 : tensor<1x512x7x7xbf16>
-    %821 = stablehlo.convolution(%820, %arg46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<2048x512x1x1xbf16>) -> tensor<1x2048x7x7xbf16>
-    %822 = stablehlo.convert %821 : (tensor<1x2048x7x7xbf16>) -> tensor<1x2048x7x7xf32>
-    %823 = stablehlo.broadcast_in_dim %822, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %824 = stablehlo.broadcast_in_dim %arg234, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %825 = stablehlo.subtract %823, %824 : tensor<1x2048x7x7xf32>
-    %826 = stablehlo.broadcast_in_dim %825, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %827 = stablehlo.broadcast_in_dim %arg235, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %828 = stablehlo.multiply %826, %827 : tensor<1x2048x7x7xf32>
-    %829 = stablehlo.convert %arg236 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %830 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %831 = stablehlo.broadcast_in_dim %829, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %832 = stablehlo.multiply %830, %831 : tensor<1x2048x7x7xf32>
-    %833 = stablehlo.convert %arg237 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %834 = stablehlo.broadcast_in_dim %832, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %835 = stablehlo.broadcast_in_dim %833, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %836 = stablehlo.add %834, %835 : tensor<1x2048x7x7xf32>
-    %837 = stablehlo.convert %836 : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xbf16>
-    %838 = stablehlo.convolution(%784, %arg47) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x14x14xbf16>, tensor<2048x1024x1x1xbf16>) -> tensor<1x2048x7x7xbf16>
-    %839 = stablehlo.convert %838 : (tensor<1x2048x7x7xbf16>) -> tensor<1x2048x7x7xf32>
-    %840 = stablehlo.broadcast_in_dim %839, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %841 = stablehlo.broadcast_in_dim %arg238, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %842 = stablehlo.subtract %840, %841 : tensor<1x2048x7x7xf32>
-    %843 = stablehlo.broadcast_in_dim %842, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %844 = stablehlo.broadcast_in_dim %arg239, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %845 = stablehlo.multiply %843, %844 : tensor<1x2048x7x7xf32>
-    %846 = stablehlo.convert %arg240 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %847 = stablehlo.broadcast_in_dim %845, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %848 = stablehlo.broadcast_in_dim %846, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %849 = stablehlo.multiply %847, %848 : tensor<1x2048x7x7xf32>
-    %850 = stablehlo.convert %arg241 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %851 = stablehlo.broadcast_in_dim %849, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %852 = stablehlo.broadcast_in_dim %850, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %853 = stablehlo.add %851, %852 : tensor<1x2048x7x7xf32>
-    %854 = stablehlo.convert %853 : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xbf16>
-    %855 = stablehlo.add %837, %854 : tensor<1x2048x7x7xbf16>
-    %856 = stablehlo.maximum %855, %cst_11 : tensor<1x2048x7x7xbf16>
-    %857 = stablehlo.convolution(%856, %arg48) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2048x7x7xbf16>, tensor<512x2048x1x1xbf16>) -> tensor<1x512x7x7xbf16>
-    %858 = stablehlo.convert %857 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %859 = stablehlo.broadcast_in_dim %858, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %860 = stablehlo.broadcast_in_dim %arg242, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %861 = stablehlo.subtract %859, %860 : tensor<1x512x7x7xf32>
-    %862 = stablehlo.broadcast_in_dim %861, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %863 = stablehlo.broadcast_in_dim %arg243, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %864 = stablehlo.multiply %862, %863 : tensor<1x512x7x7xf32>
-    %865 = stablehlo.convert %arg244 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %866 = stablehlo.broadcast_in_dim %864, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %867 = stablehlo.broadcast_in_dim %865, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %868 = stablehlo.multiply %866, %867 : tensor<1x512x7x7xf32>
-    %869 = stablehlo.convert %arg245 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %870 = stablehlo.broadcast_in_dim %868, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %871 = stablehlo.broadcast_in_dim %869, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %872 = stablehlo.add %870, %871 : tensor<1x512x7x7xf32>
-    %873 = stablehlo.convert %872 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %874 = stablehlo.maximum %873, %cst_10 : tensor<1x512x7x7xbf16>
-    %875 = stablehlo.convolution(%874, %arg49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x7x7xbf16>
-    %876 = stablehlo.convert %875 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %877 = stablehlo.broadcast_in_dim %876, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %878 = stablehlo.broadcast_in_dim %arg246, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %879 = stablehlo.subtract %877, %878 : tensor<1x512x7x7xf32>
-    %880 = stablehlo.broadcast_in_dim %879, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %881 = stablehlo.broadcast_in_dim %arg247, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %882 = stablehlo.multiply %880, %881 : tensor<1x512x7x7xf32>
-    %883 = stablehlo.convert %arg248 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %884 = stablehlo.broadcast_in_dim %882, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %885 = stablehlo.broadcast_in_dim %883, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %886 = stablehlo.multiply %884, %885 : tensor<1x512x7x7xf32>
-    %887 = stablehlo.convert %arg249 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %888 = stablehlo.broadcast_in_dim %886, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %889 = stablehlo.broadcast_in_dim %887, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %890 = stablehlo.add %888, %889 : tensor<1x512x7x7xf32>
-    %891 = stablehlo.convert %890 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %892 = stablehlo.maximum %891, %cst_10 : tensor<1x512x7x7xbf16>
-    %893 = stablehlo.convolution(%892, %arg50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<2048x512x1x1xbf16>) -> tensor<1x2048x7x7xbf16>
-    %894 = stablehlo.convert %893 : (tensor<1x2048x7x7xbf16>) -> tensor<1x2048x7x7xf32>
-    %895 = stablehlo.broadcast_in_dim %894, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %896 = stablehlo.broadcast_in_dim %arg250, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %897 = stablehlo.subtract %895, %896 : tensor<1x2048x7x7xf32>
-    %898 = stablehlo.broadcast_in_dim %897, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %899 = stablehlo.broadcast_in_dim %arg251, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %900 = stablehlo.multiply %898, %899 : tensor<1x2048x7x7xf32>
-    %901 = stablehlo.convert %arg252 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %902 = stablehlo.broadcast_in_dim %900, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %903 = stablehlo.broadcast_in_dim %901, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %904 = stablehlo.multiply %902, %903 : tensor<1x2048x7x7xf32>
-    %905 = stablehlo.convert %arg253 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %906 = stablehlo.broadcast_in_dim %904, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %907 = stablehlo.broadcast_in_dim %905, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %908 = stablehlo.add %906, %907 : tensor<1x2048x7x7xf32>
-    %909 = stablehlo.convert %908 : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xbf16>
-    %910 = stablehlo.add %909, %856 : tensor<1x2048x7x7xbf16>
-    %911 = stablehlo.maximum %910, %cst_11 : tensor<1x2048x7x7xbf16>
-    %912 = stablehlo.convolution(%911, %arg51) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2048x7x7xbf16>, tensor<512x2048x1x1xbf16>) -> tensor<1x512x7x7xbf16>
-    %913 = stablehlo.convert %912 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %914 = stablehlo.broadcast_in_dim %913, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %915 = stablehlo.broadcast_in_dim %arg254, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %916 = stablehlo.subtract %914, %915 : tensor<1x512x7x7xf32>
-    %917 = stablehlo.broadcast_in_dim %916, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %918 = stablehlo.broadcast_in_dim %arg255, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %919 = stablehlo.multiply %917, %918 : tensor<1x512x7x7xf32>
-    %920 = stablehlo.convert %arg256 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %921 = stablehlo.broadcast_in_dim %919, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %922 = stablehlo.broadcast_in_dim %920, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %923 = stablehlo.multiply %921, %922 : tensor<1x512x7x7xf32>
-    %924 = stablehlo.convert %arg257 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %925 = stablehlo.broadcast_in_dim %923, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %926 = stablehlo.broadcast_in_dim %924, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %927 = stablehlo.add %925, %926 : tensor<1x512x7x7xf32>
-    %928 = stablehlo.convert %927 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %929 = stablehlo.maximum %928, %cst_10 : tensor<1x512x7x7xbf16>
-    %930 = stablehlo.convolution(%929, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<512x512x3x3xbf16>) -> tensor<1x512x7x7xbf16>
-    %931 = stablehlo.convert %930 : (tensor<1x512x7x7xbf16>) -> tensor<1x512x7x7xf32>
-    %932 = stablehlo.broadcast_in_dim %931, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %933 = stablehlo.broadcast_in_dim %arg258, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %934 = stablehlo.subtract %932, %933 : tensor<1x512x7x7xf32>
-    %935 = stablehlo.broadcast_in_dim %934, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %936 = stablehlo.broadcast_in_dim %arg259, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %937 = stablehlo.multiply %935, %936 : tensor<1x512x7x7xf32>
-    %938 = stablehlo.convert %arg260 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %939 = stablehlo.broadcast_in_dim %937, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %940 = stablehlo.broadcast_in_dim %938, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %941 = stablehlo.multiply %939, %940 : tensor<1x512x7x7xf32>
-    %942 = stablehlo.convert %arg261 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %943 = stablehlo.broadcast_in_dim %941, dims = [0, 1, 2, 3] : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
-    %944 = stablehlo.broadcast_in_dim %942, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x7x7xf32>
-    %945 = stablehlo.add %943, %944 : tensor<1x512x7x7xf32>
-    %946 = stablehlo.convert %945 : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xbf16>
-    %947 = stablehlo.maximum %946, %cst_10 : tensor<1x512x7x7xbf16>
-    %948 = stablehlo.convolution(%947, %arg53) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x7x7xbf16>, tensor<2048x512x1x1xbf16>) -> tensor<1x2048x7x7xbf16>
-    %949 = stablehlo.convert %948 : (tensor<1x2048x7x7xbf16>) -> tensor<1x2048x7x7xf32>
-    %950 = stablehlo.broadcast_in_dim %949, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %951 = stablehlo.broadcast_in_dim %arg262, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %952 = stablehlo.subtract %950, %951 : tensor<1x2048x7x7xf32>
-    %953 = stablehlo.broadcast_in_dim %952, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %954 = stablehlo.broadcast_in_dim %arg263, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %955 = stablehlo.multiply %953, %954 : tensor<1x2048x7x7xf32>
-    %956 = stablehlo.convert %arg264 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %957 = stablehlo.broadcast_in_dim %955, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %958 = stablehlo.broadcast_in_dim %956, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %959 = stablehlo.multiply %957, %958 : tensor<1x2048x7x7xf32>
-    %960 = stablehlo.convert %arg265 : (tensor<2048x1x1xbf16>) -> tensor<2048x1x1xf32>
-    %961 = stablehlo.broadcast_in_dim %959, dims = [0, 1, 2, 3] : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xf32>
-    %962 = stablehlo.broadcast_in_dim %960, dims = [1, 2, 3] : (tensor<2048x1x1xf32>) -> tensor<1x2048x7x7xf32>
-    %963 = stablehlo.add %961, %962 : tensor<1x2048x7x7xf32>
-    %964 = stablehlo.convert %963 : (tensor<1x2048x7x7xf32>) -> tensor<1x2048x7x7xbf16>
-    %965 = stablehlo.add %964, %911 : tensor<1x2048x7x7xbf16>
-    %966 = stablehlo.maximum %965, %cst_11 : tensor<1x2048x7x7xbf16>
-    %967 = stablehlo.reduce(%966 init: %cst_12) applies stablehlo.add across dimensions = [2, 3] : (tensor<1x2048x7x7xbf16>, tensor<bf16>) -> tensor<1x2048xbf16>
-    %968 = stablehlo.reshape %967 : (tensor<1x2048xbf16>) -> tensor<1x2048x1x1xbf16>
-    %969 = stablehlo.convert %cst_13 : (tensor<1xi64>) -> tensor<1xbf16>
-    %970 = stablehlo.reshape %969 : (tensor<1xbf16>) -> tensor<bf16>
-    %971 = stablehlo.broadcast_in_dim %968, dims = [0, 1, 2, 3] : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048x1x1xbf16>
-    %972 = stablehlo.broadcast_in_dim %970, dims = [] : (tensor<bf16>) -> tensor<1x2048x1x1xbf16>
-    %973 = stablehlo.divide %971, %972 : tensor<1x2048x1x1xbf16>
-    %974 = stablehlo.reshape %973 : (tensor<1x2048x1x1xbf16>) -> tensor<1x2048xbf16>
-    %975 = stablehlo.convert %974 : (tensor<1x2048xbf16>) -> tensor<1x2048xf32>
-    %976 = stablehlo.dot_general %975, %arg266, contracting_dims = [1] x [0] : (tensor<1x2048xf32>, tensor<2048x1000xf32>) -> tensor<1x1000xf32>
-    %977 = stablehlo.convert %cst_14 : (tensor<1xi64>) -> tensor<1xf32>
-    %978 = stablehlo.reshape %977 : (tensor<1xf32>) -> tensor<f32>
-    %979 = stablehlo.broadcast_in_dim %976, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %980 = stablehlo.broadcast_in_dim %978, dims = [] : (tensor<f32>) -> tensor<1x1000xf32>
-    %981 = stablehlo.multiply %979, %980 : tensor<1x1000xf32>
-    %982 = stablehlo.broadcast_in_dim %981, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %983 = stablehlo.broadcast_in_dim %arg267, dims = [1] : (tensor<1000xf32>) -> tensor<1x1000xf32>
-    %984 = stablehlo.add %982, %983 : tensor<1x1000xf32>
-    %985 = stablehlo.convert %984 : (tensor<1x1000xf32>) -> tensor<1x1000xbf16>
-    return %985 : tensor<1x1000xbf16>
-  }
-}
diff --git a/mlir_tests/SegFormer.mlir b/mlir_tests/SegFormer.mlir
deleted file mode 100644
index d2ac2890..00000000
--- a/mlir_tests/SegFormer.mlir
+++ /dev/null
@@ -1,2515 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x512x512xbf16>, %arg1: tensor<32x3x7x7xbf16>, %arg2: tensor<32xbf16>, %arg3: tensor<32xbf16>, %arg4: tensor<32xbf16>, %arg5: tensor<32xbf16>, %arg6: tensor<32xbf16>, %arg7: tensor<32x32x8x8xbf16>, %arg8: tensor<32xbf16>, %arg9: tensor<32xbf16>, %arg10: tensor<32xbf16>, %arg11: tensor<32xbf16>, %arg12: tensor<32xbf16>, %arg13: tensor<128x1x3x3xbf16>, %arg14: tensor<128xbf16>, %arg15: tensor<32xbf16>, %arg16: tensor<32xbf16>, %arg17: tensor<32xbf16>, %arg18: tensor<32x32x8x8xbf16>, %arg19: tensor<32xbf16>, %arg20: tensor<32xbf16>, %arg21: tensor<32xbf16>, %arg22: tensor<32xbf16>, %arg23: tensor<32xbf16>, %arg24: tensor<128x1x3x3xbf16>, %arg25: tensor<128xbf16>, %arg26: tensor<32xbf16>, %arg27: tensor<32xbf16>, %arg28: tensor<32xbf16>, %arg29: tensor<64x32x3x3xbf16>, %arg30: tensor<64xbf16>, %arg31: tensor<64xbf16>, %arg32: tensor<64xbf16>, %arg33: tensor<64xbf16>, %arg34: tensor<64xbf16>, %arg35: tensor<64x64x4x4xbf16>, %arg36: tensor<64xbf16>, %arg37: tensor<64xbf16>, %arg38: tensor<64xbf16>, %arg39: tensor<64xbf16>, %arg40: tensor<64xbf16>, %arg41: tensor<256x1x3x3xbf16>, %arg42: tensor<256xbf16>, %arg43: tensor<64xbf16>, %arg44: tensor<64xbf16>, %arg45: tensor<64xbf16>, %arg46: tensor<64x64x4x4xbf16>, %arg47: tensor<64xbf16>, %arg48: tensor<64xbf16>, %arg49: tensor<64xbf16>, %arg50: tensor<64xbf16>, %arg51: tensor<64xbf16>, %arg52: tensor<256x1x3x3xbf16>, %arg53: tensor<256xbf16>, %arg54: tensor<64xbf16>, %arg55: tensor<64xbf16>, %arg56: tensor<64xbf16>, %arg57: tensor<160x64x3x3xbf16>, %arg58: tensor<160xbf16>, %arg59: tensor<160xbf16>, %arg60: tensor<160xbf16>, %arg61: tensor<160xbf16>, %arg62: tensor<160xbf16>, %arg63: tensor<160x160x2x2xbf16>, %arg64: tensor<160xbf16>, %arg65: tensor<160xbf16>, %arg66: tensor<160xbf16>, %arg67: tensor<160xbf16>, %arg68: tensor<160xbf16>, %arg69: tensor<640x1x3x3xbf16>, %arg70: tensor<640xbf16>, %arg71: tensor<160xbf16>, %arg72: tensor<160xbf16>, %arg73: tensor<160xbf16>, %arg74: tensor<160x160x2x2xbf16>, %arg75: tensor<160xbf16>, %arg76: tensor<160xbf16>, %arg77: tensor<160xbf16>, %arg78: tensor<160xbf16>, %arg79: tensor<160xbf16>, %arg80: tensor<640x1x3x3xbf16>, %arg81: tensor<640xbf16>, %arg82: tensor<160xbf16>, %arg83: tensor<160xbf16>, %arg84: tensor<160xbf16>, %arg85: tensor<256x160x3x3xbf16>, %arg86: tensor<256xbf16>, %arg87: tensor<256xbf16>, %arg88: tensor<256xbf16>, %arg89: tensor<256xbf16>, %arg90: tensor<256xbf16>, %arg91: tensor<256xbf16>, %arg92: tensor<256xbf16>, %arg93: tensor<1024x1x3x3xbf16>, %arg94: tensor<1024xbf16>, %arg95: tensor<256xbf16>, %arg96: tensor<256xbf16>, %arg97: tensor<256xbf16>, %arg98: tensor<256xbf16>, %arg99: tensor<256xbf16>, %arg100: tensor<1024x1x3x3xbf16>, %arg101: tensor<1024xbf16>, %arg102: tensor<256xbf16>, %arg103: tensor<256xbf16>, %arg104: tensor<256xbf16>, %arg105: tensor<256xbf16>, %arg106: tensor<256xbf16>, %arg107: tensor<256xbf16>, %arg108: tensor<256xbf16>, %arg109: tensor<256x1024x1x1xbf16>, %arg110: tensor<150x256x1x1xbf16>, %arg111: tensor<150xbf16>, %arg112: tensor<32x32xf32>, %arg113: tensor<32xf32>, %arg114: tensor<32x32xf32>, %arg115: tensor<32xf32>, %arg116: tensor<32x32xf32>, %arg117: tensor<32xf32>, %arg118: tensor<32x32xf32>, %arg119: tensor<32xf32>, %arg120: tensor<32x128xf32>, %arg121: tensor<128xf32>, %arg122: tensor<128x32xbf16>, %arg123: tensor<32x32xf32>, %arg124: tensor<32xf32>, %arg125: tensor<32x32xf32>, %arg126: tensor<32xf32>, %arg127: tensor<32x32xf32>, %arg128: tensor<32xf32>, %arg129: tensor<32x32xf32>, %arg130: tensor<32xf32>, %arg131: tensor<32x128xf32>, %arg132: tensor<128xf32>, %arg133: tensor<128x32xbf16>, %arg134: tensor<64x64xf32>, %arg135: tensor<64xf32>, %arg136: tensor<64x64xf32>, %arg137: tensor<64xf32>, %arg138: tensor<64x64xf32>, %arg139: tensor<64xf32>, %arg140: tensor<64x64xf32>, %arg141: tensor<64xf32>, %arg142: tensor<64x256xf32>, %arg143: tensor<256xf32>, %arg144: tensor<256x64xbf16>, %arg145: tensor<64x64xf32>, %arg146: tensor<64xf32>, %arg147: tensor<64x64xf32>, %arg148: tensor<64xf32>, %arg149: tensor<64x64xf32>, %arg150: tensor<64xf32>, %arg151: tensor<64x64xf32>, %arg152: tensor<64xf32>, %arg153: tensor<64x256xf32>, %arg154: tensor<256xf32>, %arg155: tensor<256x64xbf16>, %arg156: tensor<160x160xf32>, %arg157: tensor<160xf32>, %arg158: tensor<160x160xf32>, %arg159: tensor<160xf32>, %arg160: tensor<160x160xf32>, %arg161: tensor<160xf32>, %arg162: tensor<160x160xf32>, %arg163: tensor<160xf32>, %arg164: tensor<160x640xf32>, %arg165: tensor<640xf32>, %arg166: tensor<640x160xbf16>, %arg167: tensor<160x160xf32>, %arg168: tensor<160xf32>, %arg169: tensor<160x160xf32>, %arg170: tensor<160xf32>, %arg171: tensor<160x160xf32>, %arg172: tensor<160xf32>, %arg173: tensor<160x160xf32>, %arg174: tensor<160xf32>, %arg175: tensor<160x640xf32>, %arg176: tensor<640xf32>, %arg177: tensor<640x160xbf16>, %arg178: tensor<256x256xf32>, %arg179: tensor<256xf32>, %arg180: tensor<256x256xf32>, %arg181: tensor<256xf32>, %arg182: tensor<256x256xf32>, %arg183: tensor<256xf32>, %arg184: tensor<256x256xf32>, %arg185: tensor<256xf32>, %arg186: tensor<256x1024xf32>, %arg187: tensor<1024xf32>, %arg188: tensor<1024x256xbf16>, %arg189: tensor<256x256xf32>, %arg190: tensor<256xf32>, %arg191: tensor<256x256xf32>, %arg192: tensor<256xf32>, %arg193: tensor<256x256xf32>, %arg194: tensor<256xf32>, %arg195: tensor<256x256xf32>, %arg196: tensor<256xf32>, %arg197: tensor<256x1024xf32>, %arg198: tensor<1024xf32>, %arg199: tensor<1024x256xbf16>, %arg200: tensor<32x256xbf16>, %arg201: tensor<256x128x128xbf16>, %arg202: tensor<256x128x128xbf16>, %arg203: tensor<64x256xbf16>, %arg204: tensor<256x64x128xbf16>, %arg205: tensor<256x64x128xbf16>, %arg206: tensor<160x256xbf16>, %arg207: tensor<256x32x128xbf16>, %arg208: tensor<256x32x128xbf16>, %arg209: tensor<256x256xbf16>, %arg210: tensor<256x16x128xbf16>, %arg211: tensor<256x16x128xbf16>, %arg212: tensor<256x1x1xf32>, %arg213: tensor<256x1x1xf32>, %arg214: tensor<256x1x1xbf16>, %arg215: tensor<256x1x1xbf16>) -> tensor<1x150x128x128xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_2 = stablehlo.constant dense<1.000000e+00> : tensor<1x16384x128xbf16>
-    %cst_3 = stablehlo.constant dense<2.000000e+00> : tensor<1x16384x128xbf16>
-    %cst_4 = stablehlo.constant dense<5.000000e-01> : tensor<1x16384x128xbf16>
-    %cst_5 = stablehlo.constant dense<-4.000000e+00> : tensor<1x16384x128xf32>
-    %cst_6 = stablehlo.constant dense<4.000000e+00> : tensor<1x16384x128xf32>
-    %cst_7 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x16384x128xf32>
-    %cst_8 = stablehlo.constant dense<2.77068146E-8> : tensor<1x16384x128xf32>
-    %cst_9 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x16384x128xf32>
-    %cst_10 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x16384x128xf32>
-    %cst_11 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x16384x128xf32>
-    %cst_12 = stablehlo.constant dense<-2.954600e-03> : tensor<1x16384x128xf32>
-    %cst_13 = stablehlo.constant dense<-0.0160960332> : tensor<1x16384x128xf32>
-    %cst_14 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x16384x128xf32>
-    %cst_15 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x16384x128xf32>
-    %cst_16 = stablehlo.constant dense<-0.00168282702> : tensor<1x16384x128xf32>
-    %cst_17 = stablehlo.constant dense<-0.00737332925> : tensor<1x16384x128xf32>
-    %cst_18 = stablehlo.constant dense<-0.0142647391> : tensor<1x16384x128xf32>
-    %cst_19 = stablehlo.constant dense<-1.000000e+00> : tensor<1x16384x128xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x16384x128xf32>
-    %cst_21 = stablehlo.constant dense<1.000000e+00> : tensor<1x4096x256xbf16>
-    %cst_22 = stablehlo.constant dense<2.000000e+00> : tensor<1x4096x256xbf16>
-    %cst_23 = stablehlo.constant dense<5.000000e-01> : tensor<1x4096x256xbf16>
-    %cst_24 = stablehlo.constant dense<-4.000000e+00> : tensor<1x4096x256xf32>
-    %cst_25 = stablehlo.constant dense<4.000000e+00> : tensor<1x4096x256xf32>
-    %cst_26 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x4096x256xf32>
-    %cst_27 = stablehlo.constant dense<2.77068146E-8> : tensor<1x4096x256xf32>
-    %cst_28 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x4096x256xf32>
-    %cst_29 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x4096x256xf32>
-    %cst_30 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x4096x256xf32>
-    %cst_31 = stablehlo.constant dense<-2.954600e-03> : tensor<1x4096x256xf32>
-    %cst_32 = stablehlo.constant dense<-0.0160960332> : tensor<1x4096x256xf32>
-    %cst_33 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x4096x256xf32>
-    %cst_34 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x4096x256xf32>
-    %cst_35 = stablehlo.constant dense<-0.00168282702> : tensor<1x4096x256xf32>
-    %cst_36 = stablehlo.constant dense<-0.00737332925> : tensor<1x4096x256xf32>
-    %cst_37 = stablehlo.constant dense<-0.0142647391> : tensor<1x4096x256xf32>
-    %cst_38 = stablehlo.constant dense<-1.000000e+00> : tensor<1x4096x256xf32>
-    %cst_39 = stablehlo.constant dense<1.000000e+00> : tensor<1x4096x256xf32>
-    %cst_40 = stablehlo.constant dense<1.000000e+00> : tensor<1x1024x640xbf16>
-    %cst_41 = stablehlo.constant dense<2.000000e+00> : tensor<1x1024x640xbf16>
-    %cst_42 = stablehlo.constant dense<5.000000e-01> : tensor<1x1024x640xbf16>
-    %cst_43 = stablehlo.constant dense<-4.000000e+00> : tensor<1x1024x640xf32>
-    %cst_44 = stablehlo.constant dense<4.000000e+00> : tensor<1x1024x640xf32>
-    %cst_45 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x1024x640xf32>
-    %cst_46 = stablehlo.constant dense<2.77068146E-8> : tensor<1x1024x640xf32>
-    %cst_47 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x1024x640xf32>
-    %cst_48 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x1024x640xf32>
-    %cst_49 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x1024x640xf32>
-    %cst_50 = stablehlo.constant dense<-2.954600e-03> : tensor<1x1024x640xf32>
-    %cst_51 = stablehlo.constant dense<-0.0160960332> : tensor<1x1024x640xf32>
-    %cst_52 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x1024x640xf32>
-    %cst_53 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x1024x640xf32>
-    %cst_54 = stablehlo.constant dense<-0.00168282702> : tensor<1x1024x640xf32>
-    %cst_55 = stablehlo.constant dense<-0.00737332925> : tensor<1x1024x640xf32>
-    %cst_56 = stablehlo.constant dense<-0.0142647391> : tensor<1x1024x640xf32>
-    %cst_57 = stablehlo.constant dense<-1.000000e+00> : tensor<1x1024x640xf32>
-    %cst_58 = stablehlo.constant dense<1.000000e+00> : tensor<1x1024x640xf32>
-    %cst_59 = stablehlo.constant dense<1.000000e+00> : tensor<1x256x1024xbf16>
-    %cst_60 = stablehlo.constant dense<2.000000e+00> : tensor<1x256x1024xbf16>
-    %cst_61 = stablehlo.constant dense<5.000000e-01> : tensor<1x256x1024xbf16>
-    %cst_62 = stablehlo.constant dense<-4.000000e+00> : tensor<1x256x1024xf32>
-    %cst_63 = stablehlo.constant dense<4.000000e+00> : tensor<1x256x1024xf32>
-    %cst_64 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x256x1024xf32>
-    %cst_65 = stablehlo.constant dense<2.77068146E-8> : tensor<1x256x1024xf32>
-    %cst_66 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x256x1024xf32>
-    %cst_67 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x256x1024xf32>
-    %cst_68 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x256x1024xf32>
-    %cst_69 = stablehlo.constant dense<-2.954600e-03> : tensor<1x256x1024xf32>
-    %cst_70 = stablehlo.constant dense<-0.0160960332> : tensor<1x256x1024xf32>
-    %cst_71 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x256x1024xf32>
-    %cst_72 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x256x1024xf32>
-    %cst_73 = stablehlo.constant dense<-0.00168282702> : tensor<1x256x1024xf32>
-    %cst_74 = stablehlo.constant dense<-0.00737332925> : tensor<1x256x1024xf32>
-    %cst_75 = stablehlo.constant dense<-0.0142647391> : tensor<1x256x1024xf32>
-    %cst_76 = stablehlo.constant dense<-1.000000e+00> : tensor<1x256x1024xf32>
-    %cst_77 = stablehlo.constant dense<1.000000e+00> : tensor<1x256x1024xf32>
-    %cst_78 = stablehlo.constant dense<0.000000e+00> : tensor<1x256x128x128xbf16>
-    %cst_79 = arith.constant dense<32> : tensor<1xi64>
-    %cst_80 = arith.constant dense<1.000000e-05> : tensor<1xf64>
-    %cst_81 = arith.constant dense<1> : tensor<1xi64>
-    %cst_82 = arith.constant dense<5.6568542494923806> : tensor<1xf64>
-    %cst_83 = arith.constant dense<64> : tensor<1xi64>
-    %cst_84 = arith.constant dense<160> : tensor<1xi64>
-    %cst_85 = arith.constant dense<256> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[3, 3], [3, 3]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x512x512xbf16>, tensor<32x3x7x7xbf16>) -> tensor<1x32x128x128xbf16>
-    %1 = stablehlo.reshape %arg2 : (tensor<32xbf16>) -> tensor<32x1x1xbf16>
-    %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2, 3] : (tensor<1x32x128x128xbf16>) -> tensor<1x32x128x128xbf16>
-    %3 = stablehlo.broadcast_in_dim %1, dims = [1, 2, 3] : (tensor<32x1x1xbf16>) -> tensor<1x32x128x128xbf16>
-    %4 = stablehlo.add %2, %3 : tensor<1x32x128x128xbf16>
-    %5 = stablehlo.reshape %4 : (tensor<1x32x128x128xbf16>) -> tensor<1x32x16384xbf16>
-    %6 = stablehlo.transpose %5, dims = [0, 2, 1] : (tensor<1x32x16384xbf16>) -> tensor<1x16384x32xbf16>
-    %7 = stablehlo.convert %6 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xf32>
-    %8 = stablehlo.convert %7 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf64>
-    %9 = stablehlo.reduce(%8 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %10 = stablehlo.reshape %9 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %11 = stablehlo.convert %cst_79 : (tensor<1xi64>) -> tensor<1xf64>
-    %12 = stablehlo.reshape %11 : (tensor<1xf64>) -> tensor<f64>
-    %13 = stablehlo.broadcast_in_dim %10, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %14 = stablehlo.broadcast_in_dim %12, dims = [] : (tensor<f64>) -> tensor<1x16384x1xf64>
-    %15 = stablehlo.divide %13, %14 : tensor<1x16384x1xf64>
-    %16 = stablehlo.broadcast_in_dim %8, dims = [0, 1, 2] : (tensor<1x16384x32xf64>) -> tensor<1x16384x32xf64>
-    %17 = stablehlo.broadcast_in_dim %15, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x32xf64>
-    %18 = stablehlo.subtract %16, %17 : tensor<1x16384x32xf64>
-    %19 = stablehlo.multiply %18, %18 : tensor<1x16384x32xf64>
-    %20 = stablehlo.reduce(%19 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %21 = stablehlo.reshape %20 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %22 = stablehlo.broadcast_in_dim %21, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %23 = stablehlo.divide %22, %14 : tensor<1x16384x1xf64>
-    %24 = stablehlo.convert %23 : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf32>
-    %25 = stablehlo.reduce(%7 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf32>, tensor<f32>) -> tensor<1x16384xf32>
-    %26 = stablehlo.reshape %25 : (tensor<1x16384xf32>) -> tensor<1x16384x1xf32>
-    %27 = stablehlo.convert %cst_79 : (tensor<1xi64>) -> tensor<1xf32>
-    %28 = stablehlo.reshape %27 : (tensor<1xf32>) -> tensor<f32>
-    %29 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %30 = stablehlo.broadcast_in_dim %28, dims = [] : (tensor<f32>) -> tensor<1x16384x1xf32>
-    %31 = stablehlo.divide %29, %30 : tensor<1x16384x1xf32>
-    %32 = stablehlo.convert %cst_80 : (tensor<1xf64>) -> tensor<1xf32>
-    %33 = stablehlo.reshape %32 : (tensor<1xf32>) -> tensor<f32>
-    %34 = stablehlo.broadcast_in_dim %24, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %35 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x16384x1xf32>
-    %36 = stablehlo.add %34, %35 : tensor<1x16384x1xf32>
-    %37 = stablehlo.rsqrt %36 : tensor<1x16384x1xf32>
-    %38 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %39 = stablehlo.broadcast_in_dim %31, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %40 = stablehlo.subtract %38, %39 : tensor<1x16384x32xf32>
-    %41 = stablehlo.broadcast_in_dim %40, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %42 = stablehlo.broadcast_in_dim %37, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %43 = stablehlo.multiply %41, %42 : tensor<1x16384x32xf32>
-    %44 = stablehlo.convert %arg3 : (tensor<32xbf16>) -> tensor<32xf32>
-    %45 = stablehlo.broadcast_in_dim %43, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %47 = stablehlo.multiply %45, %46 : tensor<1x16384x32xf32>
-    %48 = stablehlo.convert %arg4 : (tensor<32xbf16>) -> tensor<32xf32>
-    %49 = stablehlo.broadcast_in_dim %47, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %50 = stablehlo.broadcast_in_dim %48, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %51 = stablehlo.add %49, %50 : tensor<1x16384x32xf32>
-    %52 = stablehlo.convert %51 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xbf16>
-    %53 = stablehlo.convert %52 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xf32>
-    %54 = stablehlo.convert %53 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf64>
-    %55 = stablehlo.reduce(%54 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %56 = stablehlo.reshape %55 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %57 = stablehlo.broadcast_in_dim %56, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %58 = stablehlo.divide %57, %14 : tensor<1x16384x1xf64>
-    %59 = stablehlo.broadcast_in_dim %54, dims = [0, 1, 2] : (tensor<1x16384x32xf64>) -> tensor<1x16384x32xf64>
-    %60 = stablehlo.broadcast_in_dim %58, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x32xf64>
-    %61 = stablehlo.subtract %59, %60 : tensor<1x16384x32xf64>
-    %62 = stablehlo.multiply %61, %61 : tensor<1x16384x32xf64>
-    %63 = stablehlo.reduce(%62 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %64 = stablehlo.reshape %63 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %65 = stablehlo.broadcast_in_dim %64, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %66 = stablehlo.divide %65, %14 : tensor<1x16384x1xf64>
-    %67 = stablehlo.convert %66 : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf32>
-    %68 = stablehlo.reduce(%53 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf32>, tensor<f32>) -> tensor<1x16384xf32>
-    %69 = stablehlo.reshape %68 : (tensor<1x16384xf32>) -> tensor<1x16384x1xf32>
-    %70 = stablehlo.broadcast_in_dim %69, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %71 = stablehlo.divide %70, %30 : tensor<1x16384x1xf32>
-    %72 = stablehlo.broadcast_in_dim %67, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %73 = stablehlo.add %72, %35 : tensor<1x16384x1xf32>
-    %74 = stablehlo.rsqrt %73 : tensor<1x16384x1xf32>
-    %75 = stablehlo.broadcast_in_dim %53, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %76 = stablehlo.broadcast_in_dim %71, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %77 = stablehlo.subtract %75, %76 : tensor<1x16384x32xf32>
-    %78 = stablehlo.broadcast_in_dim %77, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %79 = stablehlo.broadcast_in_dim %74, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %80 = stablehlo.multiply %78, %79 : tensor<1x16384x32xf32>
-    %81 = stablehlo.convert %arg5 : (tensor<32xbf16>) -> tensor<32xf32>
-    %82 = stablehlo.broadcast_in_dim %80, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %83 = stablehlo.broadcast_in_dim %81, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %84 = stablehlo.multiply %82, %83 : tensor<1x16384x32xf32>
-    %85 = stablehlo.convert %arg6 : (tensor<32xbf16>) -> tensor<32xf32>
-    %86 = stablehlo.broadcast_in_dim %84, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %87 = stablehlo.broadcast_in_dim %85, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %88 = stablehlo.add %86, %87 : tensor<1x16384x32xf32>
-    %89 = stablehlo.convert %88 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xbf16>
-    %90 = stablehlo.reshape %89 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %91 = stablehlo.convert %90 : (tensor<16384x32xbf16>) -> tensor<16384x32xf32>
-    %92 = stablehlo.dot_general %91, %arg112, contracting_dims = [1] x [0] : (tensor<16384x32xf32>, tensor<32x32xf32>) -> tensor<16384x32xf32>
-    %93 = stablehlo.convert %cst_81 : (tensor<1xi64>) -> tensor<1xf32>
-    %94 = stablehlo.reshape %93 : (tensor<1xf32>) -> tensor<f32>
-    %95 = stablehlo.broadcast_in_dim %92, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %96 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<16384x32xf32>
-    %97 = stablehlo.multiply %95, %96 : tensor<16384x32xf32>
-    %98 = stablehlo.broadcast_in_dim %97, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %99 = stablehlo.broadcast_in_dim %arg113, dims = [1] : (tensor<32xf32>) -> tensor<16384x32xf32>
-    %100 = stablehlo.add %98, %99 : tensor<16384x32xf32>
-    %101 = stablehlo.convert %100 : (tensor<16384x32xf32>) -> tensor<16384x32xbf16>
-    %102 = stablehlo.reshape %101 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %103 = stablehlo.reshape %102 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x1x32xbf16>
-    %104 = stablehlo.transpose %103, dims = [0, 2, 1, 3] : (tensor<1x16384x1x32xbf16>) -> tensor<1x1x16384x32xbf16>
-    %105 = stablehlo.transpose %89, dims = [0, 2, 1] : (tensor<1x16384x32xbf16>) -> tensor<1x32x16384xbf16>
-    %106 = stablehlo.reshape %105 : (tensor<1x32x16384xbf16>) -> tensor<1x32x128x128xbf16>
-    %107 = stablehlo.convolution(%106, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [8, 8], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x128x128xbf16>, tensor<32x32x8x8xbf16>) -> tensor<1x32x16x16xbf16>
-    %108 = stablehlo.reshape %arg8 : (tensor<32xbf16>) -> tensor<32x1x1xbf16>
-    %109 = stablehlo.broadcast_in_dim %107, dims = [0, 1, 2, 3] : (tensor<1x32x16x16xbf16>) -> tensor<1x32x16x16xbf16>
-    %110 = stablehlo.broadcast_in_dim %108, dims = [1, 2, 3] : (tensor<32x1x1xbf16>) -> tensor<1x32x16x16xbf16>
-    %111 = stablehlo.add %109, %110 : tensor<1x32x16x16xbf16>
-    %112 = stablehlo.reshape %111 : (tensor<1x32x16x16xbf16>) -> tensor<1x32x256xbf16>
-    %113 = stablehlo.transpose %112, dims = [0, 2, 1] : (tensor<1x32x256xbf16>) -> tensor<1x256x32xbf16>
-    %114 = stablehlo.convert %113 : (tensor<1x256x32xbf16>) -> tensor<1x256x32xf32>
-    %115 = stablehlo.convert %114 : (tensor<1x256x32xf32>) -> tensor<1x256x32xf64>
-    %116 = stablehlo.reduce(%115 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x32xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %117 = stablehlo.reshape %116 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %118 = stablehlo.broadcast_in_dim %117, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %119 = stablehlo.broadcast_in_dim %12, dims = [] : (tensor<f64>) -> tensor<1x256x1xf64>
-    %120 = stablehlo.divide %118, %119 : tensor<1x256x1xf64>
-    %121 = stablehlo.broadcast_in_dim %115, dims = [0, 1, 2] : (tensor<1x256x32xf64>) -> tensor<1x256x32xf64>
-    %122 = stablehlo.broadcast_in_dim %120, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x32xf64>
-    %123 = stablehlo.subtract %121, %122 : tensor<1x256x32xf64>
-    %124 = stablehlo.multiply %123, %123 : tensor<1x256x32xf64>
-    %125 = stablehlo.reduce(%124 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x32xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %126 = stablehlo.reshape %125 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %127 = stablehlo.broadcast_in_dim %126, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %128 = stablehlo.divide %127, %119 : tensor<1x256x1xf64>
-    %129 = stablehlo.convert %128 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %130 = stablehlo.reduce(%114 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x32xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %131 = stablehlo.reshape %130 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %132 = stablehlo.broadcast_in_dim %131, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %133 = stablehlo.broadcast_in_dim %28, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %134 = stablehlo.divide %132, %133 : tensor<1x256x1xf32>
-    %135 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %136 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %137 = stablehlo.add %135, %136 : tensor<1x256x1xf32>
-    %138 = stablehlo.rsqrt %137 : tensor<1x256x1xf32>
-    %139 = stablehlo.broadcast_in_dim %114, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %140 = stablehlo.broadcast_in_dim %134, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x32xf32>
-    %141 = stablehlo.subtract %139, %140 : tensor<1x256x32xf32>
-    %142 = stablehlo.broadcast_in_dim %141, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %143 = stablehlo.broadcast_in_dim %138, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x32xf32>
-    %144 = stablehlo.multiply %142, %143 : tensor<1x256x32xf32>
-    %145 = stablehlo.convert %arg9 : (tensor<32xbf16>) -> tensor<32xf32>
-    %146 = stablehlo.broadcast_in_dim %144, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %147 = stablehlo.broadcast_in_dim %145, dims = [2] : (tensor<32xf32>) -> tensor<1x256x32xf32>
-    %148 = stablehlo.multiply %146, %147 : tensor<1x256x32xf32>
-    %149 = stablehlo.convert %arg10 : (tensor<32xbf16>) -> tensor<32xf32>
-    %150 = stablehlo.broadcast_in_dim %148, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %151 = stablehlo.broadcast_in_dim %149, dims = [2] : (tensor<32xf32>) -> tensor<1x256x32xf32>
-    %152 = stablehlo.add %150, %151 : tensor<1x256x32xf32>
-    %153 = stablehlo.convert %152 : (tensor<1x256x32xf32>) -> tensor<1x256x32xbf16>
-    %154 = stablehlo.reshape %153 : (tensor<1x256x32xbf16>) -> tensor<256x32xbf16>
-    %155 = stablehlo.convert %154 : (tensor<256x32xbf16>) -> tensor<256x32xf32>
-    %156 = stablehlo.dot_general %155, %arg114, contracting_dims = [1] x [0] : (tensor<256x32xf32>, tensor<32x32xf32>) -> tensor<256x32xf32>
-    %157 = stablehlo.broadcast_in_dim %156, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %158 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<256x32xf32>
-    %159 = stablehlo.multiply %157, %158 : tensor<256x32xf32>
-    %160 = stablehlo.broadcast_in_dim %159, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %161 = stablehlo.broadcast_in_dim %arg115, dims = [1] : (tensor<32xf32>) -> tensor<256x32xf32>
-    %162 = stablehlo.add %160, %161 : tensor<256x32xf32>
-    %163 = stablehlo.convert %162 : (tensor<256x32xf32>) -> tensor<256x32xbf16>
-    %164 = stablehlo.reshape %163 : (tensor<256x32xbf16>) -> tensor<1x256x32xbf16>
-    %165 = stablehlo.reshape %164 : (tensor<1x256x32xbf16>) -> tensor<1x256x1x32xbf16>
-    %166 = stablehlo.transpose %165, dims = [0, 2, 1, 3] : (tensor<1x256x1x32xbf16>) -> tensor<1x1x256x32xbf16>
-    %167 = stablehlo.dot_general %155, %arg116, contracting_dims = [1] x [0] : (tensor<256x32xf32>, tensor<32x32xf32>) -> tensor<256x32xf32>
-    %168 = stablehlo.broadcast_in_dim %167, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %169 = stablehlo.multiply %168, %158 : tensor<256x32xf32>
-    %170 = stablehlo.broadcast_in_dim %169, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %171 = stablehlo.broadcast_in_dim %arg117, dims = [1] : (tensor<32xf32>) -> tensor<256x32xf32>
-    %172 = stablehlo.add %170, %171 : tensor<256x32xf32>
-    %173 = stablehlo.convert %172 : (tensor<256x32xf32>) -> tensor<256x32xbf16>
-    %174 = stablehlo.reshape %173 : (tensor<256x32xbf16>) -> tensor<1x256x32xbf16>
-    %175 = stablehlo.reshape %174 : (tensor<1x256x32xbf16>) -> tensor<1x256x1x32xbf16>
-    %176 = stablehlo.transpose %175, dims = [0, 2, 1, 3] : (tensor<1x256x1x32xbf16>) -> tensor<1x1x256x32xbf16>
-    %177 = stablehlo.transpose %166, dims = [0, 1, 3, 2] : (tensor<1x1x256x32xbf16>) -> tensor<1x1x32x256xbf16>
-    %178 = stablehlo.reshape %104 : (tensor<1x1x16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %179 = stablehlo.reshape %177 : (tensor<1x1x32x256xbf16>) -> tensor<1x32x256xbf16>
-    %180 = stablehlo.broadcast_in_dim %179, dims = [0, 1, 2] : (tensor<1x32x256xbf16>) -> tensor<1x32x256xbf16>
-    %181 = stablehlo.dot_general %178, %180, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x16384x32xbf16>, tensor<1x32x256xbf16>) -> tensor<1x16384x256xbf16>
-    %182 = stablehlo.reshape %181 : (tensor<1x16384x256xbf16>) -> tensor<1x1x16384x256xbf16>
-    %183 = stablehlo.convert %cst_82 : (tensor<1xf64>) -> tensor<1xbf16>
-    %184 = stablehlo.reshape %183 : (tensor<1xbf16>) -> tensor<bf16>
-    %185 = stablehlo.broadcast_in_dim %182, dims = [0, 1, 2, 3] : (tensor<1x1x16384x256xbf16>) -> tensor<1x1x16384x256xbf16>
-    %186 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x1x16384x256xbf16>
-    %187 = stablehlo.divide %185, %186 : tensor<1x1x16384x256xbf16>
-    %188 = stablehlo.convert %187 : (tensor<1x1x16384x256xbf16>) -> tensor<1x1x16384x256xf32>
-    %189 = stablehlo.reduce(%188 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x1x16384x256xf32>, tensor<f32>) -> tensor<1x1x16384xf32>
-    %190 = stablehlo.reshape %189 : (tensor<1x1x16384xf32>) -> tensor<1x1x16384x1xf32>
-    %191 = stablehlo.broadcast_in_dim %188, dims = [0, 1, 2, 3] : (tensor<1x1x16384x256xf32>) -> tensor<1x1x16384x256xf32>
-    %192 = stablehlo.broadcast_in_dim %190, dims = [0, 1, 2, 3] : (tensor<1x1x16384x1xf32>) -> tensor<1x1x16384x256xf32>
-    %193 = stablehlo.subtract %191, %192 : tensor<1x1x16384x256xf32>
-    %194 = stablehlo.exponential %193 : tensor<1x1x16384x256xf32>
-    %195 = stablehlo.reduce(%194 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x1x16384x256xf32>, tensor<f32>) -> tensor<1x1x16384xf32>
-    %196 = stablehlo.reshape %195 : (tensor<1x1x16384xf32>) -> tensor<1x1x16384x1xf32>
-    %197 = stablehlo.broadcast_in_dim %194, dims = [0, 1, 2, 3] : (tensor<1x1x16384x256xf32>) -> tensor<1x1x16384x256xf32>
-    %198 = stablehlo.broadcast_in_dim %196, dims = [0, 1, 2, 3] : (tensor<1x1x16384x1xf32>) -> tensor<1x1x16384x256xf32>
-    %199 = stablehlo.divide %197, %198 : tensor<1x1x16384x256xf32>
-    %200 = stablehlo.convert %199 : (tensor<1x1x16384x256xf32>) -> tensor<1x1x16384x256xbf16>
-    %201 = stablehlo.reshape %200 : (tensor<1x1x16384x256xbf16>) -> tensor<1x16384x256xbf16>
-    %202 = stablehlo.reshape %176 : (tensor<1x1x256x32xbf16>) -> tensor<1x256x32xbf16>
-    %203 = stablehlo.broadcast_in_dim %202, dims = [0, 1, 2] : (tensor<1x256x32xbf16>) -> tensor<1x256x32xbf16>
-    %204 = stablehlo.dot_general %201, %203, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x16384x256xbf16>, tensor<1x256x32xbf16>) -> tensor<1x16384x32xbf16>
-    %205 = stablehlo.reshape %204 : (tensor<1x16384x32xbf16>) -> tensor<1x1x16384x32xbf16>
-    %206 = stablehlo.transpose %205, dims = [0, 2, 1, 3] : (tensor<1x1x16384x32xbf16>) -> tensor<1x16384x1x32xbf16>
-    %207 = stablehlo.reshape %206 : (tensor<1x16384x1x32xbf16>) -> tensor<1x16384x32xbf16>
-    %208 = stablehlo.reshape %207 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %209 = stablehlo.convert %208 : (tensor<16384x32xbf16>) -> tensor<16384x32xf32>
-    %210 = stablehlo.dot_general %209, %arg118, contracting_dims = [1] x [0] : (tensor<16384x32xf32>, tensor<32x32xf32>) -> tensor<16384x32xf32>
-    %211 = stablehlo.broadcast_in_dim %210, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %212 = stablehlo.multiply %211, %96 : tensor<16384x32xf32>
-    %213 = stablehlo.broadcast_in_dim %212, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %214 = stablehlo.broadcast_in_dim %arg119, dims = [1] : (tensor<32xf32>) -> tensor<16384x32xf32>
-    %215 = stablehlo.add %213, %214 : tensor<16384x32xf32>
-    %216 = stablehlo.convert %215 : (tensor<16384x32xf32>) -> tensor<16384x32xbf16>
-    %217 = stablehlo.reshape %216 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %218 = stablehlo.add %217, %52 : tensor<1x16384x32xbf16>
-    %219 = stablehlo.convert %218 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xf32>
-    %220 = stablehlo.convert %219 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf64>
-    %221 = stablehlo.reduce(%220 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %222 = stablehlo.reshape %221 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %223 = stablehlo.broadcast_in_dim %222, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %224 = stablehlo.divide %223, %14 : tensor<1x16384x1xf64>
-    %225 = stablehlo.broadcast_in_dim %220, dims = [0, 1, 2] : (tensor<1x16384x32xf64>) -> tensor<1x16384x32xf64>
-    %226 = stablehlo.broadcast_in_dim %224, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x32xf64>
-    %227 = stablehlo.subtract %225, %226 : tensor<1x16384x32xf64>
-    %228 = stablehlo.multiply %227, %227 : tensor<1x16384x32xf64>
-    %229 = stablehlo.reduce(%228 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %230 = stablehlo.reshape %229 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %231 = stablehlo.broadcast_in_dim %230, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %232 = stablehlo.divide %231, %14 : tensor<1x16384x1xf64>
-    %233 = stablehlo.convert %232 : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf32>
-    %234 = stablehlo.reduce(%219 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf32>, tensor<f32>) -> tensor<1x16384xf32>
-    %235 = stablehlo.reshape %234 : (tensor<1x16384xf32>) -> tensor<1x16384x1xf32>
-    %236 = stablehlo.broadcast_in_dim %235, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %237 = stablehlo.divide %236, %30 : tensor<1x16384x1xf32>
-    %238 = stablehlo.broadcast_in_dim %233, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %239 = stablehlo.add %238, %35 : tensor<1x16384x1xf32>
-    %240 = stablehlo.rsqrt %239 : tensor<1x16384x1xf32>
-    %241 = stablehlo.broadcast_in_dim %219, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %242 = stablehlo.broadcast_in_dim %237, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %243 = stablehlo.subtract %241, %242 : tensor<1x16384x32xf32>
-    %244 = stablehlo.broadcast_in_dim %243, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %245 = stablehlo.broadcast_in_dim %240, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %246 = stablehlo.multiply %244, %245 : tensor<1x16384x32xf32>
-    %247 = stablehlo.convert %arg11 : (tensor<32xbf16>) -> tensor<32xf32>
-    %248 = stablehlo.broadcast_in_dim %246, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %249 = stablehlo.broadcast_in_dim %247, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %250 = stablehlo.multiply %248, %249 : tensor<1x16384x32xf32>
-    %251 = stablehlo.convert %arg12 : (tensor<32xbf16>) -> tensor<32xf32>
-    %252 = stablehlo.broadcast_in_dim %250, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %253 = stablehlo.broadcast_in_dim %251, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %254 = stablehlo.add %252, %253 : tensor<1x16384x32xf32>
-    %255 = stablehlo.convert %254 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xbf16>
-    %256 = stablehlo.reshape %255 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %257 = stablehlo.convert %256 : (tensor<16384x32xbf16>) -> tensor<16384x32xf32>
-    %258 = stablehlo.dot_general %257, %arg120, contracting_dims = [1] x [0] : (tensor<16384x32xf32>, tensor<32x128xf32>) -> tensor<16384x128xf32>
-    %259 = stablehlo.broadcast_in_dim %258, dims = [0, 1] : (tensor<16384x128xf32>) -> tensor<16384x128xf32>
-    %260 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<16384x128xf32>
-    %261 = stablehlo.multiply %259, %260 : tensor<16384x128xf32>
-    %262 = stablehlo.broadcast_in_dim %261, dims = [0, 1] : (tensor<16384x128xf32>) -> tensor<16384x128xf32>
-    %263 = stablehlo.broadcast_in_dim %arg121, dims = [1] : (tensor<128xf32>) -> tensor<16384x128xf32>
-    %264 = stablehlo.add %262, %263 : tensor<16384x128xf32>
-    %265 = stablehlo.convert %264 : (tensor<16384x128xf32>) -> tensor<16384x128xbf16>
-    %266 = stablehlo.reshape %265 : (tensor<16384x128xbf16>) -> tensor<1x16384x128xbf16>
-    %267 = stablehlo.transpose %266, dims = [0, 2, 1] : (tensor<1x16384x128xbf16>) -> tensor<1x128x16384xbf16>
-    %268 = stablehlo.reshape %267 : (tensor<1x128x16384xbf16>) -> tensor<1x128x128x128xbf16>
-    %269 = stablehlo.convolution(%268, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x128x128xbf16>, tensor<128x1x3x3xbf16>) -> tensor<1x128x128x128xbf16>
-    %270 = stablehlo.reshape %arg14 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %271 = stablehlo.broadcast_in_dim %269, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %272 = stablehlo.broadcast_in_dim %270, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x128x128xbf16>
-    %273 = stablehlo.add %271, %272 : tensor<1x128x128x128xbf16>
-    %274 = stablehlo.reshape %273 : (tensor<1x128x128x128xbf16>) -> tensor<1x128x16384xbf16>
-    %275 = stablehlo.transpose %274, dims = [0, 2, 1] : (tensor<1x128x16384xbf16>) -> tensor<1x16384x128xbf16>
-    %276 = stablehlo.multiply %275, %cst_4 : tensor<1x16384x128xbf16>
-    %277 = stablehlo.rsqrt %cst_3 : tensor<1x16384x128xbf16>
-    %278 = stablehlo.multiply %275, %277 : tensor<1x16384x128xbf16>
-    %279 = stablehlo.convert %278 : (tensor<1x16384x128xbf16>) -> tensor<1x16384x128xf32>
-    %280 = stablehlo.clamp %cst_5, %279, %cst_6 : tensor<1x16384x128xf32>
-    %281 = stablehlo.multiply %280, %280 : tensor<1x16384x128xf32>
-    %282 = stablehlo.multiply %cst_7, %281 : tensor<1x16384x128xf32>
-    %283 = stablehlo.add %282, %cst_8 : tensor<1x16384x128xf32>
-    %284 = stablehlo.multiply %283, %281 : tensor<1x16384x128xf32>
-    %285 = stablehlo.add %284, %cst_9 : tensor<1x16384x128xf32>
-    %286 = stablehlo.multiply %285, %281 : tensor<1x16384x128xf32>
-    %287 = stablehlo.add %286, %cst_10 : tensor<1x16384x128xf32>
-    %288 = stablehlo.multiply %287, %281 : tensor<1x16384x128xf32>
-    %289 = stablehlo.add %288, %cst_11 : tensor<1x16384x128xf32>
-    %290 = stablehlo.multiply %289, %281 : tensor<1x16384x128xf32>
-    %291 = stablehlo.add %290, %cst_12 : tensor<1x16384x128xf32>
-    %292 = stablehlo.multiply %291, %281 : tensor<1x16384x128xf32>
-    %293 = stablehlo.add %292, %cst_13 : tensor<1x16384x128xf32>
-    %294 = stablehlo.multiply %cst_14, %281 : tensor<1x16384x128xf32>
-    %295 = stablehlo.add %294, %cst_15 : tensor<1x16384x128xf32>
-    %296 = stablehlo.multiply %295, %281 : tensor<1x16384x128xf32>
-    %297 = stablehlo.add %296, %cst_16 : tensor<1x16384x128xf32>
-    %298 = stablehlo.multiply %297, %281 : tensor<1x16384x128xf32>
-    %299 = stablehlo.add %298, %cst_17 : tensor<1x16384x128xf32>
-    %300 = stablehlo.multiply %299, %281 : tensor<1x16384x128xf32>
-    %301 = stablehlo.add %300, %cst_18 : tensor<1x16384x128xf32>
-    %302 = stablehlo.multiply %280, %293 : tensor<1x16384x128xf32>
-    %303 = stablehlo.divide %302, %301 : tensor<1x16384x128xf32>
-    %304 = stablehlo.clamp %cst_19, %303, %cst_20 : tensor<1x16384x128xf32>
-    %305 = stablehlo.convert %304 : (tensor<1x16384x128xf32>) -> tensor<1x16384x128xbf16>
-    %306 = stablehlo.add %305, %cst_2 : tensor<1x16384x128xbf16>
-    %307 = stablehlo.multiply %306, %276 : tensor<1x16384x128xbf16>
-    %308 = stablehlo.reshape %307 : (tensor<1x16384x128xbf16>) -> tensor<16384x128xbf16>
-    %309 = stablehlo.dot_general %308, %arg122, contracting_dims = [1] x [0] : (tensor<16384x128xbf16>, tensor<128x32xbf16>) -> tensor<16384x32xbf16>
-    %310 = stablehlo.reshape %309 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %311 = stablehlo.broadcast_in_dim %310, dims = [0, 1, 2] : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %312 = stablehlo.broadcast_in_dim %arg15, dims = [2] : (tensor<32xbf16>) -> tensor<1x16384x32xbf16>
-    %313 = stablehlo.add %311, %312 : tensor<1x16384x32xbf16>
-    %314 = stablehlo.reshape %313 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %315 = stablehlo.reshape %314 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %316 = stablehlo.add %315, %218 : tensor<1x16384x32xbf16>
-    %317 = stablehlo.convert %316 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xf32>
-    %318 = stablehlo.convert %317 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf64>
-    %319 = stablehlo.reduce(%318 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %320 = stablehlo.reshape %319 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %321 = stablehlo.broadcast_in_dim %320, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %322 = stablehlo.divide %321, %14 : tensor<1x16384x1xf64>
-    %323 = stablehlo.broadcast_in_dim %318, dims = [0, 1, 2] : (tensor<1x16384x32xf64>) -> tensor<1x16384x32xf64>
-    %324 = stablehlo.broadcast_in_dim %322, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x32xf64>
-    %325 = stablehlo.subtract %323, %324 : tensor<1x16384x32xf64>
-    %326 = stablehlo.multiply %325, %325 : tensor<1x16384x32xf64>
-    %327 = stablehlo.reduce(%326 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %328 = stablehlo.reshape %327 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %329 = stablehlo.broadcast_in_dim %328, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %330 = stablehlo.divide %329, %14 : tensor<1x16384x1xf64>
-    %331 = stablehlo.convert %330 : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf32>
-    %332 = stablehlo.reduce(%317 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf32>, tensor<f32>) -> tensor<1x16384xf32>
-    %333 = stablehlo.reshape %332 : (tensor<1x16384xf32>) -> tensor<1x16384x1xf32>
-    %334 = stablehlo.broadcast_in_dim %333, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %335 = stablehlo.divide %334, %30 : tensor<1x16384x1xf32>
-    %336 = stablehlo.broadcast_in_dim %331, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %337 = stablehlo.add %336, %35 : tensor<1x16384x1xf32>
-    %338 = stablehlo.rsqrt %337 : tensor<1x16384x1xf32>
-    %339 = stablehlo.broadcast_in_dim %317, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %340 = stablehlo.broadcast_in_dim %335, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %341 = stablehlo.subtract %339, %340 : tensor<1x16384x32xf32>
-    %342 = stablehlo.broadcast_in_dim %341, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %343 = stablehlo.broadcast_in_dim %338, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %344 = stablehlo.multiply %342, %343 : tensor<1x16384x32xf32>
-    %345 = stablehlo.convert %arg16 : (tensor<32xbf16>) -> tensor<32xf32>
-    %346 = stablehlo.broadcast_in_dim %344, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %347 = stablehlo.broadcast_in_dim %345, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %348 = stablehlo.multiply %346, %347 : tensor<1x16384x32xf32>
-    %349 = stablehlo.convert %arg17 : (tensor<32xbf16>) -> tensor<32xf32>
-    %350 = stablehlo.broadcast_in_dim %348, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %351 = stablehlo.broadcast_in_dim %349, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %352 = stablehlo.add %350, %351 : tensor<1x16384x32xf32>
-    %353 = stablehlo.convert %352 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xbf16>
-    %354 = stablehlo.reshape %353 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %355 = stablehlo.convert %354 : (tensor<16384x32xbf16>) -> tensor<16384x32xf32>
-    %356 = stablehlo.dot_general %355, %arg123, contracting_dims = [1] x [0] : (tensor<16384x32xf32>, tensor<32x32xf32>) -> tensor<16384x32xf32>
-    %357 = stablehlo.broadcast_in_dim %356, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %358 = stablehlo.multiply %357, %96 : tensor<16384x32xf32>
-    %359 = stablehlo.broadcast_in_dim %358, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %360 = stablehlo.broadcast_in_dim %arg124, dims = [1] : (tensor<32xf32>) -> tensor<16384x32xf32>
-    %361 = stablehlo.add %359, %360 : tensor<16384x32xf32>
-    %362 = stablehlo.convert %361 : (tensor<16384x32xf32>) -> tensor<16384x32xbf16>
-    %363 = stablehlo.reshape %362 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %364 = stablehlo.reshape %363 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x1x32xbf16>
-    %365 = stablehlo.transpose %364, dims = [0, 2, 1, 3] : (tensor<1x16384x1x32xbf16>) -> tensor<1x1x16384x32xbf16>
-    %366 = stablehlo.transpose %353, dims = [0, 2, 1] : (tensor<1x16384x32xbf16>) -> tensor<1x32x16384xbf16>
-    %367 = stablehlo.reshape %366 : (tensor<1x32x16384xbf16>) -> tensor<1x32x128x128xbf16>
-    %368 = stablehlo.convolution(%367, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [8, 8], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x128x128xbf16>, tensor<32x32x8x8xbf16>) -> tensor<1x32x16x16xbf16>
-    %369 = stablehlo.reshape %arg19 : (tensor<32xbf16>) -> tensor<32x1x1xbf16>
-    %370 = stablehlo.broadcast_in_dim %368, dims = [0, 1, 2, 3] : (tensor<1x32x16x16xbf16>) -> tensor<1x32x16x16xbf16>
-    %371 = stablehlo.broadcast_in_dim %369, dims = [1, 2, 3] : (tensor<32x1x1xbf16>) -> tensor<1x32x16x16xbf16>
-    %372 = stablehlo.add %370, %371 : tensor<1x32x16x16xbf16>
-    %373 = stablehlo.reshape %372 : (tensor<1x32x16x16xbf16>) -> tensor<1x32x256xbf16>
-    %374 = stablehlo.transpose %373, dims = [0, 2, 1] : (tensor<1x32x256xbf16>) -> tensor<1x256x32xbf16>
-    %375 = stablehlo.convert %374 : (tensor<1x256x32xbf16>) -> tensor<1x256x32xf32>
-    %376 = stablehlo.convert %375 : (tensor<1x256x32xf32>) -> tensor<1x256x32xf64>
-    %377 = stablehlo.reduce(%376 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x32xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %378 = stablehlo.reshape %377 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %379 = stablehlo.broadcast_in_dim %378, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %380 = stablehlo.divide %379, %119 : tensor<1x256x1xf64>
-    %381 = stablehlo.broadcast_in_dim %376, dims = [0, 1, 2] : (tensor<1x256x32xf64>) -> tensor<1x256x32xf64>
-    %382 = stablehlo.broadcast_in_dim %380, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x32xf64>
-    %383 = stablehlo.subtract %381, %382 : tensor<1x256x32xf64>
-    %384 = stablehlo.multiply %383, %383 : tensor<1x256x32xf64>
-    %385 = stablehlo.reduce(%384 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x32xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %386 = stablehlo.reshape %385 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %387 = stablehlo.broadcast_in_dim %386, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %388 = stablehlo.divide %387, %119 : tensor<1x256x1xf64>
-    %389 = stablehlo.convert %388 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %390 = stablehlo.reduce(%375 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x32xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %391 = stablehlo.reshape %390 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %392 = stablehlo.broadcast_in_dim %391, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %393 = stablehlo.divide %392, %133 : tensor<1x256x1xf32>
-    %394 = stablehlo.broadcast_in_dim %389, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %395 = stablehlo.add %394, %136 : tensor<1x256x1xf32>
-    %396 = stablehlo.rsqrt %395 : tensor<1x256x1xf32>
-    %397 = stablehlo.broadcast_in_dim %375, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %398 = stablehlo.broadcast_in_dim %393, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x32xf32>
-    %399 = stablehlo.subtract %397, %398 : tensor<1x256x32xf32>
-    %400 = stablehlo.broadcast_in_dim %399, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %401 = stablehlo.broadcast_in_dim %396, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x32xf32>
-    %402 = stablehlo.multiply %400, %401 : tensor<1x256x32xf32>
-    %403 = stablehlo.convert %arg20 : (tensor<32xbf16>) -> tensor<32xf32>
-    %404 = stablehlo.broadcast_in_dim %402, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %405 = stablehlo.broadcast_in_dim %403, dims = [2] : (tensor<32xf32>) -> tensor<1x256x32xf32>
-    %406 = stablehlo.multiply %404, %405 : tensor<1x256x32xf32>
-    %407 = stablehlo.convert %arg21 : (tensor<32xbf16>) -> tensor<32xf32>
-    %408 = stablehlo.broadcast_in_dim %406, dims = [0, 1, 2] : (tensor<1x256x32xf32>) -> tensor<1x256x32xf32>
-    %409 = stablehlo.broadcast_in_dim %407, dims = [2] : (tensor<32xf32>) -> tensor<1x256x32xf32>
-    %410 = stablehlo.add %408, %409 : tensor<1x256x32xf32>
-    %411 = stablehlo.convert %410 : (tensor<1x256x32xf32>) -> tensor<1x256x32xbf16>
-    %412 = stablehlo.reshape %411 : (tensor<1x256x32xbf16>) -> tensor<256x32xbf16>
-    %413 = stablehlo.convert %412 : (tensor<256x32xbf16>) -> tensor<256x32xf32>
-    %414 = stablehlo.dot_general %413, %arg125, contracting_dims = [1] x [0] : (tensor<256x32xf32>, tensor<32x32xf32>) -> tensor<256x32xf32>
-    %415 = stablehlo.broadcast_in_dim %414, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %416 = stablehlo.multiply %415, %158 : tensor<256x32xf32>
-    %417 = stablehlo.broadcast_in_dim %416, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %418 = stablehlo.broadcast_in_dim %arg126, dims = [1] : (tensor<32xf32>) -> tensor<256x32xf32>
-    %419 = stablehlo.add %417, %418 : tensor<256x32xf32>
-    %420 = stablehlo.convert %419 : (tensor<256x32xf32>) -> tensor<256x32xbf16>
-    %421 = stablehlo.reshape %420 : (tensor<256x32xbf16>) -> tensor<1x256x32xbf16>
-    %422 = stablehlo.reshape %421 : (tensor<1x256x32xbf16>) -> tensor<1x256x1x32xbf16>
-    %423 = stablehlo.transpose %422, dims = [0, 2, 1, 3] : (tensor<1x256x1x32xbf16>) -> tensor<1x1x256x32xbf16>
-    %424 = stablehlo.dot_general %413, %arg127, contracting_dims = [1] x [0] : (tensor<256x32xf32>, tensor<32x32xf32>) -> tensor<256x32xf32>
-    %425 = stablehlo.broadcast_in_dim %424, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %426 = stablehlo.multiply %425, %158 : tensor<256x32xf32>
-    %427 = stablehlo.broadcast_in_dim %426, dims = [0, 1] : (tensor<256x32xf32>) -> tensor<256x32xf32>
-    %428 = stablehlo.broadcast_in_dim %arg128, dims = [1] : (tensor<32xf32>) -> tensor<256x32xf32>
-    %429 = stablehlo.add %427, %428 : tensor<256x32xf32>
-    %430 = stablehlo.convert %429 : (tensor<256x32xf32>) -> tensor<256x32xbf16>
-    %431 = stablehlo.reshape %430 : (tensor<256x32xbf16>) -> tensor<1x256x32xbf16>
-    %432 = stablehlo.reshape %431 : (tensor<1x256x32xbf16>) -> tensor<1x256x1x32xbf16>
-    %433 = stablehlo.transpose %432, dims = [0, 2, 1, 3] : (tensor<1x256x1x32xbf16>) -> tensor<1x1x256x32xbf16>
-    %434 = stablehlo.transpose %423, dims = [0, 1, 3, 2] : (tensor<1x1x256x32xbf16>) -> tensor<1x1x32x256xbf16>
-    %435 = stablehlo.reshape %365 : (tensor<1x1x16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %436 = stablehlo.reshape %434 : (tensor<1x1x32x256xbf16>) -> tensor<1x32x256xbf16>
-    %437 = stablehlo.broadcast_in_dim %436, dims = [0, 1, 2] : (tensor<1x32x256xbf16>) -> tensor<1x32x256xbf16>
-    %438 = stablehlo.dot_general %435, %437, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x16384x32xbf16>, tensor<1x32x256xbf16>) -> tensor<1x16384x256xbf16>
-    %439 = stablehlo.reshape %438 : (tensor<1x16384x256xbf16>) -> tensor<1x1x16384x256xbf16>
-    %440 = stablehlo.broadcast_in_dim %439, dims = [0, 1, 2, 3] : (tensor<1x1x16384x256xbf16>) -> tensor<1x1x16384x256xbf16>
-    %441 = stablehlo.divide %440, %186 : tensor<1x1x16384x256xbf16>
-    %442 = stablehlo.convert %441 : (tensor<1x1x16384x256xbf16>) -> tensor<1x1x16384x256xf32>
-    %443 = stablehlo.reduce(%442 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x1x16384x256xf32>, tensor<f32>) -> tensor<1x1x16384xf32>
-    %444 = stablehlo.reshape %443 : (tensor<1x1x16384xf32>) -> tensor<1x1x16384x1xf32>
-    %445 = stablehlo.broadcast_in_dim %442, dims = [0, 1, 2, 3] : (tensor<1x1x16384x256xf32>) -> tensor<1x1x16384x256xf32>
-    %446 = stablehlo.broadcast_in_dim %444, dims = [0, 1, 2, 3] : (tensor<1x1x16384x1xf32>) -> tensor<1x1x16384x256xf32>
-    %447 = stablehlo.subtract %445, %446 : tensor<1x1x16384x256xf32>
-    %448 = stablehlo.exponential %447 : tensor<1x1x16384x256xf32>
-    %449 = stablehlo.reduce(%448 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x1x16384x256xf32>, tensor<f32>) -> tensor<1x1x16384xf32>
-    %450 = stablehlo.reshape %449 : (tensor<1x1x16384xf32>) -> tensor<1x1x16384x1xf32>
-    %451 = stablehlo.broadcast_in_dim %448, dims = [0, 1, 2, 3] : (tensor<1x1x16384x256xf32>) -> tensor<1x1x16384x256xf32>
-    %452 = stablehlo.broadcast_in_dim %450, dims = [0, 1, 2, 3] : (tensor<1x1x16384x1xf32>) -> tensor<1x1x16384x256xf32>
-    %453 = stablehlo.divide %451, %452 : tensor<1x1x16384x256xf32>
-    %454 = stablehlo.convert %453 : (tensor<1x1x16384x256xf32>) -> tensor<1x1x16384x256xbf16>
-    %455 = stablehlo.reshape %454 : (tensor<1x1x16384x256xbf16>) -> tensor<1x16384x256xbf16>
-    %456 = stablehlo.reshape %433 : (tensor<1x1x256x32xbf16>) -> tensor<1x256x32xbf16>
-    %457 = stablehlo.broadcast_in_dim %456, dims = [0, 1, 2] : (tensor<1x256x32xbf16>) -> tensor<1x256x32xbf16>
-    %458 = stablehlo.dot_general %455, %457, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x16384x256xbf16>, tensor<1x256x32xbf16>) -> tensor<1x16384x32xbf16>
-    %459 = stablehlo.reshape %458 : (tensor<1x16384x32xbf16>) -> tensor<1x1x16384x32xbf16>
-    %460 = stablehlo.transpose %459, dims = [0, 2, 1, 3] : (tensor<1x1x16384x32xbf16>) -> tensor<1x16384x1x32xbf16>
-    %461 = stablehlo.reshape %460 : (tensor<1x16384x1x32xbf16>) -> tensor<1x16384x32xbf16>
-    %462 = stablehlo.reshape %461 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %463 = stablehlo.convert %462 : (tensor<16384x32xbf16>) -> tensor<16384x32xf32>
-    %464 = stablehlo.dot_general %463, %arg129, contracting_dims = [1] x [0] : (tensor<16384x32xf32>, tensor<32x32xf32>) -> tensor<16384x32xf32>
-    %465 = stablehlo.broadcast_in_dim %464, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %466 = stablehlo.multiply %465, %96 : tensor<16384x32xf32>
-    %467 = stablehlo.broadcast_in_dim %466, dims = [0, 1] : (tensor<16384x32xf32>) -> tensor<16384x32xf32>
-    %468 = stablehlo.broadcast_in_dim %arg130, dims = [1] : (tensor<32xf32>) -> tensor<16384x32xf32>
-    %469 = stablehlo.add %467, %468 : tensor<16384x32xf32>
-    %470 = stablehlo.convert %469 : (tensor<16384x32xf32>) -> tensor<16384x32xbf16>
-    %471 = stablehlo.reshape %470 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %472 = stablehlo.add %471, %316 : tensor<1x16384x32xbf16>
-    %473 = stablehlo.convert %472 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xf32>
-    %474 = stablehlo.convert %473 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf64>
-    %475 = stablehlo.reduce(%474 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %476 = stablehlo.reshape %475 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %477 = stablehlo.broadcast_in_dim %476, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %478 = stablehlo.divide %477, %14 : tensor<1x16384x1xf64>
-    %479 = stablehlo.broadcast_in_dim %474, dims = [0, 1, 2] : (tensor<1x16384x32xf64>) -> tensor<1x16384x32xf64>
-    %480 = stablehlo.broadcast_in_dim %478, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x32xf64>
-    %481 = stablehlo.subtract %479, %480 : tensor<1x16384x32xf64>
-    %482 = stablehlo.multiply %481, %481 : tensor<1x16384x32xf64>
-    %483 = stablehlo.reduce(%482 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %484 = stablehlo.reshape %483 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %485 = stablehlo.broadcast_in_dim %484, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %486 = stablehlo.divide %485, %14 : tensor<1x16384x1xf64>
-    %487 = stablehlo.convert %486 : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf32>
-    %488 = stablehlo.reduce(%473 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf32>, tensor<f32>) -> tensor<1x16384xf32>
-    %489 = stablehlo.reshape %488 : (tensor<1x16384xf32>) -> tensor<1x16384x1xf32>
-    %490 = stablehlo.broadcast_in_dim %489, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %491 = stablehlo.divide %490, %30 : tensor<1x16384x1xf32>
-    %492 = stablehlo.broadcast_in_dim %487, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %493 = stablehlo.add %492, %35 : tensor<1x16384x1xf32>
-    %494 = stablehlo.rsqrt %493 : tensor<1x16384x1xf32>
-    %495 = stablehlo.broadcast_in_dim %473, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %496 = stablehlo.broadcast_in_dim %491, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %497 = stablehlo.subtract %495, %496 : tensor<1x16384x32xf32>
-    %498 = stablehlo.broadcast_in_dim %497, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %499 = stablehlo.broadcast_in_dim %494, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %500 = stablehlo.multiply %498, %499 : tensor<1x16384x32xf32>
-    %501 = stablehlo.convert %arg22 : (tensor<32xbf16>) -> tensor<32xf32>
-    %502 = stablehlo.broadcast_in_dim %500, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %503 = stablehlo.broadcast_in_dim %501, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %504 = stablehlo.multiply %502, %503 : tensor<1x16384x32xf32>
-    %505 = stablehlo.convert %arg23 : (tensor<32xbf16>) -> tensor<32xf32>
-    %506 = stablehlo.broadcast_in_dim %504, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %507 = stablehlo.broadcast_in_dim %505, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %508 = stablehlo.add %506, %507 : tensor<1x16384x32xf32>
-    %509 = stablehlo.convert %508 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xbf16>
-    %510 = stablehlo.reshape %509 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %511 = stablehlo.convert %510 : (tensor<16384x32xbf16>) -> tensor<16384x32xf32>
-    %512 = stablehlo.dot_general %511, %arg131, contracting_dims = [1] x [0] : (tensor<16384x32xf32>, tensor<32x128xf32>) -> tensor<16384x128xf32>
-    %513 = stablehlo.broadcast_in_dim %512, dims = [0, 1] : (tensor<16384x128xf32>) -> tensor<16384x128xf32>
-    %514 = stablehlo.multiply %513, %260 : tensor<16384x128xf32>
-    %515 = stablehlo.broadcast_in_dim %514, dims = [0, 1] : (tensor<16384x128xf32>) -> tensor<16384x128xf32>
-    %516 = stablehlo.broadcast_in_dim %arg132, dims = [1] : (tensor<128xf32>) -> tensor<16384x128xf32>
-    %517 = stablehlo.add %515, %516 : tensor<16384x128xf32>
-    %518 = stablehlo.convert %517 : (tensor<16384x128xf32>) -> tensor<16384x128xbf16>
-    %519 = stablehlo.reshape %518 : (tensor<16384x128xbf16>) -> tensor<1x16384x128xbf16>
-    %520 = stablehlo.transpose %519, dims = [0, 2, 1] : (tensor<1x16384x128xbf16>) -> tensor<1x128x16384xbf16>
-    %521 = stablehlo.reshape %520 : (tensor<1x128x16384xbf16>) -> tensor<1x128x128x128xbf16>
-    %522 = stablehlo.convolution(%521, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 128 : i64} : (tensor<1x128x128x128xbf16>, tensor<128x1x3x3xbf16>) -> tensor<1x128x128x128xbf16>
-    %523 = stablehlo.reshape %arg25 : (tensor<128xbf16>) -> tensor<128x1x1xbf16>
-    %524 = stablehlo.broadcast_in_dim %522, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %525 = stablehlo.broadcast_in_dim %523, dims = [1, 2, 3] : (tensor<128x1x1xbf16>) -> tensor<1x128x128x128xbf16>
-    %526 = stablehlo.add %524, %525 : tensor<1x128x128x128xbf16>
-    %527 = stablehlo.reshape %526 : (tensor<1x128x128x128xbf16>) -> tensor<1x128x16384xbf16>
-    %528 = stablehlo.transpose %527, dims = [0, 2, 1] : (tensor<1x128x16384xbf16>) -> tensor<1x16384x128xbf16>
-    %529 = stablehlo.multiply %528, %cst_4 : tensor<1x16384x128xbf16>
-    %530 = stablehlo.multiply %528, %277 : tensor<1x16384x128xbf16>
-    %531 = stablehlo.convert %530 : (tensor<1x16384x128xbf16>) -> tensor<1x16384x128xf32>
-    %532 = stablehlo.clamp %cst_5, %531, %cst_6 : tensor<1x16384x128xf32>
-    %533 = stablehlo.multiply %532, %532 : tensor<1x16384x128xf32>
-    %534 = stablehlo.multiply %cst_7, %533 : tensor<1x16384x128xf32>
-    %535 = stablehlo.add %534, %cst_8 : tensor<1x16384x128xf32>
-    %536 = stablehlo.multiply %535, %533 : tensor<1x16384x128xf32>
-    %537 = stablehlo.add %536, %cst_9 : tensor<1x16384x128xf32>
-    %538 = stablehlo.multiply %537, %533 : tensor<1x16384x128xf32>
-    %539 = stablehlo.add %538, %cst_10 : tensor<1x16384x128xf32>
-    %540 = stablehlo.multiply %539, %533 : tensor<1x16384x128xf32>
-    %541 = stablehlo.add %540, %cst_11 : tensor<1x16384x128xf32>
-    %542 = stablehlo.multiply %541, %533 : tensor<1x16384x128xf32>
-    %543 = stablehlo.add %542, %cst_12 : tensor<1x16384x128xf32>
-    %544 = stablehlo.multiply %543, %533 : tensor<1x16384x128xf32>
-    %545 = stablehlo.add %544, %cst_13 : tensor<1x16384x128xf32>
-    %546 = stablehlo.multiply %cst_14, %533 : tensor<1x16384x128xf32>
-    %547 = stablehlo.add %546, %cst_15 : tensor<1x16384x128xf32>
-    %548 = stablehlo.multiply %547, %533 : tensor<1x16384x128xf32>
-    %549 = stablehlo.add %548, %cst_16 : tensor<1x16384x128xf32>
-    %550 = stablehlo.multiply %549, %533 : tensor<1x16384x128xf32>
-    %551 = stablehlo.add %550, %cst_17 : tensor<1x16384x128xf32>
-    %552 = stablehlo.multiply %551, %533 : tensor<1x16384x128xf32>
-    %553 = stablehlo.add %552, %cst_18 : tensor<1x16384x128xf32>
-    %554 = stablehlo.multiply %532, %545 : tensor<1x16384x128xf32>
-    %555 = stablehlo.divide %554, %553 : tensor<1x16384x128xf32>
-    %556 = stablehlo.clamp %cst_19, %555, %cst_20 : tensor<1x16384x128xf32>
-    %557 = stablehlo.convert %556 : (tensor<1x16384x128xf32>) -> tensor<1x16384x128xbf16>
-    %558 = stablehlo.add %557, %cst_2 : tensor<1x16384x128xbf16>
-    %559 = stablehlo.multiply %558, %529 : tensor<1x16384x128xbf16>
-    %560 = stablehlo.reshape %559 : (tensor<1x16384x128xbf16>) -> tensor<16384x128xbf16>
-    %561 = stablehlo.dot_general %560, %arg133, contracting_dims = [1] x [0] : (tensor<16384x128xbf16>, tensor<128x32xbf16>) -> tensor<16384x32xbf16>
-    %562 = stablehlo.reshape %561 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %563 = stablehlo.broadcast_in_dim %562, dims = [0, 1, 2] : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %564 = stablehlo.broadcast_in_dim %arg26, dims = [2] : (tensor<32xbf16>) -> tensor<1x16384x32xbf16>
-    %565 = stablehlo.add %563, %564 : tensor<1x16384x32xbf16>
-    %566 = stablehlo.reshape %565 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %567 = stablehlo.reshape %566 : (tensor<16384x32xbf16>) -> tensor<1x16384x32xbf16>
-    %568 = stablehlo.add %567, %472 : tensor<1x16384x32xbf16>
-    %569 = stablehlo.convert %568 : (tensor<1x16384x32xbf16>) -> tensor<1x16384x32xf32>
-    %570 = stablehlo.convert %569 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf64>
-    %571 = stablehlo.reduce(%570 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %572 = stablehlo.reshape %571 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %573 = stablehlo.broadcast_in_dim %572, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %574 = stablehlo.divide %573, %14 : tensor<1x16384x1xf64>
-    %575 = stablehlo.broadcast_in_dim %570, dims = [0, 1, 2] : (tensor<1x16384x32xf64>) -> tensor<1x16384x32xf64>
-    %576 = stablehlo.broadcast_in_dim %574, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x32xf64>
-    %577 = stablehlo.subtract %575, %576 : tensor<1x16384x32xf64>
-    %578 = stablehlo.multiply %577, %577 : tensor<1x16384x32xf64>
-    %579 = stablehlo.reduce(%578 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf64>, tensor<f64>) -> tensor<1x16384xf64>
-    %580 = stablehlo.reshape %579 : (tensor<1x16384xf64>) -> tensor<1x16384x1xf64>
-    %581 = stablehlo.broadcast_in_dim %580, dims = [0, 1, 2] : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf64>
-    %582 = stablehlo.divide %581, %14 : tensor<1x16384x1xf64>
-    %583 = stablehlo.convert %582 : (tensor<1x16384x1xf64>) -> tensor<1x16384x1xf32>
-    %584 = stablehlo.reduce(%569 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16384x32xf32>, tensor<f32>) -> tensor<1x16384xf32>
-    %585 = stablehlo.reshape %584 : (tensor<1x16384xf32>) -> tensor<1x16384x1xf32>
-    %586 = stablehlo.broadcast_in_dim %585, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %587 = stablehlo.divide %586, %30 : tensor<1x16384x1xf32>
-    %588 = stablehlo.broadcast_in_dim %583, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x1xf32>
-    %589 = stablehlo.add %588, %35 : tensor<1x16384x1xf32>
-    %590 = stablehlo.rsqrt %589 : tensor<1x16384x1xf32>
-    %591 = stablehlo.broadcast_in_dim %569, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %592 = stablehlo.broadcast_in_dim %587, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %593 = stablehlo.subtract %591, %592 : tensor<1x16384x32xf32>
-    %594 = stablehlo.broadcast_in_dim %593, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %595 = stablehlo.broadcast_in_dim %590, dims = [0, 1, 2] : (tensor<1x16384x1xf32>) -> tensor<1x16384x32xf32>
-    %596 = stablehlo.multiply %594, %595 : tensor<1x16384x32xf32>
-    %597 = stablehlo.convert %arg27 : (tensor<32xbf16>) -> tensor<32xf32>
-    %598 = stablehlo.broadcast_in_dim %596, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %599 = stablehlo.broadcast_in_dim %597, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %600 = stablehlo.multiply %598, %599 : tensor<1x16384x32xf32>
-    %601 = stablehlo.convert %arg28 : (tensor<32xbf16>) -> tensor<32xf32>
-    %602 = stablehlo.broadcast_in_dim %600, dims = [0, 1, 2] : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xf32>
-    %603 = stablehlo.broadcast_in_dim %601, dims = [2] : (tensor<32xf32>) -> tensor<1x16384x32xf32>
-    %604 = stablehlo.add %602, %603 : tensor<1x16384x32xf32>
-    %605 = stablehlo.convert %604 : (tensor<1x16384x32xf32>) -> tensor<1x16384x32xbf16>
-    %606 = stablehlo.reshape %605 : (tensor<1x16384x32xbf16>) -> tensor<1x128x128x32xbf16>
-    %607 = stablehlo.transpose %606, dims = [0, 3, 1, 2] : (tensor<1x128x128x32xbf16>) -> tensor<1x32x128x128xbf16>
-    %608 = stablehlo.convolution(%607, %arg29) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x128x128xbf16>, tensor<64x32x3x3xbf16>) -> tensor<1x64x64x64xbf16>
-    %609 = stablehlo.reshape %arg30 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %610 = stablehlo.broadcast_in_dim %608, dims = [0, 1, 2, 3] : (tensor<1x64x64x64xbf16>) -> tensor<1x64x64x64xbf16>
-    %611 = stablehlo.broadcast_in_dim %609, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x64x64xbf16>
-    %612 = stablehlo.add %610, %611 : tensor<1x64x64x64xbf16>
-    %613 = stablehlo.reshape %612 : (tensor<1x64x64x64xbf16>) -> tensor<1x64x4096xbf16>
-    %614 = stablehlo.transpose %613, dims = [0, 2, 1] : (tensor<1x64x4096xbf16>) -> tensor<1x4096x64xbf16>
-    %615 = stablehlo.convert %614 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xf32>
-    %616 = stablehlo.convert %615 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf64>
-    %617 = stablehlo.reduce(%616 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %618 = stablehlo.reshape %617 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %619 = stablehlo.convert %cst_83 : (tensor<1xi64>) -> tensor<1xf64>
-    %620 = stablehlo.reshape %619 : (tensor<1xf64>) -> tensor<f64>
-    %621 = stablehlo.broadcast_in_dim %618, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %622 = stablehlo.broadcast_in_dim %620, dims = [] : (tensor<f64>) -> tensor<1x4096x1xf64>
-    %623 = stablehlo.divide %621, %622 : tensor<1x4096x1xf64>
-    %624 = stablehlo.broadcast_in_dim %616, dims = [0, 1, 2] : (tensor<1x4096x64xf64>) -> tensor<1x4096x64xf64>
-    %625 = stablehlo.broadcast_in_dim %623, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x64xf64>
-    %626 = stablehlo.subtract %624, %625 : tensor<1x4096x64xf64>
-    %627 = stablehlo.multiply %626, %626 : tensor<1x4096x64xf64>
-    %628 = stablehlo.reduce(%627 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %629 = stablehlo.reshape %628 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %630 = stablehlo.broadcast_in_dim %629, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %631 = stablehlo.divide %630, %622 : tensor<1x4096x1xf64>
-    %632 = stablehlo.convert %631 : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf32>
-    %633 = stablehlo.reduce(%615 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf32>, tensor<f32>) -> tensor<1x4096xf32>
-    %634 = stablehlo.reshape %633 : (tensor<1x4096xf32>) -> tensor<1x4096x1xf32>
-    %635 = stablehlo.convert %cst_83 : (tensor<1xi64>) -> tensor<1xf32>
-    %636 = stablehlo.reshape %635 : (tensor<1xf32>) -> tensor<f32>
-    %637 = stablehlo.broadcast_in_dim %634, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %638 = stablehlo.broadcast_in_dim %636, dims = [] : (tensor<f32>) -> tensor<1x4096x1xf32>
-    %639 = stablehlo.divide %637, %638 : tensor<1x4096x1xf32>
-    %640 = stablehlo.broadcast_in_dim %632, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %641 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x4096x1xf32>
-    %642 = stablehlo.add %640, %641 : tensor<1x4096x1xf32>
-    %643 = stablehlo.rsqrt %642 : tensor<1x4096x1xf32>
-    %644 = stablehlo.broadcast_in_dim %615, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %645 = stablehlo.broadcast_in_dim %639, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %646 = stablehlo.subtract %644, %645 : tensor<1x4096x64xf32>
-    %647 = stablehlo.broadcast_in_dim %646, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %648 = stablehlo.broadcast_in_dim %643, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %649 = stablehlo.multiply %647, %648 : tensor<1x4096x64xf32>
-    %650 = stablehlo.convert %arg31 : (tensor<64xbf16>) -> tensor<64xf32>
-    %651 = stablehlo.broadcast_in_dim %649, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %652 = stablehlo.broadcast_in_dim %650, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %653 = stablehlo.multiply %651, %652 : tensor<1x4096x64xf32>
-    %654 = stablehlo.convert %arg32 : (tensor<64xbf16>) -> tensor<64xf32>
-    %655 = stablehlo.broadcast_in_dim %653, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %656 = stablehlo.broadcast_in_dim %654, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %657 = stablehlo.add %655, %656 : tensor<1x4096x64xf32>
-    %658 = stablehlo.convert %657 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xbf16>
-    %659 = stablehlo.convert %658 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xf32>
-    %660 = stablehlo.convert %659 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf64>
-    %661 = stablehlo.reduce(%660 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %662 = stablehlo.reshape %661 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %663 = stablehlo.broadcast_in_dim %662, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %664 = stablehlo.divide %663, %622 : tensor<1x4096x1xf64>
-    %665 = stablehlo.broadcast_in_dim %660, dims = [0, 1, 2] : (tensor<1x4096x64xf64>) -> tensor<1x4096x64xf64>
-    %666 = stablehlo.broadcast_in_dim %664, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x64xf64>
-    %667 = stablehlo.subtract %665, %666 : tensor<1x4096x64xf64>
-    %668 = stablehlo.multiply %667, %667 : tensor<1x4096x64xf64>
-    %669 = stablehlo.reduce(%668 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %670 = stablehlo.reshape %669 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %671 = stablehlo.broadcast_in_dim %670, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %672 = stablehlo.divide %671, %622 : tensor<1x4096x1xf64>
-    %673 = stablehlo.convert %672 : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf32>
-    %674 = stablehlo.reduce(%659 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf32>, tensor<f32>) -> tensor<1x4096xf32>
-    %675 = stablehlo.reshape %674 : (tensor<1x4096xf32>) -> tensor<1x4096x1xf32>
-    %676 = stablehlo.broadcast_in_dim %675, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %677 = stablehlo.divide %676, %638 : tensor<1x4096x1xf32>
-    %678 = stablehlo.broadcast_in_dim %673, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %679 = stablehlo.add %678, %641 : tensor<1x4096x1xf32>
-    %680 = stablehlo.rsqrt %679 : tensor<1x4096x1xf32>
-    %681 = stablehlo.broadcast_in_dim %659, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %682 = stablehlo.broadcast_in_dim %677, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %683 = stablehlo.subtract %681, %682 : tensor<1x4096x64xf32>
-    %684 = stablehlo.broadcast_in_dim %683, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %685 = stablehlo.broadcast_in_dim %680, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %686 = stablehlo.multiply %684, %685 : tensor<1x4096x64xf32>
-    %687 = stablehlo.convert %arg33 : (tensor<64xbf16>) -> tensor<64xf32>
-    %688 = stablehlo.broadcast_in_dim %686, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %689 = stablehlo.broadcast_in_dim %687, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %690 = stablehlo.multiply %688, %689 : tensor<1x4096x64xf32>
-    %691 = stablehlo.convert %arg34 : (tensor<64xbf16>) -> tensor<64xf32>
-    %692 = stablehlo.broadcast_in_dim %690, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %693 = stablehlo.broadcast_in_dim %691, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %694 = stablehlo.add %692, %693 : tensor<1x4096x64xf32>
-    %695 = stablehlo.convert %694 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xbf16>
-    %696 = stablehlo.reshape %695 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %697 = stablehlo.convert %696 : (tensor<4096x64xbf16>) -> tensor<4096x64xf32>
-    %698 = stablehlo.dot_general %697, %arg134, contracting_dims = [1] x [0] : (tensor<4096x64xf32>, tensor<64x64xf32>) -> tensor<4096x64xf32>
-    %699 = stablehlo.broadcast_in_dim %698, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %700 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<4096x64xf32>
-    %701 = stablehlo.multiply %699, %700 : tensor<4096x64xf32>
-    %702 = stablehlo.broadcast_in_dim %701, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %703 = stablehlo.broadcast_in_dim %arg135, dims = [1] : (tensor<64xf32>) -> tensor<4096x64xf32>
-    %704 = stablehlo.add %702, %703 : tensor<4096x64xf32>
-    %705 = stablehlo.convert %704 : (tensor<4096x64xf32>) -> tensor<4096x64xbf16>
-    %706 = stablehlo.reshape %705 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %707 = stablehlo.reshape %706 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x2x32xbf16>
-    %708 = stablehlo.transpose %707, dims = [0, 2, 1, 3] : (tensor<1x4096x2x32xbf16>) -> tensor<1x2x4096x32xbf16>
-    %709 = stablehlo.transpose %695, dims = [0, 2, 1] : (tensor<1x4096x64xbf16>) -> tensor<1x64x4096xbf16>
-    %710 = stablehlo.reshape %709 : (tensor<1x64x4096xbf16>) -> tensor<1x64x64x64xbf16>
-    %711 = stablehlo.convolution(%710, %arg35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x64x64xbf16>, tensor<64x64x4x4xbf16>) -> tensor<1x64x16x16xbf16>
-    %712 = stablehlo.reshape %arg36 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %713 = stablehlo.broadcast_in_dim %711, dims = [0, 1, 2, 3] : (tensor<1x64x16x16xbf16>) -> tensor<1x64x16x16xbf16>
-    %714 = stablehlo.broadcast_in_dim %712, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x16x16xbf16>
-    %715 = stablehlo.add %713, %714 : tensor<1x64x16x16xbf16>
-    %716 = stablehlo.reshape %715 : (tensor<1x64x16x16xbf16>) -> tensor<1x64x256xbf16>
-    %717 = stablehlo.transpose %716, dims = [0, 2, 1] : (tensor<1x64x256xbf16>) -> tensor<1x256x64xbf16>
-    %718 = stablehlo.convert %717 : (tensor<1x256x64xbf16>) -> tensor<1x256x64xf32>
-    %719 = stablehlo.convert %718 : (tensor<1x256x64xf32>) -> tensor<1x256x64xf64>
-    %720 = stablehlo.reduce(%719 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x64xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %721 = stablehlo.reshape %720 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %722 = stablehlo.broadcast_in_dim %721, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %723 = stablehlo.broadcast_in_dim %620, dims = [] : (tensor<f64>) -> tensor<1x256x1xf64>
-    %724 = stablehlo.divide %722, %723 : tensor<1x256x1xf64>
-    %725 = stablehlo.broadcast_in_dim %719, dims = [0, 1, 2] : (tensor<1x256x64xf64>) -> tensor<1x256x64xf64>
-    %726 = stablehlo.broadcast_in_dim %724, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x64xf64>
-    %727 = stablehlo.subtract %725, %726 : tensor<1x256x64xf64>
-    %728 = stablehlo.multiply %727, %727 : tensor<1x256x64xf64>
-    %729 = stablehlo.reduce(%728 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x64xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %730 = stablehlo.reshape %729 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %731 = stablehlo.broadcast_in_dim %730, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %732 = stablehlo.divide %731, %723 : tensor<1x256x1xf64>
-    %733 = stablehlo.convert %732 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %734 = stablehlo.reduce(%718 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x64xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %735 = stablehlo.reshape %734 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %736 = stablehlo.broadcast_in_dim %735, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %737 = stablehlo.broadcast_in_dim %636, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %738 = stablehlo.divide %736, %737 : tensor<1x256x1xf32>
-    %739 = stablehlo.broadcast_in_dim %733, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %740 = stablehlo.add %739, %136 : tensor<1x256x1xf32>
-    %741 = stablehlo.rsqrt %740 : tensor<1x256x1xf32>
-    %742 = stablehlo.broadcast_in_dim %718, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %743 = stablehlo.broadcast_in_dim %738, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x64xf32>
-    %744 = stablehlo.subtract %742, %743 : tensor<1x256x64xf32>
-    %745 = stablehlo.broadcast_in_dim %744, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %746 = stablehlo.broadcast_in_dim %741, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x64xf32>
-    %747 = stablehlo.multiply %745, %746 : tensor<1x256x64xf32>
-    %748 = stablehlo.convert %arg37 : (tensor<64xbf16>) -> tensor<64xf32>
-    %749 = stablehlo.broadcast_in_dim %747, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %750 = stablehlo.broadcast_in_dim %748, dims = [2] : (tensor<64xf32>) -> tensor<1x256x64xf32>
-    %751 = stablehlo.multiply %749, %750 : tensor<1x256x64xf32>
-    %752 = stablehlo.convert %arg38 : (tensor<64xbf16>) -> tensor<64xf32>
-    %753 = stablehlo.broadcast_in_dim %751, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %754 = stablehlo.broadcast_in_dim %752, dims = [2] : (tensor<64xf32>) -> tensor<1x256x64xf32>
-    %755 = stablehlo.add %753, %754 : tensor<1x256x64xf32>
-    %756 = stablehlo.convert %755 : (tensor<1x256x64xf32>) -> tensor<1x256x64xbf16>
-    %757 = stablehlo.reshape %756 : (tensor<1x256x64xbf16>) -> tensor<256x64xbf16>
-    %758 = stablehlo.convert %757 : (tensor<256x64xbf16>) -> tensor<256x64xf32>
-    %759 = stablehlo.dot_general %758, %arg136, contracting_dims = [1] x [0] : (tensor<256x64xf32>, tensor<64x64xf32>) -> tensor<256x64xf32>
-    %760 = stablehlo.broadcast_in_dim %759, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %761 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<256x64xf32>
-    %762 = stablehlo.multiply %760, %761 : tensor<256x64xf32>
-    %763 = stablehlo.broadcast_in_dim %762, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %764 = stablehlo.broadcast_in_dim %arg137, dims = [1] : (tensor<64xf32>) -> tensor<256x64xf32>
-    %765 = stablehlo.add %763, %764 : tensor<256x64xf32>
-    %766 = stablehlo.convert %765 : (tensor<256x64xf32>) -> tensor<256x64xbf16>
-    %767 = stablehlo.reshape %766 : (tensor<256x64xbf16>) -> tensor<1x256x64xbf16>
-    %768 = stablehlo.reshape %767 : (tensor<1x256x64xbf16>) -> tensor<1x256x2x32xbf16>
-    %769 = stablehlo.transpose %768, dims = [0, 2, 1, 3] : (tensor<1x256x2x32xbf16>) -> tensor<1x2x256x32xbf16>
-    %770 = stablehlo.dot_general %758, %arg138, contracting_dims = [1] x [0] : (tensor<256x64xf32>, tensor<64x64xf32>) -> tensor<256x64xf32>
-    %771 = stablehlo.broadcast_in_dim %770, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %772 = stablehlo.multiply %771, %761 : tensor<256x64xf32>
-    %773 = stablehlo.broadcast_in_dim %772, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %774 = stablehlo.broadcast_in_dim %arg139, dims = [1] : (tensor<64xf32>) -> tensor<256x64xf32>
-    %775 = stablehlo.add %773, %774 : tensor<256x64xf32>
-    %776 = stablehlo.convert %775 : (tensor<256x64xf32>) -> tensor<256x64xbf16>
-    %777 = stablehlo.reshape %776 : (tensor<256x64xbf16>) -> tensor<1x256x64xbf16>
-    %778 = stablehlo.reshape %777 : (tensor<1x256x64xbf16>) -> tensor<1x256x2x32xbf16>
-    %779 = stablehlo.transpose %778, dims = [0, 2, 1, 3] : (tensor<1x256x2x32xbf16>) -> tensor<1x2x256x32xbf16>
-    %780 = stablehlo.transpose %769, dims = [0, 1, 3, 2] : (tensor<1x2x256x32xbf16>) -> tensor<1x2x32x256xbf16>
-    %781 = stablehlo.reshape %708 : (tensor<1x2x4096x32xbf16>) -> tensor<2x4096x32xbf16>
-    %782 = stablehlo.reshape %780 : (tensor<1x2x32x256xbf16>) -> tensor<2x32x256xbf16>
-    %783 = stablehlo.broadcast_in_dim %782, dims = [0, 1, 2] : (tensor<2x32x256xbf16>) -> tensor<2x32x256xbf16>
-    %784 = stablehlo.dot_general %781, %783, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4096x32xbf16>, tensor<2x32x256xbf16>) -> tensor<2x4096x256xbf16>
-    %785 = stablehlo.reshape %784 : (tensor<2x4096x256xbf16>) -> tensor<1x2x4096x256xbf16>
-    %786 = stablehlo.broadcast_in_dim %785, dims = [0, 1, 2, 3] : (tensor<1x2x4096x256xbf16>) -> tensor<1x2x4096x256xbf16>
-    %787 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x2x4096x256xbf16>
-    %788 = stablehlo.divide %786, %787 : tensor<1x2x4096x256xbf16>
-    %789 = stablehlo.convert %788 : (tensor<1x2x4096x256xbf16>) -> tensor<1x2x4096x256xf32>
-    %790 = stablehlo.reduce(%789 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4096x256xf32>, tensor<f32>) -> tensor<1x2x4096xf32>
-    %791 = stablehlo.reshape %790 : (tensor<1x2x4096xf32>) -> tensor<1x2x4096x1xf32>
-    %792 = stablehlo.broadcast_in_dim %789, dims = [0, 1, 2, 3] : (tensor<1x2x4096x256xf32>) -> tensor<1x2x4096x256xf32>
-    %793 = stablehlo.broadcast_in_dim %791, dims = [0, 1, 2, 3] : (tensor<1x2x4096x1xf32>) -> tensor<1x2x4096x256xf32>
-    %794 = stablehlo.subtract %792, %793 : tensor<1x2x4096x256xf32>
-    %795 = stablehlo.exponential %794 : tensor<1x2x4096x256xf32>
-    %796 = stablehlo.reduce(%795 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4096x256xf32>, tensor<f32>) -> tensor<1x2x4096xf32>
-    %797 = stablehlo.reshape %796 : (tensor<1x2x4096xf32>) -> tensor<1x2x4096x1xf32>
-    %798 = stablehlo.broadcast_in_dim %795, dims = [0, 1, 2, 3] : (tensor<1x2x4096x256xf32>) -> tensor<1x2x4096x256xf32>
-    %799 = stablehlo.broadcast_in_dim %797, dims = [0, 1, 2, 3] : (tensor<1x2x4096x1xf32>) -> tensor<1x2x4096x256xf32>
-    %800 = stablehlo.divide %798, %799 : tensor<1x2x4096x256xf32>
-    %801 = stablehlo.convert %800 : (tensor<1x2x4096x256xf32>) -> tensor<1x2x4096x256xbf16>
-    %802 = stablehlo.reshape %801 : (tensor<1x2x4096x256xbf16>) -> tensor<2x4096x256xbf16>
-    %803 = stablehlo.reshape %779 : (tensor<1x2x256x32xbf16>) -> tensor<2x256x32xbf16>
-    %804 = stablehlo.broadcast_in_dim %803, dims = [0, 1, 2] : (tensor<2x256x32xbf16>) -> tensor<2x256x32xbf16>
-    %805 = stablehlo.dot_general %802, %804, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4096x256xbf16>, tensor<2x256x32xbf16>) -> tensor<2x4096x32xbf16>
-    %806 = stablehlo.reshape %805 : (tensor<2x4096x32xbf16>) -> tensor<1x2x4096x32xbf16>
-    %807 = stablehlo.transpose %806, dims = [0, 2, 1, 3] : (tensor<1x2x4096x32xbf16>) -> tensor<1x4096x2x32xbf16>
-    %808 = stablehlo.reshape %807 : (tensor<1x4096x2x32xbf16>) -> tensor<1x4096x64xbf16>
-    %809 = stablehlo.reshape %808 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %810 = stablehlo.convert %809 : (tensor<4096x64xbf16>) -> tensor<4096x64xf32>
-    %811 = stablehlo.dot_general %810, %arg140, contracting_dims = [1] x [0] : (tensor<4096x64xf32>, tensor<64x64xf32>) -> tensor<4096x64xf32>
-    %812 = stablehlo.broadcast_in_dim %811, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %813 = stablehlo.multiply %812, %700 : tensor<4096x64xf32>
-    %814 = stablehlo.broadcast_in_dim %813, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %815 = stablehlo.broadcast_in_dim %arg141, dims = [1] : (tensor<64xf32>) -> tensor<4096x64xf32>
-    %816 = stablehlo.add %814, %815 : tensor<4096x64xf32>
-    %817 = stablehlo.convert %816 : (tensor<4096x64xf32>) -> tensor<4096x64xbf16>
-    %818 = stablehlo.reshape %817 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %819 = stablehlo.add %818, %658 : tensor<1x4096x64xbf16>
-    %820 = stablehlo.convert %819 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xf32>
-    %821 = stablehlo.convert %820 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf64>
-    %822 = stablehlo.reduce(%821 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %823 = stablehlo.reshape %822 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %824 = stablehlo.broadcast_in_dim %823, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %825 = stablehlo.divide %824, %622 : tensor<1x4096x1xf64>
-    %826 = stablehlo.broadcast_in_dim %821, dims = [0, 1, 2] : (tensor<1x4096x64xf64>) -> tensor<1x4096x64xf64>
-    %827 = stablehlo.broadcast_in_dim %825, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x64xf64>
-    %828 = stablehlo.subtract %826, %827 : tensor<1x4096x64xf64>
-    %829 = stablehlo.multiply %828, %828 : tensor<1x4096x64xf64>
-    %830 = stablehlo.reduce(%829 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %831 = stablehlo.reshape %830 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %832 = stablehlo.broadcast_in_dim %831, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %833 = stablehlo.divide %832, %622 : tensor<1x4096x1xf64>
-    %834 = stablehlo.convert %833 : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf32>
-    %835 = stablehlo.reduce(%820 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf32>, tensor<f32>) -> tensor<1x4096xf32>
-    %836 = stablehlo.reshape %835 : (tensor<1x4096xf32>) -> tensor<1x4096x1xf32>
-    %837 = stablehlo.broadcast_in_dim %836, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %838 = stablehlo.divide %837, %638 : tensor<1x4096x1xf32>
-    %839 = stablehlo.broadcast_in_dim %834, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %840 = stablehlo.add %839, %641 : tensor<1x4096x1xf32>
-    %841 = stablehlo.rsqrt %840 : tensor<1x4096x1xf32>
-    %842 = stablehlo.broadcast_in_dim %820, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %843 = stablehlo.broadcast_in_dim %838, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %844 = stablehlo.subtract %842, %843 : tensor<1x4096x64xf32>
-    %845 = stablehlo.broadcast_in_dim %844, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %846 = stablehlo.broadcast_in_dim %841, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %847 = stablehlo.multiply %845, %846 : tensor<1x4096x64xf32>
-    %848 = stablehlo.convert %arg39 : (tensor<64xbf16>) -> tensor<64xf32>
-    %849 = stablehlo.broadcast_in_dim %847, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %850 = stablehlo.broadcast_in_dim %848, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %851 = stablehlo.multiply %849, %850 : tensor<1x4096x64xf32>
-    %852 = stablehlo.convert %arg40 : (tensor<64xbf16>) -> tensor<64xf32>
-    %853 = stablehlo.broadcast_in_dim %851, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %854 = stablehlo.broadcast_in_dim %852, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %855 = stablehlo.add %853, %854 : tensor<1x4096x64xf32>
-    %856 = stablehlo.convert %855 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xbf16>
-    %857 = stablehlo.reshape %856 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %858 = stablehlo.convert %857 : (tensor<4096x64xbf16>) -> tensor<4096x64xf32>
-    %859 = stablehlo.dot_general %858, %arg142, contracting_dims = [1] x [0] : (tensor<4096x64xf32>, tensor<64x256xf32>) -> tensor<4096x256xf32>
-    %860 = stablehlo.broadcast_in_dim %859, dims = [0, 1] : (tensor<4096x256xf32>) -> tensor<4096x256xf32>
-    %861 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<4096x256xf32>
-    %862 = stablehlo.multiply %860, %861 : tensor<4096x256xf32>
-    %863 = stablehlo.broadcast_in_dim %862, dims = [0, 1] : (tensor<4096x256xf32>) -> tensor<4096x256xf32>
-    %864 = stablehlo.broadcast_in_dim %arg143, dims = [1] : (tensor<256xf32>) -> tensor<4096x256xf32>
-    %865 = stablehlo.add %863, %864 : tensor<4096x256xf32>
-    %866 = stablehlo.convert %865 : (tensor<4096x256xf32>) -> tensor<4096x256xbf16>
-    %867 = stablehlo.reshape %866 : (tensor<4096x256xbf16>) -> tensor<1x4096x256xbf16>
-    %868 = stablehlo.transpose %867, dims = [0, 2, 1] : (tensor<1x4096x256xbf16>) -> tensor<1x256x4096xbf16>
-    %869 = stablehlo.reshape %868 : (tensor<1x256x4096xbf16>) -> tensor<1x256x64x64xbf16>
-    %870 = stablehlo.convolution(%869, %arg41) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x64x64xbf16>, tensor<256x1x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %871 = stablehlo.reshape %arg42 : (tensor<256xbf16>) -> tensor<256x1x1xbf16>
-    %872 = stablehlo.broadcast_in_dim %870, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %873 = stablehlo.broadcast_in_dim %871, dims = [1, 2, 3] : (tensor<256x1x1xbf16>) -> tensor<1x256x64x64xbf16>
-    %874 = stablehlo.add %872, %873 : tensor<1x256x64x64xbf16>
-    %875 = stablehlo.reshape %874 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x4096xbf16>
-    %876 = stablehlo.transpose %875, dims = [0, 2, 1] : (tensor<1x256x4096xbf16>) -> tensor<1x4096x256xbf16>
-    %877 = stablehlo.multiply %876, %cst_23 : tensor<1x4096x256xbf16>
-    %878 = stablehlo.rsqrt %cst_22 : tensor<1x4096x256xbf16>
-    %879 = stablehlo.multiply %876, %878 : tensor<1x4096x256xbf16>
-    %880 = stablehlo.convert %879 : (tensor<1x4096x256xbf16>) -> tensor<1x4096x256xf32>
-    %881 = stablehlo.clamp %cst_24, %880, %cst_25 : tensor<1x4096x256xf32>
-    %882 = stablehlo.multiply %881, %881 : tensor<1x4096x256xf32>
-    %883 = stablehlo.multiply %cst_26, %882 : tensor<1x4096x256xf32>
-    %884 = stablehlo.add %883, %cst_27 : tensor<1x4096x256xf32>
-    %885 = stablehlo.multiply %884, %882 : tensor<1x4096x256xf32>
-    %886 = stablehlo.add %885, %cst_28 : tensor<1x4096x256xf32>
-    %887 = stablehlo.multiply %886, %882 : tensor<1x4096x256xf32>
-    %888 = stablehlo.add %887, %cst_29 : tensor<1x4096x256xf32>
-    %889 = stablehlo.multiply %888, %882 : tensor<1x4096x256xf32>
-    %890 = stablehlo.add %889, %cst_30 : tensor<1x4096x256xf32>
-    %891 = stablehlo.multiply %890, %882 : tensor<1x4096x256xf32>
-    %892 = stablehlo.add %891, %cst_31 : tensor<1x4096x256xf32>
-    %893 = stablehlo.multiply %892, %882 : tensor<1x4096x256xf32>
-    %894 = stablehlo.add %893, %cst_32 : tensor<1x4096x256xf32>
-    %895 = stablehlo.multiply %cst_33, %882 : tensor<1x4096x256xf32>
-    %896 = stablehlo.add %895, %cst_34 : tensor<1x4096x256xf32>
-    %897 = stablehlo.multiply %896, %882 : tensor<1x4096x256xf32>
-    %898 = stablehlo.add %897, %cst_35 : tensor<1x4096x256xf32>
-    %899 = stablehlo.multiply %898, %882 : tensor<1x4096x256xf32>
-    %900 = stablehlo.add %899, %cst_36 : tensor<1x4096x256xf32>
-    %901 = stablehlo.multiply %900, %882 : tensor<1x4096x256xf32>
-    %902 = stablehlo.add %901, %cst_37 : tensor<1x4096x256xf32>
-    %903 = stablehlo.multiply %881, %894 : tensor<1x4096x256xf32>
-    %904 = stablehlo.divide %903, %902 : tensor<1x4096x256xf32>
-    %905 = stablehlo.clamp %cst_38, %904, %cst_39 : tensor<1x4096x256xf32>
-    %906 = stablehlo.convert %905 : (tensor<1x4096x256xf32>) -> tensor<1x4096x256xbf16>
-    %907 = stablehlo.add %906, %cst_21 : tensor<1x4096x256xbf16>
-    %908 = stablehlo.multiply %907, %877 : tensor<1x4096x256xbf16>
-    %909 = stablehlo.reshape %908 : (tensor<1x4096x256xbf16>) -> tensor<4096x256xbf16>
-    %910 = stablehlo.dot_general %909, %arg144, contracting_dims = [1] x [0] : (tensor<4096x256xbf16>, tensor<256x64xbf16>) -> tensor<4096x64xbf16>
-    %911 = stablehlo.reshape %910 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %912 = stablehlo.broadcast_in_dim %911, dims = [0, 1, 2] : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %913 = stablehlo.broadcast_in_dim %arg43, dims = [2] : (tensor<64xbf16>) -> tensor<1x4096x64xbf16>
-    %914 = stablehlo.add %912, %913 : tensor<1x4096x64xbf16>
-    %915 = stablehlo.reshape %914 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %916 = stablehlo.reshape %915 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %917 = stablehlo.add %916, %819 : tensor<1x4096x64xbf16>
-    %918 = stablehlo.convert %917 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xf32>
-    %919 = stablehlo.convert %918 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf64>
-    %920 = stablehlo.reduce(%919 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %921 = stablehlo.reshape %920 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %922 = stablehlo.broadcast_in_dim %921, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %923 = stablehlo.divide %922, %622 : tensor<1x4096x1xf64>
-    %924 = stablehlo.broadcast_in_dim %919, dims = [0, 1, 2] : (tensor<1x4096x64xf64>) -> tensor<1x4096x64xf64>
-    %925 = stablehlo.broadcast_in_dim %923, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x64xf64>
-    %926 = stablehlo.subtract %924, %925 : tensor<1x4096x64xf64>
-    %927 = stablehlo.multiply %926, %926 : tensor<1x4096x64xf64>
-    %928 = stablehlo.reduce(%927 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %929 = stablehlo.reshape %928 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %930 = stablehlo.broadcast_in_dim %929, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %931 = stablehlo.divide %930, %622 : tensor<1x4096x1xf64>
-    %932 = stablehlo.convert %931 : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf32>
-    %933 = stablehlo.reduce(%918 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf32>, tensor<f32>) -> tensor<1x4096xf32>
-    %934 = stablehlo.reshape %933 : (tensor<1x4096xf32>) -> tensor<1x4096x1xf32>
-    %935 = stablehlo.broadcast_in_dim %934, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %936 = stablehlo.divide %935, %638 : tensor<1x4096x1xf32>
-    %937 = stablehlo.broadcast_in_dim %932, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %938 = stablehlo.add %937, %641 : tensor<1x4096x1xf32>
-    %939 = stablehlo.rsqrt %938 : tensor<1x4096x1xf32>
-    %940 = stablehlo.broadcast_in_dim %918, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %941 = stablehlo.broadcast_in_dim %936, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %942 = stablehlo.subtract %940, %941 : tensor<1x4096x64xf32>
-    %943 = stablehlo.broadcast_in_dim %942, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %944 = stablehlo.broadcast_in_dim %939, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %945 = stablehlo.multiply %943, %944 : tensor<1x4096x64xf32>
-    %946 = stablehlo.convert %arg44 : (tensor<64xbf16>) -> tensor<64xf32>
-    %947 = stablehlo.broadcast_in_dim %945, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %948 = stablehlo.broadcast_in_dim %946, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %949 = stablehlo.multiply %947, %948 : tensor<1x4096x64xf32>
-    %950 = stablehlo.convert %arg45 : (tensor<64xbf16>) -> tensor<64xf32>
-    %951 = stablehlo.broadcast_in_dim %949, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %952 = stablehlo.broadcast_in_dim %950, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %953 = stablehlo.add %951, %952 : tensor<1x4096x64xf32>
-    %954 = stablehlo.convert %953 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xbf16>
-    %955 = stablehlo.reshape %954 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %956 = stablehlo.convert %955 : (tensor<4096x64xbf16>) -> tensor<4096x64xf32>
-    %957 = stablehlo.dot_general %956, %arg145, contracting_dims = [1] x [0] : (tensor<4096x64xf32>, tensor<64x64xf32>) -> tensor<4096x64xf32>
-    %958 = stablehlo.broadcast_in_dim %957, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %959 = stablehlo.multiply %958, %700 : tensor<4096x64xf32>
-    %960 = stablehlo.broadcast_in_dim %959, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %961 = stablehlo.broadcast_in_dim %arg146, dims = [1] : (tensor<64xf32>) -> tensor<4096x64xf32>
-    %962 = stablehlo.add %960, %961 : tensor<4096x64xf32>
-    %963 = stablehlo.convert %962 : (tensor<4096x64xf32>) -> tensor<4096x64xbf16>
-    %964 = stablehlo.reshape %963 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %965 = stablehlo.reshape %964 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x2x32xbf16>
-    %966 = stablehlo.transpose %965, dims = [0, 2, 1, 3] : (tensor<1x4096x2x32xbf16>) -> tensor<1x2x4096x32xbf16>
-    %967 = stablehlo.transpose %954, dims = [0, 2, 1] : (tensor<1x4096x64xbf16>) -> tensor<1x64x4096xbf16>
-    %968 = stablehlo.reshape %967 : (tensor<1x64x4096xbf16>) -> tensor<1x64x64x64xbf16>
-    %969 = stablehlo.convolution(%968, %arg46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x64x64xbf16>, tensor<64x64x4x4xbf16>) -> tensor<1x64x16x16xbf16>
-    %970 = stablehlo.reshape %arg47 : (tensor<64xbf16>) -> tensor<64x1x1xbf16>
-    %971 = stablehlo.broadcast_in_dim %969, dims = [0, 1, 2, 3] : (tensor<1x64x16x16xbf16>) -> tensor<1x64x16x16xbf16>
-    %972 = stablehlo.broadcast_in_dim %970, dims = [1, 2, 3] : (tensor<64x1x1xbf16>) -> tensor<1x64x16x16xbf16>
-    %973 = stablehlo.add %971, %972 : tensor<1x64x16x16xbf16>
-    %974 = stablehlo.reshape %973 : (tensor<1x64x16x16xbf16>) -> tensor<1x64x256xbf16>
-    %975 = stablehlo.transpose %974, dims = [0, 2, 1] : (tensor<1x64x256xbf16>) -> tensor<1x256x64xbf16>
-    %976 = stablehlo.convert %975 : (tensor<1x256x64xbf16>) -> tensor<1x256x64xf32>
-    %977 = stablehlo.convert %976 : (tensor<1x256x64xf32>) -> tensor<1x256x64xf64>
-    %978 = stablehlo.reduce(%977 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x64xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %979 = stablehlo.reshape %978 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %980 = stablehlo.broadcast_in_dim %979, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %981 = stablehlo.divide %980, %723 : tensor<1x256x1xf64>
-    %982 = stablehlo.broadcast_in_dim %977, dims = [0, 1, 2] : (tensor<1x256x64xf64>) -> tensor<1x256x64xf64>
-    %983 = stablehlo.broadcast_in_dim %981, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x64xf64>
-    %984 = stablehlo.subtract %982, %983 : tensor<1x256x64xf64>
-    %985 = stablehlo.multiply %984, %984 : tensor<1x256x64xf64>
-    %986 = stablehlo.reduce(%985 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x64xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %987 = stablehlo.reshape %986 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %988 = stablehlo.broadcast_in_dim %987, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %989 = stablehlo.divide %988, %723 : tensor<1x256x1xf64>
-    %990 = stablehlo.convert %989 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %991 = stablehlo.reduce(%976 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x64xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %992 = stablehlo.reshape %991 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %993 = stablehlo.broadcast_in_dim %992, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %994 = stablehlo.divide %993, %737 : tensor<1x256x1xf32>
-    %995 = stablehlo.broadcast_in_dim %990, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %996 = stablehlo.add %995, %136 : tensor<1x256x1xf32>
-    %997 = stablehlo.rsqrt %996 : tensor<1x256x1xf32>
-    %998 = stablehlo.broadcast_in_dim %976, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %999 = stablehlo.broadcast_in_dim %994, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x64xf32>
-    %1000 = stablehlo.subtract %998, %999 : tensor<1x256x64xf32>
-    %1001 = stablehlo.broadcast_in_dim %1000, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %1002 = stablehlo.broadcast_in_dim %997, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x64xf32>
-    %1003 = stablehlo.multiply %1001, %1002 : tensor<1x256x64xf32>
-    %1004 = stablehlo.convert %arg48 : (tensor<64xbf16>) -> tensor<64xf32>
-    %1005 = stablehlo.broadcast_in_dim %1003, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %1006 = stablehlo.broadcast_in_dim %1004, dims = [2] : (tensor<64xf32>) -> tensor<1x256x64xf32>
-    %1007 = stablehlo.multiply %1005, %1006 : tensor<1x256x64xf32>
-    %1008 = stablehlo.convert %arg49 : (tensor<64xbf16>) -> tensor<64xf32>
-    %1009 = stablehlo.broadcast_in_dim %1007, dims = [0, 1, 2] : (tensor<1x256x64xf32>) -> tensor<1x256x64xf32>
-    %1010 = stablehlo.broadcast_in_dim %1008, dims = [2] : (tensor<64xf32>) -> tensor<1x256x64xf32>
-    %1011 = stablehlo.add %1009, %1010 : tensor<1x256x64xf32>
-    %1012 = stablehlo.convert %1011 : (tensor<1x256x64xf32>) -> tensor<1x256x64xbf16>
-    %1013 = stablehlo.reshape %1012 : (tensor<1x256x64xbf16>) -> tensor<256x64xbf16>
-    %1014 = stablehlo.convert %1013 : (tensor<256x64xbf16>) -> tensor<256x64xf32>
-    %1015 = stablehlo.dot_general %1014, %arg147, contracting_dims = [1] x [0] : (tensor<256x64xf32>, tensor<64x64xf32>) -> tensor<256x64xf32>
-    %1016 = stablehlo.broadcast_in_dim %1015, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %1017 = stablehlo.multiply %1016, %761 : tensor<256x64xf32>
-    %1018 = stablehlo.broadcast_in_dim %1017, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %1019 = stablehlo.broadcast_in_dim %arg148, dims = [1] : (tensor<64xf32>) -> tensor<256x64xf32>
-    %1020 = stablehlo.add %1018, %1019 : tensor<256x64xf32>
-    %1021 = stablehlo.convert %1020 : (tensor<256x64xf32>) -> tensor<256x64xbf16>
-    %1022 = stablehlo.reshape %1021 : (tensor<256x64xbf16>) -> tensor<1x256x64xbf16>
-    %1023 = stablehlo.reshape %1022 : (tensor<1x256x64xbf16>) -> tensor<1x256x2x32xbf16>
-    %1024 = stablehlo.transpose %1023, dims = [0, 2, 1, 3] : (tensor<1x256x2x32xbf16>) -> tensor<1x2x256x32xbf16>
-    %1025 = stablehlo.dot_general %1014, %arg149, contracting_dims = [1] x [0] : (tensor<256x64xf32>, tensor<64x64xf32>) -> tensor<256x64xf32>
-    %1026 = stablehlo.broadcast_in_dim %1025, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %1027 = stablehlo.multiply %1026, %761 : tensor<256x64xf32>
-    %1028 = stablehlo.broadcast_in_dim %1027, dims = [0, 1] : (tensor<256x64xf32>) -> tensor<256x64xf32>
-    %1029 = stablehlo.broadcast_in_dim %arg150, dims = [1] : (tensor<64xf32>) -> tensor<256x64xf32>
-    %1030 = stablehlo.add %1028, %1029 : tensor<256x64xf32>
-    %1031 = stablehlo.convert %1030 : (tensor<256x64xf32>) -> tensor<256x64xbf16>
-    %1032 = stablehlo.reshape %1031 : (tensor<256x64xbf16>) -> tensor<1x256x64xbf16>
-    %1033 = stablehlo.reshape %1032 : (tensor<1x256x64xbf16>) -> tensor<1x256x2x32xbf16>
-    %1034 = stablehlo.transpose %1033, dims = [0, 2, 1, 3] : (tensor<1x256x2x32xbf16>) -> tensor<1x2x256x32xbf16>
-    %1035 = stablehlo.transpose %1024, dims = [0, 1, 3, 2] : (tensor<1x2x256x32xbf16>) -> tensor<1x2x32x256xbf16>
-    %1036 = stablehlo.reshape %966 : (tensor<1x2x4096x32xbf16>) -> tensor<2x4096x32xbf16>
-    %1037 = stablehlo.reshape %1035 : (tensor<1x2x32x256xbf16>) -> tensor<2x32x256xbf16>
-    %1038 = stablehlo.broadcast_in_dim %1037, dims = [0, 1, 2] : (tensor<2x32x256xbf16>) -> tensor<2x32x256xbf16>
-    %1039 = stablehlo.dot_general %1036, %1038, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4096x32xbf16>, tensor<2x32x256xbf16>) -> tensor<2x4096x256xbf16>
-    %1040 = stablehlo.reshape %1039 : (tensor<2x4096x256xbf16>) -> tensor<1x2x4096x256xbf16>
-    %1041 = stablehlo.broadcast_in_dim %1040, dims = [0, 1, 2, 3] : (tensor<1x2x4096x256xbf16>) -> tensor<1x2x4096x256xbf16>
-    %1042 = stablehlo.divide %1041, %787 : tensor<1x2x4096x256xbf16>
-    %1043 = stablehlo.convert %1042 : (tensor<1x2x4096x256xbf16>) -> tensor<1x2x4096x256xf32>
-    %1044 = stablehlo.reduce(%1043 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x2x4096x256xf32>, tensor<f32>) -> tensor<1x2x4096xf32>
-    %1045 = stablehlo.reshape %1044 : (tensor<1x2x4096xf32>) -> tensor<1x2x4096x1xf32>
-    %1046 = stablehlo.broadcast_in_dim %1043, dims = [0, 1, 2, 3] : (tensor<1x2x4096x256xf32>) -> tensor<1x2x4096x256xf32>
-    %1047 = stablehlo.broadcast_in_dim %1045, dims = [0, 1, 2, 3] : (tensor<1x2x4096x1xf32>) -> tensor<1x2x4096x256xf32>
-    %1048 = stablehlo.subtract %1046, %1047 : tensor<1x2x4096x256xf32>
-    %1049 = stablehlo.exponential %1048 : tensor<1x2x4096x256xf32>
-    %1050 = stablehlo.reduce(%1049 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x2x4096x256xf32>, tensor<f32>) -> tensor<1x2x4096xf32>
-    %1051 = stablehlo.reshape %1050 : (tensor<1x2x4096xf32>) -> tensor<1x2x4096x1xf32>
-    %1052 = stablehlo.broadcast_in_dim %1049, dims = [0, 1, 2, 3] : (tensor<1x2x4096x256xf32>) -> tensor<1x2x4096x256xf32>
-    %1053 = stablehlo.broadcast_in_dim %1051, dims = [0, 1, 2, 3] : (tensor<1x2x4096x1xf32>) -> tensor<1x2x4096x256xf32>
-    %1054 = stablehlo.divide %1052, %1053 : tensor<1x2x4096x256xf32>
-    %1055 = stablehlo.convert %1054 : (tensor<1x2x4096x256xf32>) -> tensor<1x2x4096x256xbf16>
-    %1056 = stablehlo.reshape %1055 : (tensor<1x2x4096x256xbf16>) -> tensor<2x4096x256xbf16>
-    %1057 = stablehlo.reshape %1034 : (tensor<1x2x256x32xbf16>) -> tensor<2x256x32xbf16>
-    %1058 = stablehlo.broadcast_in_dim %1057, dims = [0, 1, 2] : (tensor<2x256x32xbf16>) -> tensor<2x256x32xbf16>
-    %1059 = stablehlo.dot_general %1056, %1058, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<2x4096x256xbf16>, tensor<2x256x32xbf16>) -> tensor<2x4096x32xbf16>
-    %1060 = stablehlo.reshape %1059 : (tensor<2x4096x32xbf16>) -> tensor<1x2x4096x32xbf16>
-    %1061 = stablehlo.transpose %1060, dims = [0, 2, 1, 3] : (tensor<1x2x4096x32xbf16>) -> tensor<1x4096x2x32xbf16>
-    %1062 = stablehlo.reshape %1061 : (tensor<1x4096x2x32xbf16>) -> tensor<1x4096x64xbf16>
-    %1063 = stablehlo.reshape %1062 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %1064 = stablehlo.convert %1063 : (tensor<4096x64xbf16>) -> tensor<4096x64xf32>
-    %1065 = stablehlo.dot_general %1064, %arg151, contracting_dims = [1] x [0] : (tensor<4096x64xf32>, tensor<64x64xf32>) -> tensor<4096x64xf32>
-    %1066 = stablehlo.broadcast_in_dim %1065, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %1067 = stablehlo.multiply %1066, %700 : tensor<4096x64xf32>
-    %1068 = stablehlo.broadcast_in_dim %1067, dims = [0, 1] : (tensor<4096x64xf32>) -> tensor<4096x64xf32>
-    %1069 = stablehlo.broadcast_in_dim %arg152, dims = [1] : (tensor<64xf32>) -> tensor<4096x64xf32>
-    %1070 = stablehlo.add %1068, %1069 : tensor<4096x64xf32>
-    %1071 = stablehlo.convert %1070 : (tensor<4096x64xf32>) -> tensor<4096x64xbf16>
-    %1072 = stablehlo.reshape %1071 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %1073 = stablehlo.add %1072, %917 : tensor<1x4096x64xbf16>
-    %1074 = stablehlo.convert %1073 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xf32>
-    %1075 = stablehlo.convert %1074 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf64>
-    %1076 = stablehlo.reduce(%1075 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %1077 = stablehlo.reshape %1076 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %1078 = stablehlo.broadcast_in_dim %1077, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %1079 = stablehlo.divide %1078, %622 : tensor<1x4096x1xf64>
-    %1080 = stablehlo.broadcast_in_dim %1075, dims = [0, 1, 2] : (tensor<1x4096x64xf64>) -> tensor<1x4096x64xf64>
-    %1081 = stablehlo.broadcast_in_dim %1079, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x64xf64>
-    %1082 = stablehlo.subtract %1080, %1081 : tensor<1x4096x64xf64>
-    %1083 = stablehlo.multiply %1082, %1082 : tensor<1x4096x64xf64>
-    %1084 = stablehlo.reduce(%1083 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %1085 = stablehlo.reshape %1084 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %1086 = stablehlo.broadcast_in_dim %1085, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %1087 = stablehlo.divide %1086, %622 : tensor<1x4096x1xf64>
-    %1088 = stablehlo.convert %1087 : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf32>
-    %1089 = stablehlo.reduce(%1074 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf32>, tensor<f32>) -> tensor<1x4096xf32>
-    %1090 = stablehlo.reshape %1089 : (tensor<1x4096xf32>) -> tensor<1x4096x1xf32>
-    %1091 = stablehlo.broadcast_in_dim %1090, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %1092 = stablehlo.divide %1091, %638 : tensor<1x4096x1xf32>
-    %1093 = stablehlo.broadcast_in_dim %1088, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %1094 = stablehlo.add %1093, %641 : tensor<1x4096x1xf32>
-    %1095 = stablehlo.rsqrt %1094 : tensor<1x4096x1xf32>
-    %1096 = stablehlo.broadcast_in_dim %1074, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1097 = stablehlo.broadcast_in_dim %1092, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %1098 = stablehlo.subtract %1096, %1097 : tensor<1x4096x64xf32>
-    %1099 = stablehlo.broadcast_in_dim %1098, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1100 = stablehlo.broadcast_in_dim %1095, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %1101 = stablehlo.multiply %1099, %1100 : tensor<1x4096x64xf32>
-    %1102 = stablehlo.convert %arg50 : (tensor<64xbf16>) -> tensor<64xf32>
-    %1103 = stablehlo.broadcast_in_dim %1101, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1104 = stablehlo.broadcast_in_dim %1102, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %1105 = stablehlo.multiply %1103, %1104 : tensor<1x4096x64xf32>
-    %1106 = stablehlo.convert %arg51 : (tensor<64xbf16>) -> tensor<64xf32>
-    %1107 = stablehlo.broadcast_in_dim %1105, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1108 = stablehlo.broadcast_in_dim %1106, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %1109 = stablehlo.add %1107, %1108 : tensor<1x4096x64xf32>
-    %1110 = stablehlo.convert %1109 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xbf16>
-    %1111 = stablehlo.reshape %1110 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %1112 = stablehlo.convert %1111 : (tensor<4096x64xbf16>) -> tensor<4096x64xf32>
-    %1113 = stablehlo.dot_general %1112, %arg153, contracting_dims = [1] x [0] : (tensor<4096x64xf32>, tensor<64x256xf32>) -> tensor<4096x256xf32>
-    %1114 = stablehlo.broadcast_in_dim %1113, dims = [0, 1] : (tensor<4096x256xf32>) -> tensor<4096x256xf32>
-    %1115 = stablehlo.multiply %1114, %861 : tensor<4096x256xf32>
-    %1116 = stablehlo.broadcast_in_dim %1115, dims = [0, 1] : (tensor<4096x256xf32>) -> tensor<4096x256xf32>
-    %1117 = stablehlo.broadcast_in_dim %arg154, dims = [1] : (tensor<256xf32>) -> tensor<4096x256xf32>
-    %1118 = stablehlo.add %1116, %1117 : tensor<4096x256xf32>
-    %1119 = stablehlo.convert %1118 : (tensor<4096x256xf32>) -> tensor<4096x256xbf16>
-    %1120 = stablehlo.reshape %1119 : (tensor<4096x256xbf16>) -> tensor<1x4096x256xbf16>
-    %1121 = stablehlo.transpose %1120, dims = [0, 2, 1] : (tensor<1x4096x256xbf16>) -> tensor<1x256x4096xbf16>
-    %1122 = stablehlo.reshape %1121 : (tensor<1x256x4096xbf16>) -> tensor<1x256x64x64xbf16>
-    %1123 = stablehlo.convolution(%1122, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 256 : i64} : (tensor<1x256x64x64xbf16>, tensor<256x1x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %1124 = stablehlo.reshape %arg53 : (tensor<256xbf16>) -> tensor<256x1x1xbf16>
-    %1125 = stablehlo.broadcast_in_dim %1123, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %1126 = stablehlo.broadcast_in_dim %1124, dims = [1, 2, 3] : (tensor<256x1x1xbf16>) -> tensor<1x256x64x64xbf16>
-    %1127 = stablehlo.add %1125, %1126 : tensor<1x256x64x64xbf16>
-    %1128 = stablehlo.reshape %1127 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x4096xbf16>
-    %1129 = stablehlo.transpose %1128, dims = [0, 2, 1] : (tensor<1x256x4096xbf16>) -> tensor<1x4096x256xbf16>
-    %1130 = stablehlo.multiply %1129, %cst_23 : tensor<1x4096x256xbf16>
-    %1131 = stablehlo.multiply %1129, %878 : tensor<1x4096x256xbf16>
-    %1132 = stablehlo.convert %1131 : (tensor<1x4096x256xbf16>) -> tensor<1x4096x256xf32>
-    %1133 = stablehlo.clamp %cst_24, %1132, %cst_25 : tensor<1x4096x256xf32>
-    %1134 = stablehlo.multiply %1133, %1133 : tensor<1x4096x256xf32>
-    %1135 = stablehlo.multiply %cst_26, %1134 : tensor<1x4096x256xf32>
-    %1136 = stablehlo.add %1135, %cst_27 : tensor<1x4096x256xf32>
-    %1137 = stablehlo.multiply %1136, %1134 : tensor<1x4096x256xf32>
-    %1138 = stablehlo.add %1137, %cst_28 : tensor<1x4096x256xf32>
-    %1139 = stablehlo.multiply %1138, %1134 : tensor<1x4096x256xf32>
-    %1140 = stablehlo.add %1139, %cst_29 : tensor<1x4096x256xf32>
-    %1141 = stablehlo.multiply %1140, %1134 : tensor<1x4096x256xf32>
-    %1142 = stablehlo.add %1141, %cst_30 : tensor<1x4096x256xf32>
-    %1143 = stablehlo.multiply %1142, %1134 : tensor<1x4096x256xf32>
-    %1144 = stablehlo.add %1143, %cst_31 : tensor<1x4096x256xf32>
-    %1145 = stablehlo.multiply %1144, %1134 : tensor<1x4096x256xf32>
-    %1146 = stablehlo.add %1145, %cst_32 : tensor<1x4096x256xf32>
-    %1147 = stablehlo.multiply %cst_33, %1134 : tensor<1x4096x256xf32>
-    %1148 = stablehlo.add %1147, %cst_34 : tensor<1x4096x256xf32>
-    %1149 = stablehlo.multiply %1148, %1134 : tensor<1x4096x256xf32>
-    %1150 = stablehlo.add %1149, %cst_35 : tensor<1x4096x256xf32>
-    %1151 = stablehlo.multiply %1150, %1134 : tensor<1x4096x256xf32>
-    %1152 = stablehlo.add %1151, %cst_36 : tensor<1x4096x256xf32>
-    %1153 = stablehlo.multiply %1152, %1134 : tensor<1x4096x256xf32>
-    %1154 = stablehlo.add %1153, %cst_37 : tensor<1x4096x256xf32>
-    %1155 = stablehlo.multiply %1133, %1146 : tensor<1x4096x256xf32>
-    %1156 = stablehlo.divide %1155, %1154 : tensor<1x4096x256xf32>
-    %1157 = stablehlo.clamp %cst_38, %1156, %cst_39 : tensor<1x4096x256xf32>
-    %1158 = stablehlo.convert %1157 : (tensor<1x4096x256xf32>) -> tensor<1x4096x256xbf16>
-    %1159 = stablehlo.add %1158, %cst_21 : tensor<1x4096x256xbf16>
-    %1160 = stablehlo.multiply %1159, %1130 : tensor<1x4096x256xbf16>
-    %1161 = stablehlo.reshape %1160 : (tensor<1x4096x256xbf16>) -> tensor<4096x256xbf16>
-    %1162 = stablehlo.dot_general %1161, %arg155, contracting_dims = [1] x [0] : (tensor<4096x256xbf16>, tensor<256x64xbf16>) -> tensor<4096x64xbf16>
-    %1163 = stablehlo.reshape %1162 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %1164 = stablehlo.broadcast_in_dim %1163, dims = [0, 1, 2] : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %1165 = stablehlo.broadcast_in_dim %arg54, dims = [2] : (tensor<64xbf16>) -> tensor<1x4096x64xbf16>
-    %1166 = stablehlo.add %1164, %1165 : tensor<1x4096x64xbf16>
-    %1167 = stablehlo.reshape %1166 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %1168 = stablehlo.reshape %1167 : (tensor<4096x64xbf16>) -> tensor<1x4096x64xbf16>
-    %1169 = stablehlo.add %1168, %1073 : tensor<1x4096x64xbf16>
-    %1170 = stablehlo.convert %1169 : (tensor<1x4096x64xbf16>) -> tensor<1x4096x64xf32>
-    %1171 = stablehlo.convert %1170 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf64>
-    %1172 = stablehlo.reduce(%1171 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %1173 = stablehlo.reshape %1172 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %1174 = stablehlo.broadcast_in_dim %1173, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %1175 = stablehlo.divide %1174, %622 : tensor<1x4096x1xf64>
-    %1176 = stablehlo.broadcast_in_dim %1171, dims = [0, 1, 2] : (tensor<1x4096x64xf64>) -> tensor<1x4096x64xf64>
-    %1177 = stablehlo.broadcast_in_dim %1175, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x64xf64>
-    %1178 = stablehlo.subtract %1176, %1177 : tensor<1x4096x64xf64>
-    %1179 = stablehlo.multiply %1178, %1178 : tensor<1x4096x64xf64>
-    %1180 = stablehlo.reduce(%1179 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf64>, tensor<f64>) -> tensor<1x4096xf64>
-    %1181 = stablehlo.reshape %1180 : (tensor<1x4096xf64>) -> tensor<1x4096x1xf64>
-    %1182 = stablehlo.broadcast_in_dim %1181, dims = [0, 1, 2] : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf64>
-    %1183 = stablehlo.divide %1182, %622 : tensor<1x4096x1xf64>
-    %1184 = stablehlo.convert %1183 : (tensor<1x4096x1xf64>) -> tensor<1x4096x1xf32>
-    %1185 = stablehlo.reduce(%1170 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x4096x64xf32>, tensor<f32>) -> tensor<1x4096xf32>
-    %1186 = stablehlo.reshape %1185 : (tensor<1x4096xf32>) -> tensor<1x4096x1xf32>
-    %1187 = stablehlo.broadcast_in_dim %1186, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %1188 = stablehlo.divide %1187, %638 : tensor<1x4096x1xf32>
-    %1189 = stablehlo.broadcast_in_dim %1184, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x1xf32>
-    %1190 = stablehlo.add %1189, %641 : tensor<1x4096x1xf32>
-    %1191 = stablehlo.rsqrt %1190 : tensor<1x4096x1xf32>
-    %1192 = stablehlo.broadcast_in_dim %1170, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1193 = stablehlo.broadcast_in_dim %1188, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %1194 = stablehlo.subtract %1192, %1193 : tensor<1x4096x64xf32>
-    %1195 = stablehlo.broadcast_in_dim %1194, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1196 = stablehlo.broadcast_in_dim %1191, dims = [0, 1, 2] : (tensor<1x4096x1xf32>) -> tensor<1x4096x64xf32>
-    %1197 = stablehlo.multiply %1195, %1196 : tensor<1x4096x64xf32>
-    %1198 = stablehlo.convert %arg55 : (tensor<64xbf16>) -> tensor<64xf32>
-    %1199 = stablehlo.broadcast_in_dim %1197, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1200 = stablehlo.broadcast_in_dim %1198, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %1201 = stablehlo.multiply %1199, %1200 : tensor<1x4096x64xf32>
-    %1202 = stablehlo.convert %arg56 : (tensor<64xbf16>) -> tensor<64xf32>
-    %1203 = stablehlo.broadcast_in_dim %1201, dims = [0, 1, 2] : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xf32>
-    %1204 = stablehlo.broadcast_in_dim %1202, dims = [2] : (tensor<64xf32>) -> tensor<1x4096x64xf32>
-    %1205 = stablehlo.add %1203, %1204 : tensor<1x4096x64xf32>
-    %1206 = stablehlo.convert %1205 : (tensor<1x4096x64xf32>) -> tensor<1x4096x64xbf16>
-    %1207 = stablehlo.reshape %1206 : (tensor<1x4096x64xbf16>) -> tensor<1x64x64x64xbf16>
-    %1208 = stablehlo.transpose %1207, dims = [0, 3, 1, 2] : (tensor<1x64x64x64xbf16>) -> tensor<1x64x64x64xbf16>
-    %1209 = stablehlo.convolution(%1208, %arg57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x64x64xbf16>, tensor<160x64x3x3xbf16>) -> tensor<1x160x32x32xbf16>
-    %1210 = stablehlo.reshape %arg58 : (tensor<160xbf16>) -> tensor<160x1x1xbf16>
-    %1211 = stablehlo.broadcast_in_dim %1209, dims = [0, 1, 2, 3] : (tensor<1x160x32x32xbf16>) -> tensor<1x160x32x32xbf16>
-    %1212 = stablehlo.broadcast_in_dim %1210, dims = [1, 2, 3] : (tensor<160x1x1xbf16>) -> tensor<1x160x32x32xbf16>
-    %1213 = stablehlo.add %1211, %1212 : tensor<1x160x32x32xbf16>
-    %1214 = stablehlo.reshape %1213 : (tensor<1x160x32x32xbf16>) -> tensor<1x160x1024xbf16>
-    %1215 = stablehlo.transpose %1214, dims = [0, 2, 1] : (tensor<1x160x1024xbf16>) -> tensor<1x1024x160xbf16>
-    %1216 = stablehlo.convert %1215 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xf32>
-    %1217 = stablehlo.convert %1216 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf64>
-    %1218 = stablehlo.reduce(%1217 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1219 = stablehlo.reshape %1218 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1220 = stablehlo.convert %cst_84 : (tensor<1xi64>) -> tensor<1xf64>
-    %1221 = stablehlo.reshape %1220 : (tensor<1xf64>) -> tensor<f64>
-    %1222 = stablehlo.broadcast_in_dim %1219, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1223 = stablehlo.broadcast_in_dim %1221, dims = [] : (tensor<f64>) -> tensor<1x1024x1xf64>
-    %1224 = stablehlo.divide %1222, %1223 : tensor<1x1024x1xf64>
-    %1225 = stablehlo.broadcast_in_dim %1217, dims = [0, 1, 2] : (tensor<1x1024x160xf64>) -> tensor<1x1024x160xf64>
-    %1226 = stablehlo.broadcast_in_dim %1224, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x160xf64>
-    %1227 = stablehlo.subtract %1225, %1226 : tensor<1x1024x160xf64>
-    %1228 = stablehlo.multiply %1227, %1227 : tensor<1x1024x160xf64>
-    %1229 = stablehlo.reduce(%1228 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1230 = stablehlo.reshape %1229 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1231 = stablehlo.broadcast_in_dim %1230, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1232 = stablehlo.divide %1231, %1223 : tensor<1x1024x1xf64>
-    %1233 = stablehlo.convert %1232 : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf32>
-    %1234 = stablehlo.reduce(%1216 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf32>, tensor<f32>) -> tensor<1x1024xf32>
-    %1235 = stablehlo.reshape %1234 : (tensor<1x1024xf32>) -> tensor<1x1024x1xf32>
-    %1236 = stablehlo.convert %cst_84 : (tensor<1xi64>) -> tensor<1xf32>
-    %1237 = stablehlo.reshape %1236 : (tensor<1xf32>) -> tensor<f32>
-    %1238 = stablehlo.broadcast_in_dim %1235, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1239 = stablehlo.broadcast_in_dim %1237, dims = [] : (tensor<f32>) -> tensor<1x1024x1xf32>
-    %1240 = stablehlo.divide %1238, %1239 : tensor<1x1024x1xf32>
-    %1241 = stablehlo.broadcast_in_dim %1233, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1242 = stablehlo.broadcast_in_dim %33, dims = [] : (tensor<f32>) -> tensor<1x1024x1xf32>
-    %1243 = stablehlo.add %1241, %1242 : tensor<1x1024x1xf32>
-    %1244 = stablehlo.rsqrt %1243 : tensor<1x1024x1xf32>
-    %1245 = stablehlo.broadcast_in_dim %1216, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1246 = stablehlo.broadcast_in_dim %1240, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1247 = stablehlo.subtract %1245, %1246 : tensor<1x1024x160xf32>
-    %1248 = stablehlo.broadcast_in_dim %1247, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1249 = stablehlo.broadcast_in_dim %1244, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1250 = stablehlo.multiply %1248, %1249 : tensor<1x1024x160xf32>
-    %1251 = stablehlo.convert %arg59 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1252 = stablehlo.broadcast_in_dim %1250, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1253 = stablehlo.broadcast_in_dim %1251, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1254 = stablehlo.multiply %1252, %1253 : tensor<1x1024x160xf32>
-    %1255 = stablehlo.convert %arg60 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1256 = stablehlo.broadcast_in_dim %1254, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1257 = stablehlo.broadcast_in_dim %1255, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1258 = stablehlo.add %1256, %1257 : tensor<1x1024x160xf32>
-    %1259 = stablehlo.convert %1258 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xbf16>
-    %1260 = stablehlo.convert %1259 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xf32>
-    %1261 = stablehlo.convert %1260 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf64>
-    %1262 = stablehlo.reduce(%1261 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1263 = stablehlo.reshape %1262 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1264 = stablehlo.broadcast_in_dim %1263, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1265 = stablehlo.divide %1264, %1223 : tensor<1x1024x1xf64>
-    %1266 = stablehlo.broadcast_in_dim %1261, dims = [0, 1, 2] : (tensor<1x1024x160xf64>) -> tensor<1x1024x160xf64>
-    %1267 = stablehlo.broadcast_in_dim %1265, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x160xf64>
-    %1268 = stablehlo.subtract %1266, %1267 : tensor<1x1024x160xf64>
-    %1269 = stablehlo.multiply %1268, %1268 : tensor<1x1024x160xf64>
-    %1270 = stablehlo.reduce(%1269 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1271 = stablehlo.reshape %1270 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1272 = stablehlo.broadcast_in_dim %1271, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1273 = stablehlo.divide %1272, %1223 : tensor<1x1024x1xf64>
-    %1274 = stablehlo.convert %1273 : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf32>
-    %1275 = stablehlo.reduce(%1260 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf32>, tensor<f32>) -> tensor<1x1024xf32>
-    %1276 = stablehlo.reshape %1275 : (tensor<1x1024xf32>) -> tensor<1x1024x1xf32>
-    %1277 = stablehlo.broadcast_in_dim %1276, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1278 = stablehlo.divide %1277, %1239 : tensor<1x1024x1xf32>
-    %1279 = stablehlo.broadcast_in_dim %1274, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1280 = stablehlo.add %1279, %1242 : tensor<1x1024x1xf32>
-    %1281 = stablehlo.rsqrt %1280 : tensor<1x1024x1xf32>
-    %1282 = stablehlo.broadcast_in_dim %1260, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1283 = stablehlo.broadcast_in_dim %1278, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1284 = stablehlo.subtract %1282, %1283 : tensor<1x1024x160xf32>
-    %1285 = stablehlo.broadcast_in_dim %1284, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1286 = stablehlo.broadcast_in_dim %1281, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1287 = stablehlo.multiply %1285, %1286 : tensor<1x1024x160xf32>
-    %1288 = stablehlo.convert %arg61 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1289 = stablehlo.broadcast_in_dim %1287, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1290 = stablehlo.broadcast_in_dim %1288, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1291 = stablehlo.multiply %1289, %1290 : tensor<1x1024x160xf32>
-    %1292 = stablehlo.convert %arg62 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1293 = stablehlo.broadcast_in_dim %1291, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1294 = stablehlo.broadcast_in_dim %1292, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1295 = stablehlo.add %1293, %1294 : tensor<1x1024x160xf32>
-    %1296 = stablehlo.convert %1295 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xbf16>
-    %1297 = stablehlo.reshape %1296 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1298 = stablehlo.convert %1297 : (tensor<1024x160xbf16>) -> tensor<1024x160xf32>
-    %1299 = stablehlo.dot_general %1298, %arg156, contracting_dims = [1] x [0] : (tensor<1024x160xf32>, tensor<160x160xf32>) -> tensor<1024x160xf32>
-    %1300 = stablehlo.broadcast_in_dim %1299, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1301 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<1024x160xf32>
-    %1302 = stablehlo.multiply %1300, %1301 : tensor<1024x160xf32>
-    %1303 = stablehlo.broadcast_in_dim %1302, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1304 = stablehlo.broadcast_in_dim %arg157, dims = [1] : (tensor<160xf32>) -> tensor<1024x160xf32>
-    %1305 = stablehlo.add %1303, %1304 : tensor<1024x160xf32>
-    %1306 = stablehlo.convert %1305 : (tensor<1024x160xf32>) -> tensor<1024x160xbf16>
-    %1307 = stablehlo.reshape %1306 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1308 = stablehlo.reshape %1307 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x5x32xbf16>
-    %1309 = stablehlo.transpose %1308, dims = [0, 2, 1, 3] : (tensor<1x1024x5x32xbf16>) -> tensor<1x5x1024x32xbf16>
-    %1310 = stablehlo.transpose %1296, dims = [0, 2, 1] : (tensor<1x1024x160xbf16>) -> tensor<1x160x1024xbf16>
-    %1311 = stablehlo.reshape %1310 : (tensor<1x160x1024xbf16>) -> tensor<1x160x32x32xbf16>
-    %1312 = stablehlo.convolution(%1311, %arg63) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x160x32x32xbf16>, tensor<160x160x2x2xbf16>) -> tensor<1x160x16x16xbf16>
-    %1313 = stablehlo.reshape %arg64 : (tensor<160xbf16>) -> tensor<160x1x1xbf16>
-    %1314 = stablehlo.broadcast_in_dim %1312, dims = [0, 1, 2, 3] : (tensor<1x160x16x16xbf16>) -> tensor<1x160x16x16xbf16>
-    %1315 = stablehlo.broadcast_in_dim %1313, dims = [1, 2, 3] : (tensor<160x1x1xbf16>) -> tensor<1x160x16x16xbf16>
-    %1316 = stablehlo.add %1314, %1315 : tensor<1x160x16x16xbf16>
-    %1317 = stablehlo.reshape %1316 : (tensor<1x160x16x16xbf16>) -> tensor<1x160x256xbf16>
-    %1318 = stablehlo.transpose %1317, dims = [0, 2, 1] : (tensor<1x160x256xbf16>) -> tensor<1x256x160xbf16>
-    %1319 = stablehlo.convert %1318 : (tensor<1x256x160xbf16>) -> tensor<1x256x160xf32>
-    %1320 = stablehlo.convert %1319 : (tensor<1x256x160xf32>) -> tensor<1x256x160xf64>
-    %1321 = stablehlo.reduce(%1320 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x160xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1322 = stablehlo.reshape %1321 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1323 = stablehlo.broadcast_in_dim %1322, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1324 = stablehlo.broadcast_in_dim %1221, dims = [] : (tensor<f64>) -> tensor<1x256x1xf64>
-    %1325 = stablehlo.divide %1323, %1324 : tensor<1x256x1xf64>
-    %1326 = stablehlo.broadcast_in_dim %1320, dims = [0, 1, 2] : (tensor<1x256x160xf64>) -> tensor<1x256x160xf64>
-    %1327 = stablehlo.broadcast_in_dim %1325, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x160xf64>
-    %1328 = stablehlo.subtract %1326, %1327 : tensor<1x256x160xf64>
-    %1329 = stablehlo.multiply %1328, %1328 : tensor<1x256x160xf64>
-    %1330 = stablehlo.reduce(%1329 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x160xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1331 = stablehlo.reshape %1330 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1332 = stablehlo.broadcast_in_dim %1331, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1333 = stablehlo.divide %1332, %1324 : tensor<1x256x1xf64>
-    %1334 = stablehlo.convert %1333 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1335 = stablehlo.reduce(%1319 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x160xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1336 = stablehlo.reshape %1335 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1337 = stablehlo.broadcast_in_dim %1336, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1338 = stablehlo.broadcast_in_dim %1237, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %1339 = stablehlo.divide %1337, %1338 : tensor<1x256x1xf32>
-    %1340 = stablehlo.broadcast_in_dim %1334, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1341 = stablehlo.add %1340, %136 : tensor<1x256x1xf32>
-    %1342 = stablehlo.rsqrt %1341 : tensor<1x256x1xf32>
-    %1343 = stablehlo.broadcast_in_dim %1319, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1344 = stablehlo.broadcast_in_dim %1339, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x160xf32>
-    %1345 = stablehlo.subtract %1343, %1344 : tensor<1x256x160xf32>
-    %1346 = stablehlo.broadcast_in_dim %1345, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1347 = stablehlo.broadcast_in_dim %1342, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x160xf32>
-    %1348 = stablehlo.multiply %1346, %1347 : tensor<1x256x160xf32>
-    %1349 = stablehlo.convert %arg65 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1350 = stablehlo.broadcast_in_dim %1348, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1351 = stablehlo.broadcast_in_dim %1349, dims = [2] : (tensor<160xf32>) -> tensor<1x256x160xf32>
-    %1352 = stablehlo.multiply %1350, %1351 : tensor<1x256x160xf32>
-    %1353 = stablehlo.convert %arg66 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1354 = stablehlo.broadcast_in_dim %1352, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1355 = stablehlo.broadcast_in_dim %1353, dims = [2] : (tensor<160xf32>) -> tensor<1x256x160xf32>
-    %1356 = stablehlo.add %1354, %1355 : tensor<1x256x160xf32>
-    %1357 = stablehlo.convert %1356 : (tensor<1x256x160xf32>) -> tensor<1x256x160xbf16>
-    %1358 = stablehlo.reshape %1357 : (tensor<1x256x160xbf16>) -> tensor<256x160xbf16>
-    %1359 = stablehlo.convert %1358 : (tensor<256x160xbf16>) -> tensor<256x160xf32>
-    %1360 = stablehlo.dot_general %1359, %arg158, contracting_dims = [1] x [0] : (tensor<256x160xf32>, tensor<160x160xf32>) -> tensor<256x160xf32>
-    %1361 = stablehlo.broadcast_in_dim %1360, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1362 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<256x160xf32>
-    %1363 = stablehlo.multiply %1361, %1362 : tensor<256x160xf32>
-    %1364 = stablehlo.broadcast_in_dim %1363, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1365 = stablehlo.broadcast_in_dim %arg159, dims = [1] : (tensor<160xf32>) -> tensor<256x160xf32>
-    %1366 = stablehlo.add %1364, %1365 : tensor<256x160xf32>
-    %1367 = stablehlo.convert %1366 : (tensor<256x160xf32>) -> tensor<256x160xbf16>
-    %1368 = stablehlo.reshape %1367 : (tensor<256x160xbf16>) -> tensor<1x256x160xbf16>
-    %1369 = stablehlo.reshape %1368 : (tensor<1x256x160xbf16>) -> tensor<1x256x5x32xbf16>
-    %1370 = stablehlo.transpose %1369, dims = [0, 2, 1, 3] : (tensor<1x256x5x32xbf16>) -> tensor<1x5x256x32xbf16>
-    %1371 = stablehlo.dot_general %1359, %arg160, contracting_dims = [1] x [0] : (tensor<256x160xf32>, tensor<160x160xf32>) -> tensor<256x160xf32>
-    %1372 = stablehlo.broadcast_in_dim %1371, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1373 = stablehlo.multiply %1372, %1362 : tensor<256x160xf32>
-    %1374 = stablehlo.broadcast_in_dim %1373, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1375 = stablehlo.broadcast_in_dim %arg161, dims = [1] : (tensor<160xf32>) -> tensor<256x160xf32>
-    %1376 = stablehlo.add %1374, %1375 : tensor<256x160xf32>
-    %1377 = stablehlo.convert %1376 : (tensor<256x160xf32>) -> tensor<256x160xbf16>
-    %1378 = stablehlo.reshape %1377 : (tensor<256x160xbf16>) -> tensor<1x256x160xbf16>
-    %1379 = stablehlo.reshape %1378 : (tensor<1x256x160xbf16>) -> tensor<1x256x5x32xbf16>
-    %1380 = stablehlo.transpose %1379, dims = [0, 2, 1, 3] : (tensor<1x256x5x32xbf16>) -> tensor<1x5x256x32xbf16>
-    %1381 = stablehlo.transpose %1370, dims = [0, 1, 3, 2] : (tensor<1x5x256x32xbf16>) -> tensor<1x5x32x256xbf16>
-    %1382 = stablehlo.reshape %1309 : (tensor<1x5x1024x32xbf16>) -> tensor<5x1024x32xbf16>
-    %1383 = stablehlo.reshape %1381 : (tensor<1x5x32x256xbf16>) -> tensor<5x32x256xbf16>
-    %1384 = stablehlo.broadcast_in_dim %1383, dims = [0, 1, 2] : (tensor<5x32x256xbf16>) -> tensor<5x32x256xbf16>
-    %1385 = stablehlo.dot_general %1382, %1384, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1024x32xbf16>, tensor<5x32x256xbf16>) -> tensor<5x1024x256xbf16>
-    %1386 = stablehlo.reshape %1385 : (tensor<5x1024x256xbf16>) -> tensor<1x5x1024x256xbf16>
-    %1387 = stablehlo.broadcast_in_dim %1386, dims = [0, 1, 2, 3] : (tensor<1x5x1024x256xbf16>) -> tensor<1x5x1024x256xbf16>
-    %1388 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x5x1024x256xbf16>
-    %1389 = stablehlo.divide %1387, %1388 : tensor<1x5x1024x256xbf16>
-    %1390 = stablehlo.convert %1389 : (tensor<1x5x1024x256xbf16>) -> tensor<1x5x1024x256xf32>
-    %1391 = stablehlo.reduce(%1390 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1024x256xf32>, tensor<f32>) -> tensor<1x5x1024xf32>
-    %1392 = stablehlo.reshape %1391 : (tensor<1x5x1024xf32>) -> tensor<1x5x1024x1xf32>
-    %1393 = stablehlo.broadcast_in_dim %1390, dims = [0, 1, 2, 3] : (tensor<1x5x1024x256xf32>) -> tensor<1x5x1024x256xf32>
-    %1394 = stablehlo.broadcast_in_dim %1392, dims = [0, 1, 2, 3] : (tensor<1x5x1024x1xf32>) -> tensor<1x5x1024x256xf32>
-    %1395 = stablehlo.subtract %1393, %1394 : tensor<1x5x1024x256xf32>
-    %1396 = stablehlo.exponential %1395 : tensor<1x5x1024x256xf32>
-    %1397 = stablehlo.reduce(%1396 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1024x256xf32>, tensor<f32>) -> tensor<1x5x1024xf32>
-    %1398 = stablehlo.reshape %1397 : (tensor<1x5x1024xf32>) -> tensor<1x5x1024x1xf32>
-    %1399 = stablehlo.broadcast_in_dim %1396, dims = [0, 1, 2, 3] : (tensor<1x5x1024x256xf32>) -> tensor<1x5x1024x256xf32>
-    %1400 = stablehlo.broadcast_in_dim %1398, dims = [0, 1, 2, 3] : (tensor<1x5x1024x1xf32>) -> tensor<1x5x1024x256xf32>
-    %1401 = stablehlo.divide %1399, %1400 : tensor<1x5x1024x256xf32>
-    %1402 = stablehlo.convert %1401 : (tensor<1x5x1024x256xf32>) -> tensor<1x5x1024x256xbf16>
-    %1403 = stablehlo.reshape %1402 : (tensor<1x5x1024x256xbf16>) -> tensor<5x1024x256xbf16>
-    %1404 = stablehlo.reshape %1380 : (tensor<1x5x256x32xbf16>) -> tensor<5x256x32xbf16>
-    %1405 = stablehlo.broadcast_in_dim %1404, dims = [0, 1, 2] : (tensor<5x256x32xbf16>) -> tensor<5x256x32xbf16>
-    %1406 = stablehlo.dot_general %1403, %1405, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1024x256xbf16>, tensor<5x256x32xbf16>) -> tensor<5x1024x32xbf16>
-    %1407 = stablehlo.reshape %1406 : (tensor<5x1024x32xbf16>) -> tensor<1x5x1024x32xbf16>
-    %1408 = stablehlo.transpose %1407, dims = [0, 2, 1, 3] : (tensor<1x5x1024x32xbf16>) -> tensor<1x1024x5x32xbf16>
-    %1409 = stablehlo.reshape %1408 : (tensor<1x1024x5x32xbf16>) -> tensor<1x1024x160xbf16>
-    %1410 = stablehlo.reshape %1409 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1411 = stablehlo.convert %1410 : (tensor<1024x160xbf16>) -> tensor<1024x160xf32>
-    %1412 = stablehlo.dot_general %1411, %arg162, contracting_dims = [1] x [0] : (tensor<1024x160xf32>, tensor<160x160xf32>) -> tensor<1024x160xf32>
-    %1413 = stablehlo.broadcast_in_dim %1412, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1414 = stablehlo.multiply %1413, %1301 : tensor<1024x160xf32>
-    %1415 = stablehlo.broadcast_in_dim %1414, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1416 = stablehlo.broadcast_in_dim %arg163, dims = [1] : (tensor<160xf32>) -> tensor<1024x160xf32>
-    %1417 = stablehlo.add %1415, %1416 : tensor<1024x160xf32>
-    %1418 = stablehlo.convert %1417 : (tensor<1024x160xf32>) -> tensor<1024x160xbf16>
-    %1419 = stablehlo.reshape %1418 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1420 = stablehlo.add %1419, %1259 : tensor<1x1024x160xbf16>
-    %1421 = stablehlo.convert %1420 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xf32>
-    %1422 = stablehlo.convert %1421 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf64>
-    %1423 = stablehlo.reduce(%1422 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1424 = stablehlo.reshape %1423 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1425 = stablehlo.broadcast_in_dim %1424, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1426 = stablehlo.divide %1425, %1223 : tensor<1x1024x1xf64>
-    %1427 = stablehlo.broadcast_in_dim %1422, dims = [0, 1, 2] : (tensor<1x1024x160xf64>) -> tensor<1x1024x160xf64>
-    %1428 = stablehlo.broadcast_in_dim %1426, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x160xf64>
-    %1429 = stablehlo.subtract %1427, %1428 : tensor<1x1024x160xf64>
-    %1430 = stablehlo.multiply %1429, %1429 : tensor<1x1024x160xf64>
-    %1431 = stablehlo.reduce(%1430 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1432 = stablehlo.reshape %1431 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1433 = stablehlo.broadcast_in_dim %1432, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1434 = stablehlo.divide %1433, %1223 : tensor<1x1024x1xf64>
-    %1435 = stablehlo.convert %1434 : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf32>
-    %1436 = stablehlo.reduce(%1421 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf32>, tensor<f32>) -> tensor<1x1024xf32>
-    %1437 = stablehlo.reshape %1436 : (tensor<1x1024xf32>) -> tensor<1x1024x1xf32>
-    %1438 = stablehlo.broadcast_in_dim %1437, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1439 = stablehlo.divide %1438, %1239 : tensor<1x1024x1xf32>
-    %1440 = stablehlo.broadcast_in_dim %1435, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1441 = stablehlo.add %1440, %1242 : tensor<1x1024x1xf32>
-    %1442 = stablehlo.rsqrt %1441 : tensor<1x1024x1xf32>
-    %1443 = stablehlo.broadcast_in_dim %1421, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1444 = stablehlo.broadcast_in_dim %1439, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1445 = stablehlo.subtract %1443, %1444 : tensor<1x1024x160xf32>
-    %1446 = stablehlo.broadcast_in_dim %1445, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1447 = stablehlo.broadcast_in_dim %1442, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1448 = stablehlo.multiply %1446, %1447 : tensor<1x1024x160xf32>
-    %1449 = stablehlo.convert %arg67 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1450 = stablehlo.broadcast_in_dim %1448, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1451 = stablehlo.broadcast_in_dim %1449, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1452 = stablehlo.multiply %1450, %1451 : tensor<1x1024x160xf32>
-    %1453 = stablehlo.convert %arg68 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1454 = stablehlo.broadcast_in_dim %1452, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1455 = stablehlo.broadcast_in_dim %1453, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1456 = stablehlo.add %1454, %1455 : tensor<1x1024x160xf32>
-    %1457 = stablehlo.convert %1456 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xbf16>
-    %1458 = stablehlo.reshape %1457 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1459 = stablehlo.convert %1458 : (tensor<1024x160xbf16>) -> tensor<1024x160xf32>
-    %1460 = stablehlo.dot_general %1459, %arg164, contracting_dims = [1] x [0] : (tensor<1024x160xf32>, tensor<160x640xf32>) -> tensor<1024x640xf32>
-    %1461 = stablehlo.broadcast_in_dim %1460, dims = [0, 1] : (tensor<1024x640xf32>) -> tensor<1024x640xf32>
-    %1462 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<1024x640xf32>
-    %1463 = stablehlo.multiply %1461, %1462 : tensor<1024x640xf32>
-    %1464 = stablehlo.broadcast_in_dim %1463, dims = [0, 1] : (tensor<1024x640xf32>) -> tensor<1024x640xf32>
-    %1465 = stablehlo.broadcast_in_dim %arg165, dims = [1] : (tensor<640xf32>) -> tensor<1024x640xf32>
-    %1466 = stablehlo.add %1464, %1465 : tensor<1024x640xf32>
-    %1467 = stablehlo.convert %1466 : (tensor<1024x640xf32>) -> tensor<1024x640xbf16>
-    %1468 = stablehlo.reshape %1467 : (tensor<1024x640xbf16>) -> tensor<1x1024x640xbf16>
-    %1469 = stablehlo.transpose %1468, dims = [0, 2, 1] : (tensor<1x1024x640xbf16>) -> tensor<1x640x1024xbf16>
-    %1470 = stablehlo.reshape %1469 : (tensor<1x640x1024xbf16>) -> tensor<1x640x32x32xbf16>
-    %1471 = stablehlo.convolution(%1470, %arg69) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 640 : i64} : (tensor<1x640x32x32xbf16>, tensor<640x1x3x3xbf16>) -> tensor<1x640x32x32xbf16>
-    %1472 = stablehlo.reshape %arg70 : (tensor<640xbf16>) -> tensor<640x1x1xbf16>
-    %1473 = stablehlo.broadcast_in_dim %1471, dims = [0, 1, 2, 3] : (tensor<1x640x32x32xbf16>) -> tensor<1x640x32x32xbf16>
-    %1474 = stablehlo.broadcast_in_dim %1472, dims = [1, 2, 3] : (tensor<640x1x1xbf16>) -> tensor<1x640x32x32xbf16>
-    %1475 = stablehlo.add %1473, %1474 : tensor<1x640x32x32xbf16>
-    %1476 = stablehlo.reshape %1475 : (tensor<1x640x32x32xbf16>) -> tensor<1x640x1024xbf16>
-    %1477 = stablehlo.transpose %1476, dims = [0, 2, 1] : (tensor<1x640x1024xbf16>) -> tensor<1x1024x640xbf16>
-    %1478 = stablehlo.multiply %1477, %cst_42 : tensor<1x1024x640xbf16>
-    %1479 = stablehlo.rsqrt %cst_41 : tensor<1x1024x640xbf16>
-    %1480 = stablehlo.multiply %1477, %1479 : tensor<1x1024x640xbf16>
-    %1481 = stablehlo.convert %1480 : (tensor<1x1024x640xbf16>) -> tensor<1x1024x640xf32>
-    %1482 = stablehlo.clamp %cst_43, %1481, %cst_44 : tensor<1x1024x640xf32>
-    %1483 = stablehlo.multiply %1482, %1482 : tensor<1x1024x640xf32>
-    %1484 = stablehlo.multiply %cst_45, %1483 : tensor<1x1024x640xf32>
-    %1485 = stablehlo.add %1484, %cst_46 : tensor<1x1024x640xf32>
-    %1486 = stablehlo.multiply %1485, %1483 : tensor<1x1024x640xf32>
-    %1487 = stablehlo.add %1486, %cst_47 : tensor<1x1024x640xf32>
-    %1488 = stablehlo.multiply %1487, %1483 : tensor<1x1024x640xf32>
-    %1489 = stablehlo.add %1488, %cst_48 : tensor<1x1024x640xf32>
-    %1490 = stablehlo.multiply %1489, %1483 : tensor<1x1024x640xf32>
-    %1491 = stablehlo.add %1490, %cst_49 : tensor<1x1024x640xf32>
-    %1492 = stablehlo.multiply %1491, %1483 : tensor<1x1024x640xf32>
-    %1493 = stablehlo.add %1492, %cst_50 : tensor<1x1024x640xf32>
-    %1494 = stablehlo.multiply %1493, %1483 : tensor<1x1024x640xf32>
-    %1495 = stablehlo.add %1494, %cst_51 : tensor<1x1024x640xf32>
-    %1496 = stablehlo.multiply %cst_52, %1483 : tensor<1x1024x640xf32>
-    %1497 = stablehlo.add %1496, %cst_53 : tensor<1x1024x640xf32>
-    %1498 = stablehlo.multiply %1497, %1483 : tensor<1x1024x640xf32>
-    %1499 = stablehlo.add %1498, %cst_54 : tensor<1x1024x640xf32>
-    %1500 = stablehlo.multiply %1499, %1483 : tensor<1x1024x640xf32>
-    %1501 = stablehlo.add %1500, %cst_55 : tensor<1x1024x640xf32>
-    %1502 = stablehlo.multiply %1501, %1483 : tensor<1x1024x640xf32>
-    %1503 = stablehlo.add %1502, %cst_56 : tensor<1x1024x640xf32>
-    %1504 = stablehlo.multiply %1482, %1495 : tensor<1x1024x640xf32>
-    %1505 = stablehlo.divide %1504, %1503 : tensor<1x1024x640xf32>
-    %1506 = stablehlo.clamp %cst_57, %1505, %cst_58 : tensor<1x1024x640xf32>
-    %1507 = stablehlo.convert %1506 : (tensor<1x1024x640xf32>) -> tensor<1x1024x640xbf16>
-    %1508 = stablehlo.add %1507, %cst_40 : tensor<1x1024x640xbf16>
-    %1509 = stablehlo.multiply %1508, %1478 : tensor<1x1024x640xbf16>
-    %1510 = stablehlo.reshape %1509 : (tensor<1x1024x640xbf16>) -> tensor<1024x640xbf16>
-    %1511 = stablehlo.dot_general %1510, %arg166, contracting_dims = [1] x [0] : (tensor<1024x640xbf16>, tensor<640x160xbf16>) -> tensor<1024x160xbf16>
-    %1512 = stablehlo.reshape %1511 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1513 = stablehlo.broadcast_in_dim %1512, dims = [0, 1, 2] : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1514 = stablehlo.broadcast_in_dim %arg71, dims = [2] : (tensor<160xbf16>) -> tensor<1x1024x160xbf16>
-    %1515 = stablehlo.add %1513, %1514 : tensor<1x1024x160xbf16>
-    %1516 = stablehlo.reshape %1515 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1517 = stablehlo.reshape %1516 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1518 = stablehlo.add %1517, %1420 : tensor<1x1024x160xbf16>
-    %1519 = stablehlo.convert %1518 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xf32>
-    %1520 = stablehlo.convert %1519 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf64>
-    %1521 = stablehlo.reduce(%1520 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1522 = stablehlo.reshape %1521 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1523 = stablehlo.broadcast_in_dim %1522, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1524 = stablehlo.divide %1523, %1223 : tensor<1x1024x1xf64>
-    %1525 = stablehlo.broadcast_in_dim %1520, dims = [0, 1, 2] : (tensor<1x1024x160xf64>) -> tensor<1x1024x160xf64>
-    %1526 = stablehlo.broadcast_in_dim %1524, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x160xf64>
-    %1527 = stablehlo.subtract %1525, %1526 : tensor<1x1024x160xf64>
-    %1528 = stablehlo.multiply %1527, %1527 : tensor<1x1024x160xf64>
-    %1529 = stablehlo.reduce(%1528 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1530 = stablehlo.reshape %1529 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1531 = stablehlo.broadcast_in_dim %1530, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1532 = stablehlo.divide %1531, %1223 : tensor<1x1024x1xf64>
-    %1533 = stablehlo.convert %1532 : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf32>
-    %1534 = stablehlo.reduce(%1519 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf32>, tensor<f32>) -> tensor<1x1024xf32>
-    %1535 = stablehlo.reshape %1534 : (tensor<1x1024xf32>) -> tensor<1x1024x1xf32>
-    %1536 = stablehlo.broadcast_in_dim %1535, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1537 = stablehlo.divide %1536, %1239 : tensor<1x1024x1xf32>
-    %1538 = stablehlo.broadcast_in_dim %1533, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1539 = stablehlo.add %1538, %1242 : tensor<1x1024x1xf32>
-    %1540 = stablehlo.rsqrt %1539 : tensor<1x1024x1xf32>
-    %1541 = stablehlo.broadcast_in_dim %1519, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1542 = stablehlo.broadcast_in_dim %1537, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1543 = stablehlo.subtract %1541, %1542 : tensor<1x1024x160xf32>
-    %1544 = stablehlo.broadcast_in_dim %1543, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1545 = stablehlo.broadcast_in_dim %1540, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1546 = stablehlo.multiply %1544, %1545 : tensor<1x1024x160xf32>
-    %1547 = stablehlo.convert %arg72 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1548 = stablehlo.broadcast_in_dim %1546, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1549 = stablehlo.broadcast_in_dim %1547, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1550 = stablehlo.multiply %1548, %1549 : tensor<1x1024x160xf32>
-    %1551 = stablehlo.convert %arg73 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1552 = stablehlo.broadcast_in_dim %1550, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1553 = stablehlo.broadcast_in_dim %1551, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1554 = stablehlo.add %1552, %1553 : tensor<1x1024x160xf32>
-    %1555 = stablehlo.convert %1554 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xbf16>
-    %1556 = stablehlo.reshape %1555 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1557 = stablehlo.convert %1556 : (tensor<1024x160xbf16>) -> tensor<1024x160xf32>
-    %1558 = stablehlo.dot_general %1557, %arg167, contracting_dims = [1] x [0] : (tensor<1024x160xf32>, tensor<160x160xf32>) -> tensor<1024x160xf32>
-    %1559 = stablehlo.broadcast_in_dim %1558, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1560 = stablehlo.multiply %1559, %1301 : tensor<1024x160xf32>
-    %1561 = stablehlo.broadcast_in_dim %1560, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1562 = stablehlo.broadcast_in_dim %arg168, dims = [1] : (tensor<160xf32>) -> tensor<1024x160xf32>
-    %1563 = stablehlo.add %1561, %1562 : tensor<1024x160xf32>
-    %1564 = stablehlo.convert %1563 : (tensor<1024x160xf32>) -> tensor<1024x160xbf16>
-    %1565 = stablehlo.reshape %1564 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1566 = stablehlo.reshape %1565 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x5x32xbf16>
-    %1567 = stablehlo.transpose %1566, dims = [0, 2, 1, 3] : (tensor<1x1024x5x32xbf16>) -> tensor<1x5x1024x32xbf16>
-    %1568 = stablehlo.transpose %1555, dims = [0, 2, 1] : (tensor<1x1024x160xbf16>) -> tensor<1x160x1024xbf16>
-    %1569 = stablehlo.reshape %1568 : (tensor<1x160x1024xbf16>) -> tensor<1x160x32x32xbf16>
-    %1570 = stablehlo.convolution(%1569, %arg74) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x160x32x32xbf16>, tensor<160x160x2x2xbf16>) -> tensor<1x160x16x16xbf16>
-    %1571 = stablehlo.reshape %arg75 : (tensor<160xbf16>) -> tensor<160x1x1xbf16>
-    %1572 = stablehlo.broadcast_in_dim %1570, dims = [0, 1, 2, 3] : (tensor<1x160x16x16xbf16>) -> tensor<1x160x16x16xbf16>
-    %1573 = stablehlo.broadcast_in_dim %1571, dims = [1, 2, 3] : (tensor<160x1x1xbf16>) -> tensor<1x160x16x16xbf16>
-    %1574 = stablehlo.add %1572, %1573 : tensor<1x160x16x16xbf16>
-    %1575 = stablehlo.reshape %1574 : (tensor<1x160x16x16xbf16>) -> tensor<1x160x256xbf16>
-    %1576 = stablehlo.transpose %1575, dims = [0, 2, 1] : (tensor<1x160x256xbf16>) -> tensor<1x256x160xbf16>
-    %1577 = stablehlo.convert %1576 : (tensor<1x256x160xbf16>) -> tensor<1x256x160xf32>
-    %1578 = stablehlo.convert %1577 : (tensor<1x256x160xf32>) -> tensor<1x256x160xf64>
-    %1579 = stablehlo.reduce(%1578 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x160xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1580 = stablehlo.reshape %1579 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1581 = stablehlo.broadcast_in_dim %1580, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1582 = stablehlo.divide %1581, %1324 : tensor<1x256x1xf64>
-    %1583 = stablehlo.broadcast_in_dim %1578, dims = [0, 1, 2] : (tensor<1x256x160xf64>) -> tensor<1x256x160xf64>
-    %1584 = stablehlo.broadcast_in_dim %1582, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x160xf64>
-    %1585 = stablehlo.subtract %1583, %1584 : tensor<1x256x160xf64>
-    %1586 = stablehlo.multiply %1585, %1585 : tensor<1x256x160xf64>
-    %1587 = stablehlo.reduce(%1586 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x160xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1588 = stablehlo.reshape %1587 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1589 = stablehlo.broadcast_in_dim %1588, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1590 = stablehlo.divide %1589, %1324 : tensor<1x256x1xf64>
-    %1591 = stablehlo.convert %1590 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1592 = stablehlo.reduce(%1577 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x160xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1593 = stablehlo.reshape %1592 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1594 = stablehlo.broadcast_in_dim %1593, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1595 = stablehlo.divide %1594, %1338 : tensor<1x256x1xf32>
-    %1596 = stablehlo.broadcast_in_dim %1591, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1597 = stablehlo.add %1596, %136 : tensor<1x256x1xf32>
-    %1598 = stablehlo.rsqrt %1597 : tensor<1x256x1xf32>
-    %1599 = stablehlo.broadcast_in_dim %1577, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1600 = stablehlo.broadcast_in_dim %1595, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x160xf32>
-    %1601 = stablehlo.subtract %1599, %1600 : tensor<1x256x160xf32>
-    %1602 = stablehlo.broadcast_in_dim %1601, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1603 = stablehlo.broadcast_in_dim %1598, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x160xf32>
-    %1604 = stablehlo.multiply %1602, %1603 : tensor<1x256x160xf32>
-    %1605 = stablehlo.convert %arg76 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1606 = stablehlo.broadcast_in_dim %1604, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1607 = stablehlo.broadcast_in_dim %1605, dims = [2] : (tensor<160xf32>) -> tensor<1x256x160xf32>
-    %1608 = stablehlo.multiply %1606, %1607 : tensor<1x256x160xf32>
-    %1609 = stablehlo.convert %arg77 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1610 = stablehlo.broadcast_in_dim %1608, dims = [0, 1, 2] : (tensor<1x256x160xf32>) -> tensor<1x256x160xf32>
-    %1611 = stablehlo.broadcast_in_dim %1609, dims = [2] : (tensor<160xf32>) -> tensor<1x256x160xf32>
-    %1612 = stablehlo.add %1610, %1611 : tensor<1x256x160xf32>
-    %1613 = stablehlo.convert %1612 : (tensor<1x256x160xf32>) -> tensor<1x256x160xbf16>
-    %1614 = stablehlo.reshape %1613 : (tensor<1x256x160xbf16>) -> tensor<256x160xbf16>
-    %1615 = stablehlo.convert %1614 : (tensor<256x160xbf16>) -> tensor<256x160xf32>
-    %1616 = stablehlo.dot_general %1615, %arg169, contracting_dims = [1] x [0] : (tensor<256x160xf32>, tensor<160x160xf32>) -> tensor<256x160xf32>
-    %1617 = stablehlo.broadcast_in_dim %1616, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1618 = stablehlo.multiply %1617, %1362 : tensor<256x160xf32>
-    %1619 = stablehlo.broadcast_in_dim %1618, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1620 = stablehlo.broadcast_in_dim %arg170, dims = [1] : (tensor<160xf32>) -> tensor<256x160xf32>
-    %1621 = stablehlo.add %1619, %1620 : tensor<256x160xf32>
-    %1622 = stablehlo.convert %1621 : (tensor<256x160xf32>) -> tensor<256x160xbf16>
-    %1623 = stablehlo.reshape %1622 : (tensor<256x160xbf16>) -> tensor<1x256x160xbf16>
-    %1624 = stablehlo.reshape %1623 : (tensor<1x256x160xbf16>) -> tensor<1x256x5x32xbf16>
-    %1625 = stablehlo.transpose %1624, dims = [0, 2, 1, 3] : (tensor<1x256x5x32xbf16>) -> tensor<1x5x256x32xbf16>
-    %1626 = stablehlo.dot_general %1615, %arg171, contracting_dims = [1] x [0] : (tensor<256x160xf32>, tensor<160x160xf32>) -> tensor<256x160xf32>
-    %1627 = stablehlo.broadcast_in_dim %1626, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1628 = stablehlo.multiply %1627, %1362 : tensor<256x160xf32>
-    %1629 = stablehlo.broadcast_in_dim %1628, dims = [0, 1] : (tensor<256x160xf32>) -> tensor<256x160xf32>
-    %1630 = stablehlo.broadcast_in_dim %arg172, dims = [1] : (tensor<160xf32>) -> tensor<256x160xf32>
-    %1631 = stablehlo.add %1629, %1630 : tensor<256x160xf32>
-    %1632 = stablehlo.convert %1631 : (tensor<256x160xf32>) -> tensor<256x160xbf16>
-    %1633 = stablehlo.reshape %1632 : (tensor<256x160xbf16>) -> tensor<1x256x160xbf16>
-    %1634 = stablehlo.reshape %1633 : (tensor<1x256x160xbf16>) -> tensor<1x256x5x32xbf16>
-    %1635 = stablehlo.transpose %1634, dims = [0, 2, 1, 3] : (tensor<1x256x5x32xbf16>) -> tensor<1x5x256x32xbf16>
-    %1636 = stablehlo.transpose %1625, dims = [0, 1, 3, 2] : (tensor<1x5x256x32xbf16>) -> tensor<1x5x32x256xbf16>
-    %1637 = stablehlo.reshape %1567 : (tensor<1x5x1024x32xbf16>) -> tensor<5x1024x32xbf16>
-    %1638 = stablehlo.reshape %1636 : (tensor<1x5x32x256xbf16>) -> tensor<5x32x256xbf16>
-    %1639 = stablehlo.broadcast_in_dim %1638, dims = [0, 1, 2] : (tensor<5x32x256xbf16>) -> tensor<5x32x256xbf16>
-    %1640 = stablehlo.dot_general %1637, %1639, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1024x32xbf16>, tensor<5x32x256xbf16>) -> tensor<5x1024x256xbf16>
-    %1641 = stablehlo.reshape %1640 : (tensor<5x1024x256xbf16>) -> tensor<1x5x1024x256xbf16>
-    %1642 = stablehlo.broadcast_in_dim %1641, dims = [0, 1, 2, 3] : (tensor<1x5x1024x256xbf16>) -> tensor<1x5x1024x256xbf16>
-    %1643 = stablehlo.divide %1642, %1388 : tensor<1x5x1024x256xbf16>
-    %1644 = stablehlo.convert %1643 : (tensor<1x5x1024x256xbf16>) -> tensor<1x5x1024x256xf32>
-    %1645 = stablehlo.reduce(%1644 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x5x1024x256xf32>, tensor<f32>) -> tensor<1x5x1024xf32>
-    %1646 = stablehlo.reshape %1645 : (tensor<1x5x1024xf32>) -> tensor<1x5x1024x1xf32>
-    %1647 = stablehlo.broadcast_in_dim %1644, dims = [0, 1, 2, 3] : (tensor<1x5x1024x256xf32>) -> tensor<1x5x1024x256xf32>
-    %1648 = stablehlo.broadcast_in_dim %1646, dims = [0, 1, 2, 3] : (tensor<1x5x1024x1xf32>) -> tensor<1x5x1024x256xf32>
-    %1649 = stablehlo.subtract %1647, %1648 : tensor<1x5x1024x256xf32>
-    %1650 = stablehlo.exponential %1649 : tensor<1x5x1024x256xf32>
-    %1651 = stablehlo.reduce(%1650 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x5x1024x256xf32>, tensor<f32>) -> tensor<1x5x1024xf32>
-    %1652 = stablehlo.reshape %1651 : (tensor<1x5x1024xf32>) -> tensor<1x5x1024x1xf32>
-    %1653 = stablehlo.broadcast_in_dim %1650, dims = [0, 1, 2, 3] : (tensor<1x5x1024x256xf32>) -> tensor<1x5x1024x256xf32>
-    %1654 = stablehlo.broadcast_in_dim %1652, dims = [0, 1, 2, 3] : (tensor<1x5x1024x1xf32>) -> tensor<1x5x1024x256xf32>
-    %1655 = stablehlo.divide %1653, %1654 : tensor<1x5x1024x256xf32>
-    %1656 = stablehlo.convert %1655 : (tensor<1x5x1024x256xf32>) -> tensor<1x5x1024x256xbf16>
-    %1657 = stablehlo.reshape %1656 : (tensor<1x5x1024x256xbf16>) -> tensor<5x1024x256xbf16>
-    %1658 = stablehlo.reshape %1635 : (tensor<1x5x256x32xbf16>) -> tensor<5x256x32xbf16>
-    %1659 = stablehlo.broadcast_in_dim %1658, dims = [0, 1, 2] : (tensor<5x256x32xbf16>) -> tensor<5x256x32xbf16>
-    %1660 = stablehlo.dot_general %1657, %1659, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<5x1024x256xbf16>, tensor<5x256x32xbf16>) -> tensor<5x1024x32xbf16>
-    %1661 = stablehlo.reshape %1660 : (tensor<5x1024x32xbf16>) -> tensor<1x5x1024x32xbf16>
-    %1662 = stablehlo.transpose %1661, dims = [0, 2, 1, 3] : (tensor<1x5x1024x32xbf16>) -> tensor<1x1024x5x32xbf16>
-    %1663 = stablehlo.reshape %1662 : (tensor<1x1024x5x32xbf16>) -> tensor<1x1024x160xbf16>
-    %1664 = stablehlo.reshape %1663 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1665 = stablehlo.convert %1664 : (tensor<1024x160xbf16>) -> tensor<1024x160xf32>
-    %1666 = stablehlo.dot_general %1665, %arg173, contracting_dims = [1] x [0] : (tensor<1024x160xf32>, tensor<160x160xf32>) -> tensor<1024x160xf32>
-    %1667 = stablehlo.broadcast_in_dim %1666, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1668 = stablehlo.multiply %1667, %1301 : tensor<1024x160xf32>
-    %1669 = stablehlo.broadcast_in_dim %1668, dims = [0, 1] : (tensor<1024x160xf32>) -> tensor<1024x160xf32>
-    %1670 = stablehlo.broadcast_in_dim %arg174, dims = [1] : (tensor<160xf32>) -> tensor<1024x160xf32>
-    %1671 = stablehlo.add %1669, %1670 : tensor<1024x160xf32>
-    %1672 = stablehlo.convert %1671 : (tensor<1024x160xf32>) -> tensor<1024x160xbf16>
-    %1673 = stablehlo.reshape %1672 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1674 = stablehlo.add %1673, %1518 : tensor<1x1024x160xbf16>
-    %1675 = stablehlo.convert %1674 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xf32>
-    %1676 = stablehlo.convert %1675 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf64>
-    %1677 = stablehlo.reduce(%1676 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1678 = stablehlo.reshape %1677 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1679 = stablehlo.broadcast_in_dim %1678, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1680 = stablehlo.divide %1679, %1223 : tensor<1x1024x1xf64>
-    %1681 = stablehlo.broadcast_in_dim %1676, dims = [0, 1, 2] : (tensor<1x1024x160xf64>) -> tensor<1x1024x160xf64>
-    %1682 = stablehlo.broadcast_in_dim %1680, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x160xf64>
-    %1683 = stablehlo.subtract %1681, %1682 : tensor<1x1024x160xf64>
-    %1684 = stablehlo.multiply %1683, %1683 : tensor<1x1024x160xf64>
-    %1685 = stablehlo.reduce(%1684 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1686 = stablehlo.reshape %1685 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1687 = stablehlo.broadcast_in_dim %1686, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1688 = stablehlo.divide %1687, %1223 : tensor<1x1024x1xf64>
-    %1689 = stablehlo.convert %1688 : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf32>
-    %1690 = stablehlo.reduce(%1675 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf32>, tensor<f32>) -> tensor<1x1024xf32>
-    %1691 = stablehlo.reshape %1690 : (tensor<1x1024xf32>) -> tensor<1x1024x1xf32>
-    %1692 = stablehlo.broadcast_in_dim %1691, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1693 = stablehlo.divide %1692, %1239 : tensor<1x1024x1xf32>
-    %1694 = stablehlo.broadcast_in_dim %1689, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1695 = stablehlo.add %1694, %1242 : tensor<1x1024x1xf32>
-    %1696 = stablehlo.rsqrt %1695 : tensor<1x1024x1xf32>
-    %1697 = stablehlo.broadcast_in_dim %1675, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1698 = stablehlo.broadcast_in_dim %1693, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1699 = stablehlo.subtract %1697, %1698 : tensor<1x1024x160xf32>
-    %1700 = stablehlo.broadcast_in_dim %1699, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1701 = stablehlo.broadcast_in_dim %1696, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1702 = stablehlo.multiply %1700, %1701 : tensor<1x1024x160xf32>
-    %1703 = stablehlo.convert %arg78 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1704 = stablehlo.broadcast_in_dim %1702, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1705 = stablehlo.broadcast_in_dim %1703, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1706 = stablehlo.multiply %1704, %1705 : tensor<1x1024x160xf32>
-    %1707 = stablehlo.convert %arg79 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1708 = stablehlo.broadcast_in_dim %1706, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1709 = stablehlo.broadcast_in_dim %1707, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1710 = stablehlo.add %1708, %1709 : tensor<1x1024x160xf32>
-    %1711 = stablehlo.convert %1710 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xbf16>
-    %1712 = stablehlo.reshape %1711 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1713 = stablehlo.convert %1712 : (tensor<1024x160xbf16>) -> tensor<1024x160xf32>
-    %1714 = stablehlo.dot_general %1713, %arg175, contracting_dims = [1] x [0] : (tensor<1024x160xf32>, tensor<160x640xf32>) -> tensor<1024x640xf32>
-    %1715 = stablehlo.broadcast_in_dim %1714, dims = [0, 1] : (tensor<1024x640xf32>) -> tensor<1024x640xf32>
-    %1716 = stablehlo.multiply %1715, %1462 : tensor<1024x640xf32>
-    %1717 = stablehlo.broadcast_in_dim %1716, dims = [0, 1] : (tensor<1024x640xf32>) -> tensor<1024x640xf32>
-    %1718 = stablehlo.broadcast_in_dim %arg176, dims = [1] : (tensor<640xf32>) -> tensor<1024x640xf32>
-    %1719 = stablehlo.add %1717, %1718 : tensor<1024x640xf32>
-    %1720 = stablehlo.convert %1719 : (tensor<1024x640xf32>) -> tensor<1024x640xbf16>
-    %1721 = stablehlo.reshape %1720 : (tensor<1024x640xbf16>) -> tensor<1x1024x640xbf16>
-    %1722 = stablehlo.transpose %1721, dims = [0, 2, 1] : (tensor<1x1024x640xbf16>) -> tensor<1x640x1024xbf16>
-    %1723 = stablehlo.reshape %1722 : (tensor<1x640x1024xbf16>) -> tensor<1x640x32x32xbf16>
-    %1724 = stablehlo.convolution(%1723, %arg80) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 640 : i64} : (tensor<1x640x32x32xbf16>, tensor<640x1x3x3xbf16>) -> tensor<1x640x32x32xbf16>
-    %1725 = stablehlo.reshape %arg81 : (tensor<640xbf16>) -> tensor<640x1x1xbf16>
-    %1726 = stablehlo.broadcast_in_dim %1724, dims = [0, 1, 2, 3] : (tensor<1x640x32x32xbf16>) -> tensor<1x640x32x32xbf16>
-    %1727 = stablehlo.broadcast_in_dim %1725, dims = [1, 2, 3] : (tensor<640x1x1xbf16>) -> tensor<1x640x32x32xbf16>
-    %1728 = stablehlo.add %1726, %1727 : tensor<1x640x32x32xbf16>
-    %1729 = stablehlo.reshape %1728 : (tensor<1x640x32x32xbf16>) -> tensor<1x640x1024xbf16>
-    %1730 = stablehlo.transpose %1729, dims = [0, 2, 1] : (tensor<1x640x1024xbf16>) -> tensor<1x1024x640xbf16>
-    %1731 = stablehlo.multiply %1730, %cst_42 : tensor<1x1024x640xbf16>
-    %1732 = stablehlo.multiply %1730, %1479 : tensor<1x1024x640xbf16>
-    %1733 = stablehlo.convert %1732 : (tensor<1x1024x640xbf16>) -> tensor<1x1024x640xf32>
-    %1734 = stablehlo.clamp %cst_43, %1733, %cst_44 : tensor<1x1024x640xf32>
-    %1735 = stablehlo.multiply %1734, %1734 : tensor<1x1024x640xf32>
-    %1736 = stablehlo.multiply %cst_45, %1735 : tensor<1x1024x640xf32>
-    %1737 = stablehlo.add %1736, %cst_46 : tensor<1x1024x640xf32>
-    %1738 = stablehlo.multiply %1737, %1735 : tensor<1x1024x640xf32>
-    %1739 = stablehlo.add %1738, %cst_47 : tensor<1x1024x640xf32>
-    %1740 = stablehlo.multiply %1739, %1735 : tensor<1x1024x640xf32>
-    %1741 = stablehlo.add %1740, %cst_48 : tensor<1x1024x640xf32>
-    %1742 = stablehlo.multiply %1741, %1735 : tensor<1x1024x640xf32>
-    %1743 = stablehlo.add %1742, %cst_49 : tensor<1x1024x640xf32>
-    %1744 = stablehlo.multiply %1743, %1735 : tensor<1x1024x640xf32>
-    %1745 = stablehlo.add %1744, %cst_50 : tensor<1x1024x640xf32>
-    %1746 = stablehlo.multiply %1745, %1735 : tensor<1x1024x640xf32>
-    %1747 = stablehlo.add %1746, %cst_51 : tensor<1x1024x640xf32>
-    %1748 = stablehlo.multiply %cst_52, %1735 : tensor<1x1024x640xf32>
-    %1749 = stablehlo.add %1748, %cst_53 : tensor<1x1024x640xf32>
-    %1750 = stablehlo.multiply %1749, %1735 : tensor<1x1024x640xf32>
-    %1751 = stablehlo.add %1750, %cst_54 : tensor<1x1024x640xf32>
-    %1752 = stablehlo.multiply %1751, %1735 : tensor<1x1024x640xf32>
-    %1753 = stablehlo.add %1752, %cst_55 : tensor<1x1024x640xf32>
-    %1754 = stablehlo.multiply %1753, %1735 : tensor<1x1024x640xf32>
-    %1755 = stablehlo.add %1754, %cst_56 : tensor<1x1024x640xf32>
-    %1756 = stablehlo.multiply %1734, %1747 : tensor<1x1024x640xf32>
-    %1757 = stablehlo.divide %1756, %1755 : tensor<1x1024x640xf32>
-    %1758 = stablehlo.clamp %cst_57, %1757, %cst_58 : tensor<1x1024x640xf32>
-    %1759 = stablehlo.convert %1758 : (tensor<1x1024x640xf32>) -> tensor<1x1024x640xbf16>
-    %1760 = stablehlo.add %1759, %cst_40 : tensor<1x1024x640xbf16>
-    %1761 = stablehlo.multiply %1760, %1731 : tensor<1x1024x640xbf16>
-    %1762 = stablehlo.reshape %1761 : (tensor<1x1024x640xbf16>) -> tensor<1024x640xbf16>
-    %1763 = stablehlo.dot_general %1762, %arg177, contracting_dims = [1] x [0] : (tensor<1024x640xbf16>, tensor<640x160xbf16>) -> tensor<1024x160xbf16>
-    %1764 = stablehlo.reshape %1763 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1765 = stablehlo.broadcast_in_dim %1764, dims = [0, 1, 2] : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1766 = stablehlo.broadcast_in_dim %arg82, dims = [2] : (tensor<160xbf16>) -> tensor<1x1024x160xbf16>
-    %1767 = stablehlo.add %1765, %1766 : tensor<1x1024x160xbf16>
-    %1768 = stablehlo.reshape %1767 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %1769 = stablehlo.reshape %1768 : (tensor<1024x160xbf16>) -> tensor<1x1024x160xbf16>
-    %1770 = stablehlo.add %1769, %1674 : tensor<1x1024x160xbf16>
-    %1771 = stablehlo.convert %1770 : (tensor<1x1024x160xbf16>) -> tensor<1x1024x160xf32>
-    %1772 = stablehlo.convert %1771 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf64>
-    %1773 = stablehlo.reduce(%1772 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1774 = stablehlo.reshape %1773 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1775 = stablehlo.broadcast_in_dim %1774, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1776 = stablehlo.divide %1775, %1223 : tensor<1x1024x1xf64>
-    %1777 = stablehlo.broadcast_in_dim %1772, dims = [0, 1, 2] : (tensor<1x1024x160xf64>) -> tensor<1x1024x160xf64>
-    %1778 = stablehlo.broadcast_in_dim %1776, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x160xf64>
-    %1779 = stablehlo.subtract %1777, %1778 : tensor<1x1024x160xf64>
-    %1780 = stablehlo.multiply %1779, %1779 : tensor<1x1024x160xf64>
-    %1781 = stablehlo.reduce(%1780 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf64>, tensor<f64>) -> tensor<1x1024xf64>
-    %1782 = stablehlo.reshape %1781 : (tensor<1x1024xf64>) -> tensor<1x1024x1xf64>
-    %1783 = stablehlo.broadcast_in_dim %1782, dims = [0, 1, 2] : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf64>
-    %1784 = stablehlo.divide %1783, %1223 : tensor<1x1024x1xf64>
-    %1785 = stablehlo.convert %1784 : (tensor<1x1024x1xf64>) -> tensor<1x1024x1xf32>
-    %1786 = stablehlo.reduce(%1771 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x1024x160xf32>, tensor<f32>) -> tensor<1x1024xf32>
-    %1787 = stablehlo.reshape %1786 : (tensor<1x1024xf32>) -> tensor<1x1024x1xf32>
-    %1788 = stablehlo.broadcast_in_dim %1787, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1789 = stablehlo.divide %1788, %1239 : tensor<1x1024x1xf32>
-    %1790 = stablehlo.broadcast_in_dim %1785, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x1xf32>
-    %1791 = stablehlo.add %1790, %1242 : tensor<1x1024x1xf32>
-    %1792 = stablehlo.rsqrt %1791 : tensor<1x1024x1xf32>
-    %1793 = stablehlo.broadcast_in_dim %1771, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1794 = stablehlo.broadcast_in_dim %1789, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1795 = stablehlo.subtract %1793, %1794 : tensor<1x1024x160xf32>
-    %1796 = stablehlo.broadcast_in_dim %1795, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1797 = stablehlo.broadcast_in_dim %1792, dims = [0, 1, 2] : (tensor<1x1024x1xf32>) -> tensor<1x1024x160xf32>
-    %1798 = stablehlo.multiply %1796, %1797 : tensor<1x1024x160xf32>
-    %1799 = stablehlo.convert %arg83 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1800 = stablehlo.broadcast_in_dim %1798, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1801 = stablehlo.broadcast_in_dim %1799, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1802 = stablehlo.multiply %1800, %1801 : tensor<1x1024x160xf32>
-    %1803 = stablehlo.convert %arg84 : (tensor<160xbf16>) -> tensor<160xf32>
-    %1804 = stablehlo.broadcast_in_dim %1802, dims = [0, 1, 2] : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xf32>
-    %1805 = stablehlo.broadcast_in_dim %1803, dims = [2] : (tensor<160xf32>) -> tensor<1x1024x160xf32>
-    %1806 = stablehlo.add %1804, %1805 : tensor<1x1024x160xf32>
-    %1807 = stablehlo.convert %1806 : (tensor<1x1024x160xf32>) -> tensor<1x1024x160xbf16>
-    %1808 = stablehlo.reshape %1807 : (tensor<1x1024x160xbf16>) -> tensor<1x32x32x160xbf16>
-    %1809 = stablehlo.transpose %1808, dims = [0, 3, 1, 2] : (tensor<1x32x32x160xbf16>) -> tensor<1x160x32x32xbf16>
-    %1810 = stablehlo.convolution(%1809, %arg85) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x160x32x32xbf16>, tensor<256x160x3x3xbf16>) -> tensor<1x256x16x16xbf16>
-    %1811 = stablehlo.reshape %arg86 : (tensor<256xbf16>) -> tensor<256x1x1xbf16>
-    %1812 = stablehlo.broadcast_in_dim %1810, dims = [0, 1, 2, 3] : (tensor<1x256x16x16xbf16>) -> tensor<1x256x16x16xbf16>
-    %1813 = stablehlo.broadcast_in_dim %1811, dims = [1, 2, 3] : (tensor<256x1x1xbf16>) -> tensor<1x256x16x16xbf16>
-    %1814 = stablehlo.add %1812, %1813 : tensor<1x256x16x16xbf16>
-    %1815 = stablehlo.reshape %1814 : (tensor<1x256x16x16xbf16>) -> tensor<1x256x256xbf16>
-    %1816 = stablehlo.transpose %1815, dims = [0, 2, 1] : (tensor<1x256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1817 = stablehlo.convert %1816 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1818 = stablehlo.convert %1817 : (tensor<1x256x256xf32>) -> tensor<1x256x256xf64>
-    %1819 = stablehlo.reduce(%1818 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1820 = stablehlo.reshape %1819 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1821 = stablehlo.convert %cst_85 : (tensor<1xi64>) -> tensor<1xf64>
-    %1822 = stablehlo.reshape %1821 : (tensor<1xf64>) -> tensor<f64>
-    %1823 = stablehlo.broadcast_in_dim %1820, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1824 = stablehlo.broadcast_in_dim %1822, dims = [] : (tensor<f64>) -> tensor<1x256x1xf64>
-    %1825 = stablehlo.divide %1823, %1824 : tensor<1x256x1xf64>
-    %1826 = stablehlo.broadcast_in_dim %1818, dims = [0, 1, 2] : (tensor<1x256x256xf64>) -> tensor<1x256x256xf64>
-    %1827 = stablehlo.broadcast_in_dim %1825, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x256xf64>
-    %1828 = stablehlo.subtract %1826, %1827 : tensor<1x256x256xf64>
-    %1829 = stablehlo.multiply %1828, %1828 : tensor<1x256x256xf64>
-    %1830 = stablehlo.reduce(%1829 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1831 = stablehlo.reshape %1830 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1832 = stablehlo.broadcast_in_dim %1831, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1833 = stablehlo.divide %1832, %1824 : tensor<1x256x1xf64>
-    %1834 = stablehlo.convert %1833 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1835 = stablehlo.reduce(%1817 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1836 = stablehlo.reshape %1835 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1837 = stablehlo.convert %cst_85 : (tensor<1xi64>) -> tensor<1xf32>
-    %1838 = stablehlo.reshape %1837 : (tensor<1xf32>) -> tensor<f32>
-    %1839 = stablehlo.broadcast_in_dim %1836, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1840 = stablehlo.broadcast_in_dim %1838, dims = [] : (tensor<f32>) -> tensor<1x256x1xf32>
-    %1841 = stablehlo.divide %1839, %1840 : tensor<1x256x1xf32>
-    %1842 = stablehlo.broadcast_in_dim %1834, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1843 = stablehlo.add %1842, %136 : tensor<1x256x1xf32>
-    %1844 = stablehlo.rsqrt %1843 : tensor<1x256x1xf32>
-    %1845 = stablehlo.broadcast_in_dim %1817, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1846 = stablehlo.broadcast_in_dim %1841, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %1847 = stablehlo.subtract %1845, %1846 : tensor<1x256x256xf32>
-    %1848 = stablehlo.broadcast_in_dim %1847, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1849 = stablehlo.broadcast_in_dim %1844, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %1850 = stablehlo.multiply %1848, %1849 : tensor<1x256x256xf32>
-    %1851 = stablehlo.convert %arg87 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1852 = stablehlo.broadcast_in_dim %1850, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1853 = stablehlo.broadcast_in_dim %1851, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %1854 = stablehlo.multiply %1852, %1853 : tensor<1x256x256xf32>
-    %1855 = stablehlo.convert %arg88 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1856 = stablehlo.broadcast_in_dim %1854, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1857 = stablehlo.broadcast_in_dim %1855, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %1858 = stablehlo.add %1856, %1857 : tensor<1x256x256xf32>
-    %1859 = stablehlo.convert %1858 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1860 = stablehlo.convert %1859 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1861 = stablehlo.convert %1860 : (tensor<1x256x256xf32>) -> tensor<1x256x256xf64>
-    %1862 = stablehlo.reduce(%1861 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1863 = stablehlo.reshape %1862 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1864 = stablehlo.broadcast_in_dim %1863, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1865 = stablehlo.divide %1864, %1824 : tensor<1x256x1xf64>
-    %1866 = stablehlo.broadcast_in_dim %1861, dims = [0, 1, 2] : (tensor<1x256x256xf64>) -> tensor<1x256x256xf64>
-    %1867 = stablehlo.broadcast_in_dim %1865, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x256xf64>
-    %1868 = stablehlo.subtract %1866, %1867 : tensor<1x256x256xf64>
-    %1869 = stablehlo.multiply %1868, %1868 : tensor<1x256x256xf64>
-    %1870 = stablehlo.reduce(%1869 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1871 = stablehlo.reshape %1870 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1872 = stablehlo.broadcast_in_dim %1871, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1873 = stablehlo.divide %1872, %1824 : tensor<1x256x1xf64>
-    %1874 = stablehlo.convert %1873 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1875 = stablehlo.reduce(%1860 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1876 = stablehlo.reshape %1875 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1877 = stablehlo.broadcast_in_dim %1876, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1878 = stablehlo.divide %1877, %1840 : tensor<1x256x1xf32>
-    %1879 = stablehlo.broadcast_in_dim %1874, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1880 = stablehlo.add %1879, %136 : tensor<1x256x1xf32>
-    %1881 = stablehlo.rsqrt %1880 : tensor<1x256x1xf32>
-    %1882 = stablehlo.broadcast_in_dim %1860, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1883 = stablehlo.broadcast_in_dim %1878, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %1884 = stablehlo.subtract %1882, %1883 : tensor<1x256x256xf32>
-    %1885 = stablehlo.broadcast_in_dim %1884, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1886 = stablehlo.broadcast_in_dim %1881, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %1887 = stablehlo.multiply %1885, %1886 : tensor<1x256x256xf32>
-    %1888 = stablehlo.convert %arg89 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1889 = stablehlo.broadcast_in_dim %1887, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1890 = stablehlo.broadcast_in_dim %1888, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %1891 = stablehlo.multiply %1889, %1890 : tensor<1x256x256xf32>
-    %1892 = stablehlo.convert %arg90 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1893 = stablehlo.broadcast_in_dim %1891, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1894 = stablehlo.broadcast_in_dim %1892, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %1895 = stablehlo.add %1893, %1894 : tensor<1x256x256xf32>
-    %1896 = stablehlo.convert %1895 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %1897 = stablehlo.reshape %1896 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1898 = stablehlo.convert %1897 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1899 = stablehlo.dot_general %1898, %arg178, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1900 = stablehlo.broadcast_in_dim %1899, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1901 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<256x256xf32>
-    %1902 = stablehlo.multiply %1900, %1901 : tensor<256x256xf32>
-    %1903 = stablehlo.broadcast_in_dim %1902, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1904 = stablehlo.broadcast_in_dim %arg179, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1905 = stablehlo.add %1903, %1904 : tensor<256x256xf32>
-    %1906 = stablehlo.convert %1905 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1907 = stablehlo.reshape %1906 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1908 = stablehlo.reshape %1907 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1909 = stablehlo.transpose %1908, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1910 = stablehlo.dot_general %1898, %arg180, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1911 = stablehlo.broadcast_in_dim %1910, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1912 = stablehlo.multiply %1911, %1901 : tensor<256x256xf32>
-    %1913 = stablehlo.broadcast_in_dim %1912, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1914 = stablehlo.broadcast_in_dim %arg181, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1915 = stablehlo.add %1913, %1914 : tensor<256x256xf32>
-    %1916 = stablehlo.convert %1915 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1917 = stablehlo.reshape %1916 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1918 = stablehlo.reshape %1917 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1919 = stablehlo.transpose %1918, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1920 = stablehlo.dot_general %1898, %arg182, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1921 = stablehlo.broadcast_in_dim %1920, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1922 = stablehlo.multiply %1921, %1901 : tensor<256x256xf32>
-    %1923 = stablehlo.broadcast_in_dim %1922, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1924 = stablehlo.broadcast_in_dim %arg183, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1925 = stablehlo.add %1923, %1924 : tensor<256x256xf32>
-    %1926 = stablehlo.convert %1925 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1927 = stablehlo.reshape %1926 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1928 = stablehlo.reshape %1927 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %1929 = stablehlo.transpose %1928, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1930 = stablehlo.transpose %1919, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %1931 = stablehlo.reshape %1909 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1932 = stablehlo.reshape %1930 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1933 = stablehlo.broadcast_in_dim %1932, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %1934 = stablehlo.dot_general %1931, %1933, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %1935 = stablehlo.reshape %1934 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1936 = stablehlo.broadcast_in_dim %1935, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %1937 = stablehlo.broadcast_in_dim %184, dims = [] : (tensor<bf16>) -> tensor<1x8x256x256xbf16>
-    %1938 = stablehlo.divide %1936, %1937 : tensor<1x8x256x256xbf16>
-    %1939 = stablehlo.convert %1938 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %1940 = stablehlo.reduce(%1939 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1941 = stablehlo.reshape %1940 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1942 = stablehlo.broadcast_in_dim %1939, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1943 = stablehlo.broadcast_in_dim %1941, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1944 = stablehlo.subtract %1942, %1943 : tensor<1x8x256x256xf32>
-    %1945 = stablehlo.exponential %1944 : tensor<1x8x256x256xf32>
-    %1946 = stablehlo.reduce(%1945 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %1947 = stablehlo.reshape %1946 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %1948 = stablehlo.broadcast_in_dim %1945, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %1949 = stablehlo.broadcast_in_dim %1947, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %1950 = stablehlo.divide %1948, %1949 : tensor<1x8x256x256xf32>
-    %1951 = stablehlo.convert %1950 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %1952 = stablehlo.reshape %1951 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %1953 = stablehlo.reshape %1929 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1954 = stablehlo.broadcast_in_dim %1953, dims = [0, 1, 2] : (tensor<8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1955 = stablehlo.dot_general %1952, %1954, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %1956 = stablehlo.reshape %1955 : (tensor<8x256x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %1957 = stablehlo.transpose %1956, dims = [0, 2, 1, 3] : (tensor<1x8x256x32xbf16>) -> tensor<1x256x8x32xbf16>
-    %1958 = stablehlo.reshape %1957 : (tensor<1x256x8x32xbf16>) -> tensor<1x256x256xbf16>
-    %1959 = stablehlo.reshape %1958 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %1960 = stablehlo.convert %1959 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %1961 = stablehlo.dot_general %1960, %arg184, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1962 = stablehlo.broadcast_in_dim %1961, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1963 = stablehlo.multiply %1962, %1901 : tensor<256x256xf32>
-    %1964 = stablehlo.broadcast_in_dim %1963, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %1965 = stablehlo.broadcast_in_dim %arg185, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %1966 = stablehlo.add %1964, %1965 : tensor<256x256xf32>
-    %1967 = stablehlo.convert %1966 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %1968 = stablehlo.reshape %1967 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %1969 = stablehlo.add %1968, %1859 : tensor<1x256x256xbf16>
-    %1970 = stablehlo.convert %1969 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %1971 = stablehlo.convert %1970 : (tensor<1x256x256xf32>) -> tensor<1x256x256xf64>
-    %1972 = stablehlo.reduce(%1971 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1973 = stablehlo.reshape %1972 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1974 = stablehlo.broadcast_in_dim %1973, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1975 = stablehlo.divide %1974, %1824 : tensor<1x256x1xf64>
-    %1976 = stablehlo.broadcast_in_dim %1971, dims = [0, 1, 2] : (tensor<1x256x256xf64>) -> tensor<1x256x256xf64>
-    %1977 = stablehlo.broadcast_in_dim %1975, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x256xf64>
-    %1978 = stablehlo.subtract %1976, %1977 : tensor<1x256x256xf64>
-    %1979 = stablehlo.multiply %1978, %1978 : tensor<1x256x256xf64>
-    %1980 = stablehlo.reduce(%1979 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %1981 = stablehlo.reshape %1980 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %1982 = stablehlo.broadcast_in_dim %1981, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %1983 = stablehlo.divide %1982, %1824 : tensor<1x256x1xf64>
-    %1984 = stablehlo.convert %1983 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %1985 = stablehlo.reduce(%1970 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %1986 = stablehlo.reshape %1985 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %1987 = stablehlo.broadcast_in_dim %1986, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1988 = stablehlo.divide %1987, %1840 : tensor<1x256x1xf32>
-    %1989 = stablehlo.broadcast_in_dim %1984, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %1990 = stablehlo.add %1989, %136 : tensor<1x256x1xf32>
-    %1991 = stablehlo.rsqrt %1990 : tensor<1x256x1xf32>
-    %1992 = stablehlo.broadcast_in_dim %1970, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1993 = stablehlo.broadcast_in_dim %1988, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %1994 = stablehlo.subtract %1992, %1993 : tensor<1x256x256xf32>
-    %1995 = stablehlo.broadcast_in_dim %1994, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %1996 = stablehlo.broadcast_in_dim %1991, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %1997 = stablehlo.multiply %1995, %1996 : tensor<1x256x256xf32>
-    %1998 = stablehlo.convert %arg91 : (tensor<256xbf16>) -> tensor<256xf32>
-    %1999 = stablehlo.broadcast_in_dim %1997, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2000 = stablehlo.broadcast_in_dim %1998, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2001 = stablehlo.multiply %1999, %2000 : tensor<1x256x256xf32>
-    %2002 = stablehlo.convert %arg92 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2003 = stablehlo.broadcast_in_dim %2001, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2004 = stablehlo.broadcast_in_dim %2002, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2005 = stablehlo.add %2003, %2004 : tensor<1x256x256xf32>
-    %2006 = stablehlo.convert %2005 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %2007 = stablehlo.reshape %2006 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2008 = stablehlo.convert %2007 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %2009 = stablehlo.dot_general %2008, %arg186, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x1024xf32>) -> tensor<256x1024xf32>
-    %2010 = stablehlo.broadcast_in_dim %2009, dims = [0, 1] : (tensor<256x1024xf32>) -> tensor<256x1024xf32>
-    %2011 = stablehlo.broadcast_in_dim %94, dims = [] : (tensor<f32>) -> tensor<256x1024xf32>
-    %2012 = stablehlo.multiply %2010, %2011 : tensor<256x1024xf32>
-    %2013 = stablehlo.broadcast_in_dim %2012, dims = [0, 1] : (tensor<256x1024xf32>) -> tensor<256x1024xf32>
-    %2014 = stablehlo.broadcast_in_dim %arg187, dims = [1] : (tensor<1024xf32>) -> tensor<256x1024xf32>
-    %2015 = stablehlo.add %2013, %2014 : tensor<256x1024xf32>
-    %2016 = stablehlo.convert %2015 : (tensor<256x1024xf32>) -> tensor<256x1024xbf16>
-    %2017 = stablehlo.reshape %2016 : (tensor<256x1024xbf16>) -> tensor<1x256x1024xbf16>
-    %2018 = stablehlo.transpose %2017, dims = [0, 2, 1] : (tensor<1x256x1024xbf16>) -> tensor<1x1024x256xbf16>
-    %2019 = stablehlo.reshape %2018 : (tensor<1x1024x256xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2020 = stablehlo.convolution(%2019, %arg93) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1024 : i64} : (tensor<1x1024x16x16xbf16>, tensor<1024x1x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2021 = stablehlo.reshape %arg94 : (tensor<1024xbf16>) -> tensor<1024x1x1xbf16>
-    %2022 = stablehlo.broadcast_in_dim %2020, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2023 = stablehlo.broadcast_in_dim %2021, dims = [1, 2, 3] : (tensor<1024x1x1xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2024 = stablehlo.add %2022, %2023 : tensor<1x1024x16x16xbf16>
-    %2025 = stablehlo.reshape %2024 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x256xbf16>
-    %2026 = stablehlo.transpose %2025, dims = [0, 2, 1] : (tensor<1x1024x256xbf16>) -> tensor<1x256x1024xbf16>
-    %2027 = stablehlo.multiply %2026, %cst_61 : tensor<1x256x1024xbf16>
-    %2028 = stablehlo.rsqrt %cst_60 : tensor<1x256x1024xbf16>
-    %2029 = stablehlo.multiply %2026, %2028 : tensor<1x256x1024xbf16>
-    %2030 = stablehlo.convert %2029 : (tensor<1x256x1024xbf16>) -> tensor<1x256x1024xf32>
-    %2031 = stablehlo.clamp %cst_62, %2030, %cst_63 : tensor<1x256x1024xf32>
-    %2032 = stablehlo.multiply %2031, %2031 : tensor<1x256x1024xf32>
-    %2033 = stablehlo.multiply %cst_64, %2032 : tensor<1x256x1024xf32>
-    %2034 = stablehlo.add %2033, %cst_65 : tensor<1x256x1024xf32>
-    %2035 = stablehlo.multiply %2034, %2032 : tensor<1x256x1024xf32>
-    %2036 = stablehlo.add %2035, %cst_66 : tensor<1x256x1024xf32>
-    %2037 = stablehlo.multiply %2036, %2032 : tensor<1x256x1024xf32>
-    %2038 = stablehlo.add %2037, %cst_67 : tensor<1x256x1024xf32>
-    %2039 = stablehlo.multiply %2038, %2032 : tensor<1x256x1024xf32>
-    %2040 = stablehlo.add %2039, %cst_68 : tensor<1x256x1024xf32>
-    %2041 = stablehlo.multiply %2040, %2032 : tensor<1x256x1024xf32>
-    %2042 = stablehlo.add %2041, %cst_69 : tensor<1x256x1024xf32>
-    %2043 = stablehlo.multiply %2042, %2032 : tensor<1x256x1024xf32>
-    %2044 = stablehlo.add %2043, %cst_70 : tensor<1x256x1024xf32>
-    %2045 = stablehlo.multiply %cst_71, %2032 : tensor<1x256x1024xf32>
-    %2046 = stablehlo.add %2045, %cst_72 : tensor<1x256x1024xf32>
-    %2047 = stablehlo.multiply %2046, %2032 : tensor<1x256x1024xf32>
-    %2048 = stablehlo.add %2047, %cst_73 : tensor<1x256x1024xf32>
-    %2049 = stablehlo.multiply %2048, %2032 : tensor<1x256x1024xf32>
-    %2050 = stablehlo.add %2049, %cst_74 : tensor<1x256x1024xf32>
-    %2051 = stablehlo.multiply %2050, %2032 : tensor<1x256x1024xf32>
-    %2052 = stablehlo.add %2051, %cst_75 : tensor<1x256x1024xf32>
-    %2053 = stablehlo.multiply %2031, %2044 : tensor<1x256x1024xf32>
-    %2054 = stablehlo.divide %2053, %2052 : tensor<1x256x1024xf32>
-    %2055 = stablehlo.clamp %cst_76, %2054, %cst_77 : tensor<1x256x1024xf32>
-    %2056 = stablehlo.convert %2055 : (tensor<1x256x1024xf32>) -> tensor<1x256x1024xbf16>
-    %2057 = stablehlo.add %2056, %cst_59 : tensor<1x256x1024xbf16>
-    %2058 = stablehlo.multiply %2057, %2027 : tensor<1x256x1024xbf16>
-    %2059 = stablehlo.reshape %2058 : (tensor<1x256x1024xbf16>) -> tensor<256x1024xbf16>
-    %2060 = stablehlo.dot_general %2059, %arg188, contracting_dims = [1] x [0] : (tensor<256x1024xbf16>, tensor<1024x256xbf16>) -> tensor<256x256xbf16>
-    %2061 = stablehlo.reshape %2060 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2062 = stablehlo.broadcast_in_dim %2061, dims = [0, 1, 2] : (tensor<1x256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2063 = stablehlo.broadcast_in_dim %arg95, dims = [2] : (tensor<256xbf16>) -> tensor<1x256x256xbf16>
-    %2064 = stablehlo.add %2062, %2063 : tensor<1x256x256xbf16>
-    %2065 = stablehlo.reshape %2064 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2066 = stablehlo.reshape %2065 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2067 = stablehlo.add %2066, %1969 : tensor<1x256x256xbf16>
-    %2068 = stablehlo.convert %2067 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %2069 = stablehlo.convert %2068 : (tensor<1x256x256xf32>) -> tensor<1x256x256xf64>
-    %2070 = stablehlo.reduce(%2069 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2071 = stablehlo.reshape %2070 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2072 = stablehlo.broadcast_in_dim %2071, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2073 = stablehlo.divide %2072, %1824 : tensor<1x256x1xf64>
-    %2074 = stablehlo.broadcast_in_dim %2069, dims = [0, 1, 2] : (tensor<1x256x256xf64>) -> tensor<1x256x256xf64>
-    %2075 = stablehlo.broadcast_in_dim %2073, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x256xf64>
-    %2076 = stablehlo.subtract %2074, %2075 : tensor<1x256x256xf64>
-    %2077 = stablehlo.multiply %2076, %2076 : tensor<1x256x256xf64>
-    %2078 = stablehlo.reduce(%2077 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2079 = stablehlo.reshape %2078 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2080 = stablehlo.broadcast_in_dim %2079, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2081 = stablehlo.divide %2080, %1824 : tensor<1x256x1xf64>
-    %2082 = stablehlo.convert %2081 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2083 = stablehlo.reduce(%2068 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2084 = stablehlo.reshape %2083 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2085 = stablehlo.broadcast_in_dim %2084, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2086 = stablehlo.divide %2085, %1840 : tensor<1x256x1xf32>
-    %2087 = stablehlo.broadcast_in_dim %2082, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2088 = stablehlo.add %2087, %136 : tensor<1x256x1xf32>
-    %2089 = stablehlo.rsqrt %2088 : tensor<1x256x1xf32>
-    %2090 = stablehlo.broadcast_in_dim %2068, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2091 = stablehlo.broadcast_in_dim %2086, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %2092 = stablehlo.subtract %2090, %2091 : tensor<1x256x256xf32>
-    %2093 = stablehlo.broadcast_in_dim %2092, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2094 = stablehlo.broadcast_in_dim %2089, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %2095 = stablehlo.multiply %2093, %2094 : tensor<1x256x256xf32>
-    %2096 = stablehlo.convert %arg96 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2097 = stablehlo.broadcast_in_dim %2095, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2098 = stablehlo.broadcast_in_dim %2096, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2099 = stablehlo.multiply %2097, %2098 : tensor<1x256x256xf32>
-    %2100 = stablehlo.convert %arg97 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2101 = stablehlo.broadcast_in_dim %2099, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2102 = stablehlo.broadcast_in_dim %2100, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2103 = stablehlo.add %2101, %2102 : tensor<1x256x256xf32>
-    %2104 = stablehlo.convert %2103 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %2105 = stablehlo.reshape %2104 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2106 = stablehlo.convert %2105 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %2107 = stablehlo.dot_general %2106, %arg189, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2108 = stablehlo.broadcast_in_dim %2107, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2109 = stablehlo.multiply %2108, %1901 : tensor<256x256xf32>
-    %2110 = stablehlo.broadcast_in_dim %2109, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2111 = stablehlo.broadcast_in_dim %arg190, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2112 = stablehlo.add %2110, %2111 : tensor<256x256xf32>
-    %2113 = stablehlo.convert %2112 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2114 = stablehlo.reshape %2113 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2115 = stablehlo.reshape %2114 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2116 = stablehlo.transpose %2115, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2117 = stablehlo.dot_general %2106, %arg191, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2118 = stablehlo.broadcast_in_dim %2117, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2119 = stablehlo.multiply %2118, %1901 : tensor<256x256xf32>
-    %2120 = stablehlo.broadcast_in_dim %2119, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2121 = stablehlo.broadcast_in_dim %arg192, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2122 = stablehlo.add %2120, %2121 : tensor<256x256xf32>
-    %2123 = stablehlo.convert %2122 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2124 = stablehlo.reshape %2123 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2125 = stablehlo.reshape %2124 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2126 = stablehlo.transpose %2125, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2127 = stablehlo.dot_general %2106, %arg193, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2128 = stablehlo.broadcast_in_dim %2127, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2129 = stablehlo.multiply %2128, %1901 : tensor<256x256xf32>
-    %2130 = stablehlo.broadcast_in_dim %2129, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2131 = stablehlo.broadcast_in_dim %arg194, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2132 = stablehlo.add %2130, %2131 : tensor<256x256xf32>
-    %2133 = stablehlo.convert %2132 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2134 = stablehlo.reshape %2133 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2135 = stablehlo.reshape %2134 : (tensor<1x256x256xbf16>) -> tensor<1x256x8x32xbf16>
-    %2136 = stablehlo.transpose %2135, dims = [0, 2, 1, 3] : (tensor<1x256x8x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2137 = stablehlo.transpose %2126, dims = [0, 1, 3, 2] : (tensor<1x8x256x32xbf16>) -> tensor<1x8x32x256xbf16>
-    %2138 = stablehlo.reshape %2116 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2139 = stablehlo.reshape %2137 : (tensor<1x8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2140 = stablehlo.broadcast_in_dim %2139, dims = [0, 1, 2] : (tensor<8x32x256xbf16>) -> tensor<8x32x256xbf16>
-    %2141 = stablehlo.dot_general %2138, %2140, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x32xbf16>, tensor<8x32x256xbf16>) -> tensor<8x256x256xbf16>
-    %2142 = stablehlo.reshape %2141 : (tensor<8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2143 = stablehlo.broadcast_in_dim %2142, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xbf16>
-    %2144 = stablehlo.divide %2143, %1937 : tensor<1x8x256x256xbf16>
-    %2145 = stablehlo.convert %2144 : (tensor<1x8x256x256xbf16>) -> tensor<1x8x256x256xf32>
-    %2146 = stablehlo.reduce(%2145 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2147 = stablehlo.reshape %2146 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2148 = stablehlo.broadcast_in_dim %2145, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2149 = stablehlo.broadcast_in_dim %2147, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2150 = stablehlo.subtract %2148, %2149 : tensor<1x8x256x256xf32>
-    %2151 = stablehlo.exponential %2150 : tensor<1x8x256x256xf32>
-    %2152 = stablehlo.reduce(%2151 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x8x256x256xf32>, tensor<f32>) -> tensor<1x8x256xf32>
-    %2153 = stablehlo.reshape %2152 : (tensor<1x8x256xf32>) -> tensor<1x8x256x1xf32>
-    %2154 = stablehlo.broadcast_in_dim %2151, dims = [0, 1, 2, 3] : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xf32>
-    %2155 = stablehlo.broadcast_in_dim %2153, dims = [0, 1, 2, 3] : (tensor<1x8x256x1xf32>) -> tensor<1x8x256x256xf32>
-    %2156 = stablehlo.divide %2154, %2155 : tensor<1x8x256x256xf32>
-    %2157 = stablehlo.convert %2156 : (tensor<1x8x256x256xf32>) -> tensor<1x8x256x256xbf16>
-    %2158 = stablehlo.reshape %2157 : (tensor<1x8x256x256xbf16>) -> tensor<8x256x256xbf16>
-    %2159 = stablehlo.reshape %2136 : (tensor<1x8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2160 = stablehlo.broadcast_in_dim %2159, dims = [0, 1, 2] : (tensor<8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2161 = stablehlo.dot_general %2158, %2160, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<8x256x256xbf16>, tensor<8x256x32xbf16>) -> tensor<8x256x32xbf16>
-    %2162 = stablehlo.reshape %2161 : (tensor<8x256x32xbf16>) -> tensor<1x8x256x32xbf16>
-    %2163 = stablehlo.transpose %2162, dims = [0, 2, 1, 3] : (tensor<1x8x256x32xbf16>) -> tensor<1x256x8x32xbf16>
-    %2164 = stablehlo.reshape %2163 : (tensor<1x256x8x32xbf16>) -> tensor<1x256x256xbf16>
-    %2165 = stablehlo.reshape %2164 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2166 = stablehlo.convert %2165 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %2167 = stablehlo.dot_general %2166, %arg195, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2168 = stablehlo.broadcast_in_dim %2167, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2169 = stablehlo.multiply %2168, %1901 : tensor<256x256xf32>
-    %2170 = stablehlo.broadcast_in_dim %2169, dims = [0, 1] : (tensor<256x256xf32>) -> tensor<256x256xf32>
-    %2171 = stablehlo.broadcast_in_dim %arg196, dims = [1] : (tensor<256xf32>) -> tensor<256x256xf32>
-    %2172 = stablehlo.add %2170, %2171 : tensor<256x256xf32>
-    %2173 = stablehlo.convert %2172 : (tensor<256x256xf32>) -> tensor<256x256xbf16>
-    %2174 = stablehlo.reshape %2173 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2175 = stablehlo.add %2174, %2067 : tensor<1x256x256xbf16>
-    %2176 = stablehlo.convert %2175 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %2177 = stablehlo.convert %2176 : (tensor<1x256x256xf32>) -> tensor<1x256x256xf64>
-    %2178 = stablehlo.reduce(%2177 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2179 = stablehlo.reshape %2178 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2180 = stablehlo.broadcast_in_dim %2179, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2181 = stablehlo.divide %2180, %1824 : tensor<1x256x1xf64>
-    %2182 = stablehlo.broadcast_in_dim %2177, dims = [0, 1, 2] : (tensor<1x256x256xf64>) -> tensor<1x256x256xf64>
-    %2183 = stablehlo.broadcast_in_dim %2181, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x256xf64>
-    %2184 = stablehlo.subtract %2182, %2183 : tensor<1x256x256xf64>
-    %2185 = stablehlo.multiply %2184, %2184 : tensor<1x256x256xf64>
-    %2186 = stablehlo.reduce(%2185 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2187 = stablehlo.reshape %2186 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2188 = stablehlo.broadcast_in_dim %2187, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2189 = stablehlo.divide %2188, %1824 : tensor<1x256x1xf64>
-    %2190 = stablehlo.convert %2189 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2191 = stablehlo.reduce(%2176 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2192 = stablehlo.reshape %2191 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2193 = stablehlo.broadcast_in_dim %2192, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2194 = stablehlo.divide %2193, %1840 : tensor<1x256x1xf32>
-    %2195 = stablehlo.broadcast_in_dim %2190, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2196 = stablehlo.add %2195, %136 : tensor<1x256x1xf32>
-    %2197 = stablehlo.rsqrt %2196 : tensor<1x256x1xf32>
-    %2198 = stablehlo.broadcast_in_dim %2176, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2199 = stablehlo.broadcast_in_dim %2194, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %2200 = stablehlo.subtract %2198, %2199 : tensor<1x256x256xf32>
-    %2201 = stablehlo.broadcast_in_dim %2200, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2202 = stablehlo.broadcast_in_dim %2197, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %2203 = stablehlo.multiply %2201, %2202 : tensor<1x256x256xf32>
-    %2204 = stablehlo.convert %arg98 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2205 = stablehlo.broadcast_in_dim %2203, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2206 = stablehlo.broadcast_in_dim %2204, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2207 = stablehlo.multiply %2205, %2206 : tensor<1x256x256xf32>
-    %2208 = stablehlo.convert %arg99 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2209 = stablehlo.broadcast_in_dim %2207, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2210 = stablehlo.broadcast_in_dim %2208, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2211 = stablehlo.add %2209, %2210 : tensor<1x256x256xf32>
-    %2212 = stablehlo.convert %2211 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %2213 = stablehlo.reshape %2212 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2214 = stablehlo.convert %2213 : (tensor<256x256xbf16>) -> tensor<256x256xf32>
-    %2215 = stablehlo.dot_general %2214, %arg197, contracting_dims = [1] x [0] : (tensor<256x256xf32>, tensor<256x1024xf32>) -> tensor<256x1024xf32>
-    %2216 = stablehlo.broadcast_in_dim %2215, dims = [0, 1] : (tensor<256x1024xf32>) -> tensor<256x1024xf32>
-    %2217 = stablehlo.multiply %2216, %2011 : tensor<256x1024xf32>
-    %2218 = stablehlo.broadcast_in_dim %2217, dims = [0, 1] : (tensor<256x1024xf32>) -> tensor<256x1024xf32>
-    %2219 = stablehlo.broadcast_in_dim %arg198, dims = [1] : (tensor<1024xf32>) -> tensor<256x1024xf32>
-    %2220 = stablehlo.add %2218, %2219 : tensor<256x1024xf32>
-    %2221 = stablehlo.convert %2220 : (tensor<256x1024xf32>) -> tensor<256x1024xbf16>
-    %2222 = stablehlo.reshape %2221 : (tensor<256x1024xbf16>) -> tensor<1x256x1024xbf16>
-    %2223 = stablehlo.transpose %2222, dims = [0, 2, 1] : (tensor<1x256x1024xbf16>) -> tensor<1x1024x256xbf16>
-    %2224 = stablehlo.reshape %2223 : (tensor<1x1024x256xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2225 = stablehlo.convolution(%2224, %arg100) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1024 : i64} : (tensor<1x1024x16x16xbf16>, tensor<1024x1x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2226 = stablehlo.reshape %arg101 : (tensor<1024xbf16>) -> tensor<1024x1x1xbf16>
-    %2227 = stablehlo.broadcast_in_dim %2225, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2228 = stablehlo.broadcast_in_dim %2226, dims = [1, 2, 3] : (tensor<1024x1x1xbf16>) -> tensor<1x1024x16x16xbf16>
-    %2229 = stablehlo.add %2227, %2228 : tensor<1x1024x16x16xbf16>
-    %2230 = stablehlo.reshape %2229 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x256xbf16>
-    %2231 = stablehlo.transpose %2230, dims = [0, 2, 1] : (tensor<1x1024x256xbf16>) -> tensor<1x256x1024xbf16>
-    %2232 = stablehlo.multiply %2231, %cst_61 : tensor<1x256x1024xbf16>
-    %2233 = stablehlo.multiply %2231, %2028 : tensor<1x256x1024xbf16>
-    %2234 = stablehlo.convert %2233 : (tensor<1x256x1024xbf16>) -> tensor<1x256x1024xf32>
-    %2235 = stablehlo.clamp %cst_62, %2234, %cst_63 : tensor<1x256x1024xf32>
-    %2236 = stablehlo.multiply %2235, %2235 : tensor<1x256x1024xf32>
-    %2237 = stablehlo.multiply %cst_64, %2236 : tensor<1x256x1024xf32>
-    %2238 = stablehlo.add %2237, %cst_65 : tensor<1x256x1024xf32>
-    %2239 = stablehlo.multiply %2238, %2236 : tensor<1x256x1024xf32>
-    %2240 = stablehlo.add %2239, %cst_66 : tensor<1x256x1024xf32>
-    %2241 = stablehlo.multiply %2240, %2236 : tensor<1x256x1024xf32>
-    %2242 = stablehlo.add %2241, %cst_67 : tensor<1x256x1024xf32>
-    %2243 = stablehlo.multiply %2242, %2236 : tensor<1x256x1024xf32>
-    %2244 = stablehlo.add %2243, %cst_68 : tensor<1x256x1024xf32>
-    %2245 = stablehlo.multiply %2244, %2236 : tensor<1x256x1024xf32>
-    %2246 = stablehlo.add %2245, %cst_69 : tensor<1x256x1024xf32>
-    %2247 = stablehlo.multiply %2246, %2236 : tensor<1x256x1024xf32>
-    %2248 = stablehlo.add %2247, %cst_70 : tensor<1x256x1024xf32>
-    %2249 = stablehlo.multiply %cst_71, %2236 : tensor<1x256x1024xf32>
-    %2250 = stablehlo.add %2249, %cst_72 : tensor<1x256x1024xf32>
-    %2251 = stablehlo.multiply %2250, %2236 : tensor<1x256x1024xf32>
-    %2252 = stablehlo.add %2251, %cst_73 : tensor<1x256x1024xf32>
-    %2253 = stablehlo.multiply %2252, %2236 : tensor<1x256x1024xf32>
-    %2254 = stablehlo.add %2253, %cst_74 : tensor<1x256x1024xf32>
-    %2255 = stablehlo.multiply %2254, %2236 : tensor<1x256x1024xf32>
-    %2256 = stablehlo.add %2255, %cst_75 : tensor<1x256x1024xf32>
-    %2257 = stablehlo.multiply %2235, %2248 : tensor<1x256x1024xf32>
-    %2258 = stablehlo.divide %2257, %2256 : tensor<1x256x1024xf32>
-    %2259 = stablehlo.clamp %cst_76, %2258, %cst_77 : tensor<1x256x1024xf32>
-    %2260 = stablehlo.convert %2259 : (tensor<1x256x1024xf32>) -> tensor<1x256x1024xbf16>
-    %2261 = stablehlo.add %2260, %cst_59 : tensor<1x256x1024xbf16>
-    %2262 = stablehlo.multiply %2261, %2232 : tensor<1x256x1024xbf16>
-    %2263 = stablehlo.reshape %2262 : (tensor<1x256x1024xbf16>) -> tensor<256x1024xbf16>
-    %2264 = stablehlo.dot_general %2263, %arg199, contracting_dims = [1] x [0] : (tensor<256x1024xbf16>, tensor<1024x256xbf16>) -> tensor<256x256xbf16>
-    %2265 = stablehlo.reshape %2264 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2266 = stablehlo.broadcast_in_dim %2265, dims = [0, 1, 2] : (tensor<1x256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2267 = stablehlo.broadcast_in_dim %arg102, dims = [2] : (tensor<256xbf16>) -> tensor<1x256x256xbf16>
-    %2268 = stablehlo.add %2266, %2267 : tensor<1x256x256xbf16>
-    %2269 = stablehlo.reshape %2268 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2270 = stablehlo.reshape %2269 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2271 = stablehlo.add %2270, %2175 : tensor<1x256x256xbf16>
-    %2272 = stablehlo.convert %2271 : (tensor<1x256x256xbf16>) -> tensor<1x256x256xf32>
-    %2273 = stablehlo.convert %2272 : (tensor<1x256x256xf32>) -> tensor<1x256x256xf64>
-    %2274 = stablehlo.reduce(%2273 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2275 = stablehlo.reshape %2274 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2276 = stablehlo.broadcast_in_dim %2275, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2277 = stablehlo.divide %2276, %1824 : tensor<1x256x1xf64>
-    %2278 = stablehlo.broadcast_in_dim %2273, dims = [0, 1, 2] : (tensor<1x256x256xf64>) -> tensor<1x256x256xf64>
-    %2279 = stablehlo.broadcast_in_dim %2277, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x256xf64>
-    %2280 = stablehlo.subtract %2278, %2279 : tensor<1x256x256xf64>
-    %2281 = stablehlo.multiply %2280, %2280 : tensor<1x256x256xf64>
-    %2282 = stablehlo.reduce(%2281 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf64>, tensor<f64>) -> tensor<1x256xf64>
-    %2283 = stablehlo.reshape %2282 : (tensor<1x256xf64>) -> tensor<1x256x1xf64>
-    %2284 = stablehlo.broadcast_in_dim %2283, dims = [0, 1, 2] : (tensor<1x256x1xf64>) -> tensor<1x256x1xf64>
-    %2285 = stablehlo.divide %2284, %1824 : tensor<1x256x1xf64>
-    %2286 = stablehlo.convert %2285 : (tensor<1x256x1xf64>) -> tensor<1x256x1xf32>
-    %2287 = stablehlo.reduce(%2272 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x256x256xf32>, tensor<f32>) -> tensor<1x256xf32>
-    %2288 = stablehlo.reshape %2287 : (tensor<1x256xf32>) -> tensor<1x256x1xf32>
-    %2289 = stablehlo.broadcast_in_dim %2288, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2290 = stablehlo.divide %2289, %1840 : tensor<1x256x1xf32>
-    %2291 = stablehlo.broadcast_in_dim %2286, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x1xf32>
-    %2292 = stablehlo.add %2291, %136 : tensor<1x256x1xf32>
-    %2293 = stablehlo.rsqrt %2292 : tensor<1x256x1xf32>
-    %2294 = stablehlo.broadcast_in_dim %2272, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2295 = stablehlo.broadcast_in_dim %2290, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %2296 = stablehlo.subtract %2294, %2295 : tensor<1x256x256xf32>
-    %2297 = stablehlo.broadcast_in_dim %2296, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2298 = stablehlo.broadcast_in_dim %2293, dims = [0, 1, 2] : (tensor<1x256x1xf32>) -> tensor<1x256x256xf32>
-    %2299 = stablehlo.multiply %2297, %2298 : tensor<1x256x256xf32>
-    %2300 = stablehlo.convert %arg103 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2301 = stablehlo.broadcast_in_dim %2299, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2302 = stablehlo.broadcast_in_dim %2300, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2303 = stablehlo.multiply %2301, %2302 : tensor<1x256x256xf32>
-    %2304 = stablehlo.convert %arg104 : (tensor<256xbf16>) -> tensor<256xf32>
-    %2305 = stablehlo.broadcast_in_dim %2303, dims = [0, 1, 2] : (tensor<1x256x256xf32>) -> tensor<1x256x256xf32>
-    %2306 = stablehlo.broadcast_in_dim %2304, dims = [2] : (tensor<256xf32>) -> tensor<1x256x256xf32>
-    %2307 = stablehlo.add %2305, %2306 : tensor<1x256x256xf32>
-    %2308 = stablehlo.convert %2307 : (tensor<1x256x256xf32>) -> tensor<1x256x256xbf16>
-    %2309 = stablehlo.reshape %2308 : (tensor<1x256x256xbf16>) -> tensor<1x16x16x256xbf16>
-    %2310 = stablehlo.transpose %2309, dims = [0, 3, 1, 2] : (tensor<1x16x16x256xbf16>) -> tensor<1x256x16x16xbf16>
-    %2311 = stablehlo.reshape %607 : (tensor<1x32x128x128xbf16>) -> tensor<1x32x16384xbf16>
-    %2312 = stablehlo.transpose %2311, dims = [0, 2, 1] : (tensor<1x32x16384xbf16>) -> tensor<1x16384x32xbf16>
-    %2313 = stablehlo.reshape %2312 : (tensor<1x16384x32xbf16>) -> tensor<16384x32xbf16>
-    %2314 = stablehlo.dot_general %2313, %arg200, contracting_dims = [1] x [0] : (tensor<16384x32xbf16>, tensor<32x256xbf16>) -> tensor<16384x256xbf16>
-    %2315 = stablehlo.reshape %2314 : (tensor<16384x256xbf16>) -> tensor<1x16384x256xbf16>
-    %2316 = stablehlo.broadcast_in_dim %2315, dims = [0, 1, 2] : (tensor<1x16384x256xbf16>) -> tensor<1x16384x256xbf16>
-    %2317 = stablehlo.broadcast_in_dim %arg105, dims = [2] : (tensor<256xbf16>) -> tensor<1x16384x256xbf16>
-    %2318 = stablehlo.add %2316, %2317 : tensor<1x16384x256xbf16>
-    %2319 = stablehlo.reshape %2318 : (tensor<1x16384x256xbf16>) -> tensor<16384x256xbf16>
-    %2320 = stablehlo.reshape %2319 : (tensor<16384x256xbf16>) -> tensor<1x16384x256xbf16>
-    %2321 = stablehlo.transpose %2320, dims = [0, 2, 1] : (tensor<1x16384x256xbf16>) -> tensor<1x256x16384xbf16>
-    %2322 = stablehlo.reshape %2321 : (tensor<1x256x16384xbf16>) -> tensor<1x256x128x128xbf16>
-    %2323 = stablehlo.transpose %2322, dims = [0, 1, 3, 2] : (tensor<1x256x128x128xbf16>) -> tensor<1x256x128x128xbf16>
-    %2324 = stablehlo.reshape %2323 : (tensor<1x256x128x128xbf16>) -> tensor<256x128x128xbf16>
-    %2325 = stablehlo.broadcast_in_dim %arg201, dims = [0, 1, 2] : (tensor<256x128x128xbf16>) -> tensor<256x128x128xbf16>
-    %2326 = stablehlo.dot_general %2324, %2325, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x128x128xbf16>, tensor<256x128x128xbf16>) -> tensor<256x128x128xbf16>
-    %2327 = stablehlo.reshape %2326 : (tensor<256x128x128xbf16>) -> tensor<1x256x128x128xbf16>
-    %2328 = stablehlo.transpose %2327, dims = [0, 1, 3, 2] : (tensor<1x256x128x128xbf16>) -> tensor<1x256x128x128xbf16>
-    %2329 = stablehlo.reshape %2328 : (tensor<1x256x128x128xbf16>) -> tensor<256x128x128xbf16>
-    %2330 = stablehlo.broadcast_in_dim %arg202, dims = [0, 1, 2] : (tensor<256x128x128xbf16>) -> tensor<256x128x128xbf16>
-    %2331 = stablehlo.dot_general %2329, %2330, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x128x128xbf16>, tensor<256x128x128xbf16>) -> tensor<256x128x128xbf16>
-    %2332 = stablehlo.reshape %2331 : (tensor<256x128x128xbf16>) -> tensor<1x256x128x128xbf16>
-    %2333 = stablehlo.reshape %1208 : (tensor<1x64x64x64xbf16>) -> tensor<1x64x4096xbf16>
-    %2334 = stablehlo.transpose %2333, dims = [0, 2, 1] : (tensor<1x64x4096xbf16>) -> tensor<1x4096x64xbf16>
-    %2335 = stablehlo.reshape %2334 : (tensor<1x4096x64xbf16>) -> tensor<4096x64xbf16>
-    %2336 = stablehlo.dot_general %2335, %arg203, contracting_dims = [1] x [0] : (tensor<4096x64xbf16>, tensor<64x256xbf16>) -> tensor<4096x256xbf16>
-    %2337 = stablehlo.reshape %2336 : (tensor<4096x256xbf16>) -> tensor<1x4096x256xbf16>
-    %2338 = stablehlo.broadcast_in_dim %2337, dims = [0, 1, 2] : (tensor<1x4096x256xbf16>) -> tensor<1x4096x256xbf16>
-    %2339 = stablehlo.broadcast_in_dim %arg106, dims = [2] : (tensor<256xbf16>) -> tensor<1x4096x256xbf16>
-    %2340 = stablehlo.add %2338, %2339 : tensor<1x4096x256xbf16>
-    %2341 = stablehlo.reshape %2340 : (tensor<1x4096x256xbf16>) -> tensor<4096x256xbf16>
-    %2342 = stablehlo.reshape %2341 : (tensor<4096x256xbf16>) -> tensor<1x4096x256xbf16>
-    %2343 = stablehlo.transpose %2342, dims = [0, 2, 1] : (tensor<1x4096x256xbf16>) -> tensor<1x256x4096xbf16>
-    %2344 = stablehlo.reshape %2343 : (tensor<1x256x4096xbf16>) -> tensor<1x256x64x64xbf16>
-    %2345 = stablehlo.transpose %2344, dims = [0, 1, 3, 2] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %2346 = stablehlo.reshape %2345 : (tensor<1x256x64x64xbf16>) -> tensor<256x64x64xbf16>
-    %2347 = stablehlo.broadcast_in_dim %arg204, dims = [0, 1, 2] : (tensor<256x64x128xbf16>) -> tensor<256x64x128xbf16>
-    %2348 = stablehlo.dot_general %2346, %2347, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x64x64xbf16>, tensor<256x64x128xbf16>) -> tensor<256x64x128xbf16>
-    %2349 = stablehlo.reshape %2348 : (tensor<256x64x128xbf16>) -> tensor<1x256x64x128xbf16>
-    %2350 = stablehlo.transpose %2349, dims = [0, 1, 3, 2] : (tensor<1x256x64x128xbf16>) -> tensor<1x256x128x64xbf16>
-    %2351 = stablehlo.reshape %2350 : (tensor<1x256x128x64xbf16>) -> tensor<256x128x64xbf16>
-    %2352 = stablehlo.broadcast_in_dim %arg205, dims = [0, 1, 2] : (tensor<256x64x128xbf16>) -> tensor<256x64x128xbf16>
-    %2353 = stablehlo.dot_general %2351, %2352, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x128x64xbf16>, tensor<256x64x128xbf16>) -> tensor<256x128x128xbf16>
-    %2354 = stablehlo.reshape %2353 : (tensor<256x128x128xbf16>) -> tensor<1x256x128x128xbf16>
-    %2355 = stablehlo.reshape %1809 : (tensor<1x160x32x32xbf16>) -> tensor<1x160x1024xbf16>
-    %2356 = stablehlo.transpose %2355, dims = [0, 2, 1] : (tensor<1x160x1024xbf16>) -> tensor<1x1024x160xbf16>
-    %2357 = stablehlo.reshape %2356 : (tensor<1x1024x160xbf16>) -> tensor<1024x160xbf16>
-    %2358 = stablehlo.dot_general %2357, %arg206, contracting_dims = [1] x [0] : (tensor<1024x160xbf16>, tensor<160x256xbf16>) -> tensor<1024x256xbf16>
-    %2359 = stablehlo.reshape %2358 : (tensor<1024x256xbf16>) -> tensor<1x1024x256xbf16>
-    %2360 = stablehlo.broadcast_in_dim %2359, dims = [0, 1, 2] : (tensor<1x1024x256xbf16>) -> tensor<1x1024x256xbf16>
-    %2361 = stablehlo.broadcast_in_dim %arg107, dims = [2] : (tensor<256xbf16>) -> tensor<1x1024x256xbf16>
-    %2362 = stablehlo.add %2360, %2361 : tensor<1x1024x256xbf16>
-    %2363 = stablehlo.reshape %2362 : (tensor<1x1024x256xbf16>) -> tensor<1024x256xbf16>
-    %2364 = stablehlo.reshape %2363 : (tensor<1024x256xbf16>) -> tensor<1x1024x256xbf16>
-    %2365 = stablehlo.transpose %2364, dims = [0, 2, 1] : (tensor<1x1024x256xbf16>) -> tensor<1x256x1024xbf16>
-    %2366 = stablehlo.reshape %2365 : (tensor<1x256x1024xbf16>) -> tensor<1x256x32x32xbf16>
-    %2367 = stablehlo.transpose %2366, dims = [0, 1, 3, 2] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %2368 = stablehlo.reshape %2367 : (tensor<1x256x32x32xbf16>) -> tensor<256x32x32xbf16>
-    %2369 = stablehlo.broadcast_in_dim %arg207, dims = [0, 1, 2] : (tensor<256x32x128xbf16>) -> tensor<256x32x128xbf16>
-    %2370 = stablehlo.dot_general %2368, %2369, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x32x32xbf16>, tensor<256x32x128xbf16>) -> tensor<256x32x128xbf16>
-    %2371 = stablehlo.reshape %2370 : (tensor<256x32x128xbf16>) -> tensor<1x256x32x128xbf16>
-    %2372 = stablehlo.transpose %2371, dims = [0, 1, 3, 2] : (tensor<1x256x32x128xbf16>) -> tensor<1x256x128x32xbf16>
-    %2373 = stablehlo.reshape %2372 : (tensor<1x256x128x32xbf16>) -> tensor<256x128x32xbf16>
-    %2374 = stablehlo.broadcast_in_dim %arg208, dims = [0, 1, 2] : (tensor<256x32x128xbf16>) -> tensor<256x32x128xbf16>
-    %2375 = stablehlo.dot_general %2373, %2374, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x128x32xbf16>, tensor<256x32x128xbf16>) -> tensor<256x128x128xbf16>
-    %2376 = stablehlo.reshape %2375 : (tensor<256x128x128xbf16>) -> tensor<1x256x128x128xbf16>
-    %2377 = stablehlo.reshape %2310 : (tensor<1x256x16x16xbf16>) -> tensor<1x256x256xbf16>
-    %2378 = stablehlo.transpose %2377, dims = [0, 2, 1] : (tensor<1x256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2379 = stablehlo.reshape %2378 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2380 = stablehlo.dot_general %2379, %arg209, contracting_dims = [1] x [0] : (tensor<256x256xbf16>, tensor<256x256xbf16>) -> tensor<256x256xbf16>
-    %2381 = stablehlo.reshape %2380 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2382 = stablehlo.broadcast_in_dim %2381, dims = [0, 1, 2] : (tensor<1x256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2383 = stablehlo.broadcast_in_dim %arg108, dims = [2] : (tensor<256xbf16>) -> tensor<1x256x256xbf16>
-    %2384 = stablehlo.add %2382, %2383 : tensor<1x256x256xbf16>
-    %2385 = stablehlo.reshape %2384 : (tensor<1x256x256xbf16>) -> tensor<256x256xbf16>
-    %2386 = stablehlo.reshape %2385 : (tensor<256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2387 = stablehlo.transpose %2386, dims = [0, 2, 1] : (tensor<1x256x256xbf16>) -> tensor<1x256x256xbf16>
-    %2388 = stablehlo.reshape %2387 : (tensor<1x256x256xbf16>) -> tensor<1x256x16x16xbf16>
-    %2389 = stablehlo.transpose %2388, dims = [0, 1, 3, 2] : (tensor<1x256x16x16xbf16>) -> tensor<1x256x16x16xbf16>
-    %2390 = stablehlo.reshape %2389 : (tensor<1x256x16x16xbf16>) -> tensor<256x16x16xbf16>
-    %2391 = stablehlo.broadcast_in_dim %arg210, dims = [0, 1, 2] : (tensor<256x16x128xbf16>) -> tensor<256x16x128xbf16>
-    %2392 = stablehlo.dot_general %2390, %2391, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x16x16xbf16>, tensor<256x16x128xbf16>) -> tensor<256x16x128xbf16>
-    %2393 = stablehlo.reshape %2392 : (tensor<256x16x128xbf16>) -> tensor<1x256x16x128xbf16>
-    %2394 = stablehlo.transpose %2393, dims = [0, 1, 3, 2] : (tensor<1x256x16x128xbf16>) -> tensor<1x256x128x16xbf16>
-    %2395 = stablehlo.reshape %2394 : (tensor<1x256x128x16xbf16>) -> tensor<256x128x16xbf16>
-    %2396 = stablehlo.broadcast_in_dim %arg211, dims = [0, 1, 2] : (tensor<256x16x128xbf16>) -> tensor<256x16x128xbf16>
-    %2397 = stablehlo.dot_general %2395, %2396, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x128x16xbf16>, tensor<256x16x128xbf16>) -> tensor<256x128x128xbf16>
-    %2398 = stablehlo.reshape %2397 : (tensor<256x128x128xbf16>) -> tensor<1x256x128x128xbf16>
-    %2399 = stablehlo.concatenate %2398, %2376, %2354, %2332, dim = 1 : (tensor<1x256x128x128xbf16>, tensor<1x256x128x128xbf16>, tensor<1x256x128x128xbf16>, tensor<1x256x128x128xbf16>) -> tensor<1x1024x128x128xbf16>
-    %2400 = stablehlo.convolution(%2399, %arg109) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x128x128xbf16>, tensor<256x1024x1x1xbf16>) -> tensor<1x256x128x128xbf16>
-    %2401 = stablehlo.convert %2400 : (tensor<1x256x128x128xbf16>) -> tensor<1x256x128x128xf32>
-    %2402 = stablehlo.broadcast_in_dim %2401, dims = [0, 1, 2, 3] : (tensor<1x256x128x128xf32>) -> tensor<1x256x128x128xf32>
-    %2403 = stablehlo.broadcast_in_dim %arg212, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x128x128xf32>
-    %2404 = stablehlo.subtract %2402, %2403 : tensor<1x256x128x128xf32>
-    %2405 = stablehlo.broadcast_in_dim %2404, dims = [0, 1, 2, 3] : (tensor<1x256x128x128xf32>) -> tensor<1x256x128x128xf32>
-    %2406 = stablehlo.broadcast_in_dim %arg213, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x128x128xf32>
-    %2407 = stablehlo.multiply %2405, %2406 : tensor<1x256x128x128xf32>
-    %2408 = stablehlo.convert %arg214 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %2409 = stablehlo.broadcast_in_dim %2407, dims = [0, 1, 2, 3] : (tensor<1x256x128x128xf32>) -> tensor<1x256x128x128xf32>
-    %2410 = stablehlo.broadcast_in_dim %2408, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x128x128xf32>
-    %2411 = stablehlo.multiply %2409, %2410 : tensor<1x256x128x128xf32>
-    %2412 = stablehlo.convert %arg215 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %2413 = stablehlo.broadcast_in_dim %2411, dims = [0, 1, 2, 3] : (tensor<1x256x128x128xf32>) -> tensor<1x256x128x128xf32>
-    %2414 = stablehlo.broadcast_in_dim %2412, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x128x128xf32>
-    %2415 = stablehlo.add %2413, %2414 : tensor<1x256x128x128xf32>
-    %2416 = stablehlo.convert %2415 : (tensor<1x256x128x128xf32>) -> tensor<1x256x128x128xbf16>
-    %2417 = stablehlo.maximum %2416, %cst_78 : tensor<1x256x128x128xbf16>
-    %2418 = stablehlo.convolution(%2417, %arg110) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x128x128xbf16>, tensor<150x256x1x1xbf16>) -> tensor<1x150x128x128xbf16>
-    %2419 = stablehlo.reshape %arg111 : (tensor<150xbf16>) -> tensor<150x1x1xbf16>
-    %2420 = stablehlo.broadcast_in_dim %2418, dims = [0, 1, 2, 3] : (tensor<1x150x128x128xbf16>) -> tensor<1x150x128x128xbf16>
-    %2421 = stablehlo.broadcast_in_dim %2419, dims = [1, 2, 3] : (tensor<150x1x1xbf16>) -> tensor<1x150x128x128xbf16>
-    %2422 = stablehlo.add %2420, %2421 : tensor<1x150x128x128xbf16>
-    return %2422 : tensor<1x150x128x128xbf16>
-  }
-}
diff --git a/mlir_tests/SqueezeBERT.mlir b/mlir_tests/SqueezeBERT.mlir
deleted file mode 100644
index af7d9d9b..00000000
--- a/mlir_tests/SqueezeBERT.mlir
+++ /dev/null
@@ -1,76 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x8xi64>, %arg1: tensor<1x8xi64>, %arg2: tensor<1x8xi64>, %arg3: tensor<30528x768xbf16>, %arg4: tensor<2x768xbf16>, %arg5: tensor<768xbf16>, %arg6: tensor<768xbf16>, %arg7: tensor<1x8x768xbf16>) -> (tensor<1x8x768xbf16>, tensor<1x1x1x8xbf16>) {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = arith.constant dense<1.000000e+00> : tensor<1xf64>
-    %cst_2 = arith.constant dense<-3.3895313892515355E+38> : tensor<1xf64>
-    %cst_3 = arith.constant dense<768> : tensor<1xi64>
-    %cst_4 = arith.constant dense<9.9999999999999998E-13> : tensor<1xf64>
-    %0 = stablehlo.reshape %arg1 : (tensor<1x8xi64>) -> tensor<1x1x8xi64>
-    %1 = stablehlo.reshape %0 : (tensor<1x1x8xi64>) -> tensor<1x1x1x8xi64>
-    %2 = stablehlo.convert %1 : (tensor<1x1x1x8xi64>) -> tensor<1x1x1x8xbf16>
-    %3 = stablehlo.convert %cst_1 : (tensor<1xf64>) -> tensor<1xbf16>
-    %4 = stablehlo.reshape %3 : (tensor<1xbf16>) -> tensor<bf16>
-    %5 = stablehlo.broadcast_in_dim %4, dims = [] : (tensor<bf16>) -> tensor<1x1x1x8xbf16>
-    %6 = stablehlo.broadcast_in_dim %2, dims = [0, 1, 2, 3] : (tensor<1x1x1x8xbf16>) -> tensor<1x1x1x8xbf16>
-    %7 = stablehlo.subtract %5, %6 : tensor<1x1x1x8xbf16>
-    %8 = stablehlo.convert %cst_2 : (tensor<1xf64>) -> tensor<1xbf16>
-    %9 = stablehlo.reshape %8 : (tensor<1xbf16>) -> tensor<bf16>
-    %10 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2, 3] : (tensor<1x1x1x8xbf16>) -> tensor<1x1x1x8xbf16>
-    %11 = stablehlo.broadcast_in_dim %9, dims = [] : (tensor<bf16>) -> tensor<1x1x1x8xbf16>
-    %12 = stablehlo.multiply %10, %11 : tensor<1x1x1x8xbf16>
-    %13 = "stablehlo.gather"(%arg3, %arg0) <{dimension_numbers = #stablehlo.gather<offset_dims = [2], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = array<i64: 1, 768>}> : (tensor<30528x768xbf16>, tensor<1x8xi64>) -> tensor<1x8x768xbf16>
-    %14 = stablehlo.convert %13 : tensor<1x8x768xbf16>
-    %15 = "stablehlo.gather"(%arg4, %arg2) <{dimension_numbers = #stablehlo.gather<offset_dims = [2], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = array<i64: 1, 768>}> : (tensor<2x768xbf16>, tensor<1x8xi64>) -> tensor<1x8x768xbf16>
-    %16 = stablehlo.convert %15 : tensor<1x8x768xbf16>
-    %17 = stablehlo.add %14, %arg7 : tensor<1x8x768xbf16>
-    %18 = stablehlo.add %17, %16 : tensor<1x8x768xbf16>
-    %19 = stablehlo.convert %18 : (tensor<1x8x768xbf16>) -> tensor<1x8x768xf32>
-    %20 = stablehlo.convert %19 : (tensor<1x8x768xf32>) -> tensor<1x8x768xf64>
-    %21 = stablehlo.reduce(%20 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x8x768xf64>, tensor<f64>) -> tensor<1x8xf64>
-    %22 = stablehlo.reshape %21 : (tensor<1x8xf64>) -> tensor<1x8x1xf64>
-    %23 = stablehlo.convert %cst_3 : (tensor<1xi64>) -> tensor<1xf64>
-    %24 = stablehlo.reshape %23 : (tensor<1xf64>) -> tensor<f64>
-    %25 = stablehlo.broadcast_in_dim %22, dims = [0, 1, 2] : (tensor<1x8x1xf64>) -> tensor<1x8x1xf64>
-    %26 = stablehlo.broadcast_in_dim %24, dims = [] : (tensor<f64>) -> tensor<1x8x1xf64>
-    %27 = stablehlo.divide %25, %26 : tensor<1x8x1xf64>
-    %28 = stablehlo.broadcast_in_dim %20, dims = [0, 1, 2] : (tensor<1x8x768xf64>) -> tensor<1x8x768xf64>
-    %29 = stablehlo.broadcast_in_dim %27, dims = [0, 1, 2] : (tensor<1x8x1xf64>) -> tensor<1x8x768xf64>
-    %30 = stablehlo.subtract %28, %29 : tensor<1x8x768xf64>
-    %31 = stablehlo.multiply %30, %30 : tensor<1x8x768xf64>
-    %32 = stablehlo.reduce(%31 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x8x768xf64>, tensor<f64>) -> tensor<1x8xf64>
-    %33 = stablehlo.reshape %32 : (tensor<1x8xf64>) -> tensor<1x8x1xf64>
-    %34 = stablehlo.broadcast_in_dim %33, dims = [0, 1, 2] : (tensor<1x8x1xf64>) -> tensor<1x8x1xf64>
-    %35 = stablehlo.divide %34, %26 : tensor<1x8x1xf64>
-    %36 = stablehlo.convert %35 : (tensor<1x8x1xf64>) -> tensor<1x8x1xf32>
-    %37 = stablehlo.reduce(%19 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x8x768xf32>, tensor<f32>) -> tensor<1x8xf32>
-    %38 = stablehlo.reshape %37 : (tensor<1x8xf32>) -> tensor<1x8x1xf32>
-    %39 = stablehlo.convert %cst_3 : (tensor<1xi64>) -> tensor<1xf32>
-    %40 = stablehlo.reshape %39 : (tensor<1xf32>) -> tensor<f32>
-    %41 = stablehlo.broadcast_in_dim %38, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x1xf32>
-    %42 = stablehlo.broadcast_in_dim %40, dims = [] : (tensor<f32>) -> tensor<1x8x1xf32>
-    %43 = stablehlo.divide %41, %42 : tensor<1x8x1xf32>
-    %44 = stablehlo.convert %cst_4 : (tensor<1xf64>) -> tensor<1xf32>
-    %45 = stablehlo.reshape %44 : (tensor<1xf32>) -> tensor<f32>
-    %46 = stablehlo.broadcast_in_dim %36, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x1xf32>
-    %47 = stablehlo.broadcast_in_dim %45, dims = [] : (tensor<f32>) -> tensor<1x8x1xf32>
-    %48 = stablehlo.add %46, %47 : tensor<1x8x1xf32>
-    %49 = stablehlo.rsqrt %48 : tensor<1x8x1xf32>
-    %50 = stablehlo.broadcast_in_dim %19, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %51 = stablehlo.broadcast_in_dim %43, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x768xf32>
-    %52 = stablehlo.subtract %50, %51 : tensor<1x8x768xf32>
-    %53 = stablehlo.broadcast_in_dim %52, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %54 = stablehlo.broadcast_in_dim %49, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x768xf32>
-    %55 = stablehlo.multiply %53, %54 : tensor<1x8x768xf32>
-    %56 = stablehlo.convert %arg5 : (tensor<768xbf16>) -> tensor<768xf32>
-    %57 = stablehlo.broadcast_in_dim %55, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %58 = stablehlo.broadcast_in_dim %56, dims = [2] : (tensor<768xf32>) -> tensor<1x8x768xf32>
-    %59 = stablehlo.multiply %57, %58 : tensor<1x8x768xf32>
-    %60 = stablehlo.convert %arg6 : (tensor<768xbf16>) -> tensor<768xf32>
-    %61 = stablehlo.broadcast_in_dim %59, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %62 = stablehlo.broadcast_in_dim %60, dims = [2] : (tensor<768xf32>) -> tensor<1x8x768xf32>
-    %63 = stablehlo.add %61, %62 : tensor<1x8x768xf32>
-    %64 = stablehlo.convert %63 : (tensor<1x8x768xf32>) -> tensor<1x8x768xbf16>
-    return %64, %12 : tensor<1x8x768xbf16>, tensor<1x1x1x8xbf16>
-  }
-}
diff --git a/mlir_tests/ViLT.mlir b/mlir_tests/ViLT.mlir
deleted file mode 100644
index 284481e8..00000000
--- a/mlir_tests/ViLT.mlir
+++ /dev/null
@@ -1,61 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x8xi64>, %arg1: tensor<1x8xi64>, %arg2: tensor<30522x768xbf16>, %arg3: tensor<2x768xbf16>, %arg4: tensor<768xbf16>, %arg5: tensor<768xbf16>, %arg6: tensor<1x8x768xbf16>) -> tensor<1x8x768xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = arith.constant dense<768> : tensor<1xi64>
-    %cst_2 = arith.constant dense<9.9999999999999998E-13> : tensor<1xf64>
-    %0 = "stablehlo.gather"(%arg2, %arg0) <{dimension_numbers = #stablehlo.gather<offset_dims = [2], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = array<i64: 1, 768>}> : (tensor<30522x768xbf16>, tensor<1x8xi64>) -> tensor<1x8x768xbf16>
-    %1 = stablehlo.convert %0 : tensor<1x8x768xbf16>
-    %2 = "stablehlo.gather"(%arg3, %arg1) <{dimension_numbers = #stablehlo.gather<offset_dims = [2], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = array<i64: 1, 768>}> : (tensor<2x768xbf16>, tensor<1x8xi64>) -> tensor<1x8x768xbf16>
-    %3 = stablehlo.convert %2 : tensor<1x8x768xbf16>
-    %4 = stablehlo.add %1, %3 : tensor<1x8x768xbf16>
-    %5 = stablehlo.add %4, %arg6 : tensor<1x8x768xbf16>
-    %6 = stablehlo.convert %5 : (tensor<1x8x768xbf16>) -> tensor<1x8x768xf32>
-    %7 = stablehlo.convert %6 : (tensor<1x8x768xf32>) -> tensor<1x8x768xf64>
-    %8 = stablehlo.reduce(%7 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x8x768xf64>, tensor<f64>) -> tensor<1x8xf64>
-    %9 = stablehlo.reshape %8 : (tensor<1x8xf64>) -> tensor<1x8x1xf64>
-    %10 = stablehlo.convert %cst_1 : (tensor<1xi64>) -> tensor<1xf64>
-    %11 = stablehlo.reshape %10 : (tensor<1xf64>) -> tensor<f64>
-    %12 = stablehlo.broadcast_in_dim %9, dims = [0, 1, 2] : (tensor<1x8x1xf64>) -> tensor<1x8x1xf64>
-    %13 = stablehlo.broadcast_in_dim %11, dims = [] : (tensor<f64>) -> tensor<1x8x1xf64>
-    %14 = stablehlo.divide %12, %13 : tensor<1x8x1xf64>
-    %15 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2] : (tensor<1x8x768xf64>) -> tensor<1x8x768xf64>
-    %16 = stablehlo.broadcast_in_dim %14, dims = [0, 1, 2] : (tensor<1x8x1xf64>) -> tensor<1x8x768xf64>
-    %17 = stablehlo.subtract %15, %16 : tensor<1x8x768xf64>
-    %18 = stablehlo.multiply %17, %17 : tensor<1x8x768xf64>
-    %19 = stablehlo.reduce(%18 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x8x768xf64>, tensor<f64>) -> tensor<1x8xf64>
-    %20 = stablehlo.reshape %19 : (tensor<1x8xf64>) -> tensor<1x8x1xf64>
-    %21 = stablehlo.broadcast_in_dim %20, dims = [0, 1, 2] : (tensor<1x8x1xf64>) -> tensor<1x8x1xf64>
-    %22 = stablehlo.divide %21, %13 : tensor<1x8x1xf64>
-    %23 = stablehlo.convert %22 : (tensor<1x8x1xf64>) -> tensor<1x8x1xf32>
-    %24 = stablehlo.reduce(%6 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x8x768xf32>, tensor<f32>) -> tensor<1x8xf32>
-    %25 = stablehlo.reshape %24 : (tensor<1x8xf32>) -> tensor<1x8x1xf32>
-    %26 = stablehlo.convert %cst_1 : (tensor<1xi64>) -> tensor<1xf32>
-    %27 = stablehlo.reshape %26 : (tensor<1xf32>) -> tensor<f32>
-    %28 = stablehlo.broadcast_in_dim %25, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x1xf32>
-    %29 = stablehlo.broadcast_in_dim %27, dims = [] : (tensor<f32>) -> tensor<1x8x1xf32>
-    %30 = stablehlo.divide %28, %29 : tensor<1x8x1xf32>
-    %31 = stablehlo.convert %cst_2 : (tensor<1xf64>) -> tensor<1xf32>
-    %32 = stablehlo.reshape %31 : (tensor<1xf32>) -> tensor<f32>
-    %33 = stablehlo.broadcast_in_dim %23, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x1xf32>
-    %34 = stablehlo.broadcast_in_dim %32, dims = [] : (tensor<f32>) -> tensor<1x8x1xf32>
-    %35 = stablehlo.add %33, %34 : tensor<1x8x1xf32>
-    %36 = stablehlo.rsqrt %35 : tensor<1x8x1xf32>
-    %37 = stablehlo.broadcast_in_dim %6, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %38 = stablehlo.broadcast_in_dim %30, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x768xf32>
-    %39 = stablehlo.subtract %37, %38 : tensor<1x8x768xf32>
-    %40 = stablehlo.broadcast_in_dim %39, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %41 = stablehlo.broadcast_in_dim %36, dims = [0, 1, 2] : (tensor<1x8x1xf32>) -> tensor<1x8x768xf32>
-    %42 = stablehlo.multiply %40, %41 : tensor<1x8x768xf32>
-    %43 = stablehlo.convert %arg4 : (tensor<768xbf16>) -> tensor<768xf32>
-    %44 = stablehlo.broadcast_in_dim %42, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %45 = stablehlo.broadcast_in_dim %43, dims = [2] : (tensor<768xf32>) -> tensor<1x8x768xf32>
-    %46 = stablehlo.multiply %44, %45 : tensor<1x8x768xf32>
-    %47 = stablehlo.convert %arg5 : (tensor<768xbf16>) -> tensor<768xf32>
-    %48 = stablehlo.broadcast_in_dim %46, dims = [0, 1, 2] : (tensor<1x8x768xf32>) -> tensor<1x8x768xf32>
-    %49 = stablehlo.broadcast_in_dim %47, dims = [2] : (tensor<768xf32>) -> tensor<1x8x768xf32>
-    %50 = stablehlo.add %48, %49 : tensor<1x8x768xf32>
-    %51 = stablehlo.convert %50 : (tensor<1x8x768xf32>) -> tensor<1x8x768xbf16>
-    return %51 : tensor<1x8x768xbf16>
-  }
-}
diff --git a/mlir_tests/YOLOv3.mlir b/mlir_tests/YOLOv3.mlir
deleted file mode 100644
index 74da6991..00000000
--- a/mlir_tests/YOLOv3.mlir
+++ /dev/null
@@ -1,1752 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x512x512xbf16>, %arg1: tensor<32x3x3x3xbf16>, %arg2: tensor<64x32x3x3xbf16>, %arg3: tensor<32x64x1x1xbf16>, %arg4: tensor<64x32x3x3xbf16>, %arg5: tensor<128x64x3x3xbf16>, %arg6: tensor<64x128x1x1xbf16>, %arg7: tensor<128x64x3x3xbf16>, %arg8: tensor<64x128x1x1xbf16>, %arg9: tensor<128x64x3x3xbf16>, %arg10: tensor<256x128x3x3xbf16>, %arg11: tensor<128x256x1x1xbf16>, %arg12: tensor<256x128x3x3xbf16>, %arg13: tensor<128x256x1x1xbf16>, %arg14: tensor<256x128x3x3xbf16>, %arg15: tensor<128x256x1x1xbf16>, %arg16: tensor<256x128x3x3xbf16>, %arg17: tensor<128x256x1x1xbf16>, %arg18: tensor<256x128x3x3xbf16>, %arg19: tensor<128x256x1x1xbf16>, %arg20: tensor<256x128x3x3xbf16>, %arg21: tensor<128x256x1x1xbf16>, %arg22: tensor<256x128x3x3xbf16>, %arg23: tensor<128x256x1x1xbf16>, %arg24: tensor<256x128x3x3xbf16>, %arg25: tensor<128x256x1x1xbf16>, %arg26: tensor<256x128x3x3xbf16>, %arg27: tensor<512x256x3x3xbf16>, %arg28: tensor<256x512x1x1xbf16>, %arg29: tensor<512x256x3x3xbf16>, %arg30: tensor<256x512x1x1xbf16>, %arg31: tensor<512x256x3x3xbf16>, %arg32: tensor<256x512x1x1xbf16>, %arg33: tensor<512x256x3x3xbf16>, %arg34: tensor<256x512x1x1xbf16>, %arg35: tensor<512x256x3x3xbf16>, %arg36: tensor<256x512x1x1xbf16>, %arg37: tensor<512x256x3x3xbf16>, %arg38: tensor<256x512x1x1xbf16>, %arg39: tensor<512x256x3x3xbf16>, %arg40: tensor<256x512x1x1xbf16>, %arg41: tensor<512x256x3x3xbf16>, %arg42: tensor<256x512x1x1xbf16>, %arg43: tensor<512x256x3x3xbf16>, %arg44: tensor<1024x512x3x3xbf16>, %arg45: tensor<512x1024x1x1xbf16>, %arg46: tensor<1024x512x3x3xbf16>, %arg47: tensor<512x1024x1x1xbf16>, %arg48: tensor<1024x512x3x3xbf16>, %arg49: tensor<512x1024x1x1xbf16>, %arg50: tensor<1024x512x3x3xbf16>, %arg51: tensor<512x1024x1x1xbf16>, %arg52: tensor<1024x512x3x3xbf16>, %arg53: tensor<512x1024x1x1xbf16>, %arg54: tensor<1024x512x3x3xbf16>, %arg55: tensor<512x1024x1x1xbf16>, %arg56: tensor<1024x512x3x3xbf16>, %arg57: tensor<512x1024x1x1xbf16>, %arg58: tensor<1024x512x3x3xbf16>, %arg59: tensor<255x1024x1x1xbf16>, %arg60: tensor<255xbf16>, %arg61: tensor<256x512x1x1xbf16>, %arg62: tensor<256x768x1x1xbf16>, %arg63: tensor<512x256x3x3xbf16>, %arg64: tensor<256x512x1x1xbf16>, %arg65: tensor<512x256x3x3xbf16>, %arg66: tensor<256x512x1x1xbf16>, %arg67: tensor<512x256x3x3xbf16>, %arg68: tensor<255x512x1x1xbf16>, %arg69: tensor<255xbf16>, %arg70: tensor<128x256x1x1xbf16>, %arg71: tensor<128x384x1x1xbf16>, %arg72: tensor<256x128x3x3xbf16>, %arg73: tensor<128x256x1x1xbf16>, %arg74: tensor<256x128x3x3xbf16>, %arg75: tensor<128x256x1x1xbf16>, %arg76: tensor<256x128x3x3xbf16>, %arg77: tensor<255x256x1x1xbf16>, %arg78: tensor<255xbf16>, %arg79: tensor<32x1x1xf32>, %arg80: tensor<32x1x1xf32>, %arg81: tensor<32x1x1xbf16>, %arg82: tensor<32x1x1xbf16>, %arg83: tensor<64x1x1xf32>, %arg84: tensor<64x1x1xf32>, %arg85: tensor<64x1x1xbf16>, %arg86: tensor<64x1x1xbf16>, %arg87: tensor<32x1x1xf32>, %arg88: tensor<32x1x1xf32>, %arg89: tensor<32x1x1xbf16>, %arg90: tensor<32x1x1xbf16>, %arg91: tensor<64x1x1xf32>, %arg92: tensor<64x1x1xf32>, %arg93: tensor<64x1x1xbf16>, %arg94: tensor<64x1x1xbf16>, %arg95: tensor<128x1x1xf32>, %arg96: tensor<128x1x1xf32>, %arg97: tensor<128x1x1xbf16>, %arg98: tensor<128x1x1xbf16>, %arg99: tensor<64x1x1xf32>, %arg100: tensor<64x1x1xf32>, %arg101: tensor<64x1x1xbf16>, %arg102: tensor<64x1x1xbf16>, %arg103: tensor<128x1x1xf32>, %arg104: tensor<128x1x1xf32>, %arg105: tensor<128x1x1xbf16>, %arg106: tensor<128x1x1xbf16>, %arg107: tensor<64x1x1xf32>, %arg108: tensor<64x1x1xf32>, %arg109: tensor<64x1x1xbf16>, %arg110: tensor<64x1x1xbf16>, %arg111: tensor<128x1x1xf32>, %arg112: tensor<128x1x1xf32>, %arg113: tensor<128x1x1xbf16>, %arg114: tensor<128x1x1xbf16>, %arg115: tensor<256x1x1xf32>, %arg116: tensor<256x1x1xf32>, %arg117: tensor<256x1x1xbf16>, %arg118: tensor<256x1x1xbf16>, %arg119: tensor<128x1x1xf32>, %arg120: tensor<128x1x1xf32>, %arg121: tensor<128x1x1xbf16>, %arg122: tensor<128x1x1xbf16>, %arg123: tensor<256x1x1xf32>, %arg124: tensor<256x1x1xf32>, %arg125: tensor<256x1x1xbf16>, %arg126: tensor<256x1x1xbf16>, %arg127: tensor<128x1x1xf32>, %arg128: tensor<128x1x1xf32>, %arg129: tensor<128x1x1xbf16>, %arg130: tensor<128x1x1xbf16>, %arg131: tensor<256x1x1xf32>, %arg132: tensor<256x1x1xf32>, %arg133: tensor<256x1x1xbf16>, %arg134: tensor<256x1x1xbf16>, %arg135: tensor<128x1x1xf32>, %arg136: tensor<128x1x1xf32>, %arg137: tensor<128x1x1xbf16>, %arg138: tensor<128x1x1xbf16>, %arg139: tensor<256x1x1xf32>, %arg140: tensor<256x1x1xf32>, %arg141: tensor<256x1x1xbf16>, %arg142: tensor<256x1x1xbf16>, %arg143: tensor<128x1x1xf32>, %arg144: tensor<128x1x1xf32>, %arg145: tensor<128x1x1xbf16>, %arg146: tensor<128x1x1xbf16>, %arg147: tensor<256x1x1xf32>, %arg148: tensor<256x1x1xf32>, %arg149: tensor<256x1x1xbf16>, %arg150: tensor<256x1x1xbf16>, %arg151: tensor<128x1x1xf32>, %arg152: tensor<128x1x1xf32>, %arg153: tensor<128x1x1xbf16>, %arg154: tensor<128x1x1xbf16>, %arg155: tensor<256x1x1xf32>, %arg156: tensor<256x1x1xf32>, %arg157: tensor<256x1x1xbf16>, %arg158: tensor<256x1x1xbf16>, %arg159: tensor<128x1x1xf32>, %arg160: tensor<128x1x1xf32>, %arg161: tensor<128x1x1xbf16>, %arg162: tensor<128x1x1xbf16>, %arg163: tensor<256x1x1xf32>, %arg164: tensor<256x1x1xf32>, %arg165: tensor<256x1x1xbf16>, %arg166: tensor<256x1x1xbf16>, %arg167: tensor<128x1x1xf32>, %arg168: tensor<128x1x1xf32>, %arg169: tensor<128x1x1xbf16>, %arg170: tensor<128x1x1xbf16>, %arg171: tensor<256x1x1xf32>, %arg172: tensor<256x1x1xf32>, %arg173: tensor<256x1x1xbf16>, %arg174: tensor<256x1x1xbf16>, %arg175: tensor<128x1x1xf32>, %arg176: tensor<128x1x1xf32>, %arg177: tensor<128x1x1xbf16>, %arg178: tensor<128x1x1xbf16>, %arg179: tensor<256x1x1xf32>, %arg180: tensor<256x1x1xf32>, %arg181: tensor<256x1x1xbf16>, %arg182: tensor<256x1x1xbf16>, %arg183: tensor<512x1x1xf32>, %arg184: tensor<512x1x1xf32>, %arg185: tensor<512x1x1xbf16>, %arg186: tensor<512x1x1xbf16>, %arg187: tensor<256x1x1xf32>, %arg188: tensor<256x1x1xf32>, %arg189: tensor<256x1x1xbf16>, %arg190: tensor<256x1x1xbf16>, %arg191: tensor<512x1x1xf32>, %arg192: tensor<512x1x1xf32>, %arg193: tensor<512x1x1xbf16>, %arg194: tensor<512x1x1xbf16>, %arg195: tensor<256x1x1xf32>, %arg196: tensor<256x1x1xf32>, %arg197: tensor<256x1x1xbf16>, %arg198: tensor<256x1x1xbf16>, %arg199: tensor<512x1x1xf32>, %arg200: tensor<512x1x1xf32>, %arg201: tensor<512x1x1xbf16>, %arg202: tensor<512x1x1xbf16>, %arg203: tensor<256x1x1xf32>, %arg204: tensor<256x1x1xf32>, %arg205: tensor<256x1x1xbf16>, %arg206: tensor<256x1x1xbf16>, %arg207: tensor<512x1x1xf32>, %arg208: tensor<512x1x1xf32>, %arg209: tensor<512x1x1xbf16>, %arg210: tensor<512x1x1xbf16>, %arg211: tensor<256x1x1xf32>, %arg212: tensor<256x1x1xf32>, %arg213: tensor<256x1x1xbf16>, %arg214: tensor<256x1x1xbf16>, %arg215: tensor<512x1x1xf32>, %arg216: tensor<512x1x1xf32>, %arg217: tensor<512x1x1xbf16>, %arg218: tensor<512x1x1xbf16>, %arg219: tensor<256x1x1xf32>, %arg220: tensor<256x1x1xf32>, %arg221: tensor<256x1x1xbf16>, %arg222: tensor<256x1x1xbf16>, %arg223: tensor<512x1x1xf32>, %arg224: tensor<512x1x1xf32>, %arg225: tensor<512x1x1xbf16>, %arg226: tensor<512x1x1xbf16>, %arg227: tensor<256x1x1xf32>, %arg228: tensor<256x1x1xf32>, %arg229: tensor<256x1x1xbf16>, %arg230: tensor<256x1x1xbf16>, %arg231: tensor<512x1x1xf32>, %arg232: tensor<512x1x1xf32>, %arg233: tensor<512x1x1xbf16>, %arg234: tensor<512x1x1xbf16>, %arg235: tensor<256x1x1xf32>, %arg236: tensor<256x1x1xf32>, %arg237: tensor<256x1x1xbf16>, %arg238: tensor<256x1x1xbf16>, %arg239: tensor<512x1x1xf32>, %arg240: tensor<512x1x1xf32>, %arg241: tensor<512x1x1xbf16>, %arg242: tensor<512x1x1xbf16>, %arg243: tensor<256x1x1xf32>, %arg244: tensor<256x1x1xf32>, %arg245: tensor<256x1x1xbf16>, %arg246: tensor<256x1x1xbf16>, %arg247: tensor<512x1x1xf32>, %arg248: tensor<512x1x1xf32>, %arg249: tensor<512x1x1xbf16>, %arg250: tensor<512x1x1xbf16>, %arg251: tensor<1024x1x1xf32>, %arg252: tensor<1024x1x1xf32>, %arg253: tensor<1024x1x1xbf16>, %arg254: tensor<1024x1x1xbf16>, %arg255: tensor<512x1x1xf32>, %arg256: tensor<512x1x1xf32>, %arg257: tensor<512x1x1xbf16>, %arg258: tensor<512x1x1xbf16>, %arg259: tensor<1024x1x1xf32>, %arg260: tensor<1024x1x1xf32>, %arg261: tensor<1024x1x1xbf16>, %arg262: tensor<1024x1x1xbf16>, %arg263: tensor<512x1x1xf32>, %arg264: tensor<512x1x1xf32>, %arg265: tensor<512x1x1xbf16>, %arg266: tensor<512x1x1xbf16>, %arg267: tensor<1024x1x1xf32>, %arg268: tensor<1024x1x1xf32>, %arg269: tensor<1024x1x1xbf16>, %arg270: tensor<1024x1x1xbf16>, %arg271: tensor<512x1x1xf32>, %arg272: tensor<512x1x1xf32>, %arg273: tensor<512x1x1xbf16>, %arg274: tensor<512x1x1xbf16>, %arg275: tensor<1024x1x1xf32>, %arg276: tensor<1024x1x1xf32>, %arg277: tensor<1024x1x1xbf16>, %arg278: tensor<1024x1x1xbf16>, %arg279: tensor<512x1x1xf32>, %arg280: tensor<512x1x1xf32>, %arg281: tensor<512x1x1xbf16>, %arg282: tensor<512x1x1xbf16>, %arg283: tensor<1024x1x1xf32>, %arg284: tensor<1024x1x1xf32>, %arg285: tensor<1024x1x1xbf16>, %arg286: tensor<1024x1x1xbf16>, %arg287: tensor<512x1x1xf32>, %arg288: tensor<512x1x1xf32>, %arg289: tensor<512x1x1xbf16>, %arg290: tensor<512x1x1xbf16>, %arg291: tensor<1024x1x1xf32>, %arg292: tensor<1024x1x1xf32>, %arg293: tensor<1024x1x1xbf16>, %arg294: tensor<1024x1x1xbf16>, %arg295: tensor<512x1x1xf32>, %arg296: tensor<512x1x1xf32>, %arg297: tensor<512x1x1xbf16>, %arg298: tensor<512x1x1xbf16>, %arg299: tensor<1024x1x1xf32>, %arg300: tensor<1024x1x1xf32>, %arg301: tensor<1024x1x1xbf16>, %arg302: tensor<1024x1x1xbf16>, %arg303: tensor<512x1x1xf32>, %arg304: tensor<512x1x1xf32>, %arg305: tensor<512x1x1xbf16>, %arg306: tensor<512x1x1xbf16>, %arg307: tensor<1024x1x1xf32>, %arg308: tensor<1024x1x1xf32>, %arg309: tensor<1024x1x1xbf16>, %arg310: tensor<1024x1x1xbf16>, %arg311: tensor<256x1x1xf32>, %arg312: tensor<256x1x1xf32>, %arg313: tensor<256x1x1xbf16>, %arg314: tensor<256x1x1xbf16>, %arg315: tensor<256x16x32xbf16>, %arg316: tensor<256x16x32xbf16>, %arg317: tensor<256x1x1xf32>, %arg318: tensor<256x1x1xf32>, %arg319: tensor<256x1x1xbf16>, %arg320: tensor<256x1x1xbf16>, %arg321: tensor<512x1x1xf32>, %arg322: tensor<512x1x1xf32>, %arg323: tensor<512x1x1xbf16>, %arg324: tensor<512x1x1xbf16>, %arg325: tensor<256x1x1xf32>, %arg326: tensor<256x1x1xf32>, %arg327: tensor<256x1x1xbf16>, %arg328: tensor<256x1x1xbf16>, %arg329: tensor<512x1x1xf32>, %arg330: tensor<512x1x1xf32>, %arg331: tensor<512x1x1xbf16>, %arg332: tensor<512x1x1xbf16>, %arg333: tensor<256x1x1xf32>, %arg334: tensor<256x1x1xf32>, %arg335: tensor<256x1x1xbf16>, %arg336: tensor<256x1x1xbf16>, %arg337: tensor<512x1x1xf32>, %arg338: tensor<512x1x1xf32>, %arg339: tensor<512x1x1xbf16>, %arg340: tensor<512x1x1xbf16>, %arg341: tensor<128x1x1xf32>, %arg342: tensor<128x1x1xf32>, %arg343: tensor<128x1x1xbf16>, %arg344: tensor<128x1x1xbf16>, %arg345: tensor<128x32x64xbf16>, %arg346: tensor<128x32x64xbf16>, %arg347: tensor<128x1x1xf32>, %arg348: tensor<128x1x1xf32>, %arg349: tensor<128x1x1xbf16>, %arg350: tensor<128x1x1xbf16>, %arg351: tensor<256x1x1xf32>, %arg352: tensor<256x1x1xf32>, %arg353: tensor<256x1x1xbf16>, %arg354: tensor<256x1x1xbf16>, %arg355: tensor<128x1x1xf32>, %arg356: tensor<128x1x1xf32>, %arg357: tensor<128x1x1xbf16>, %arg358: tensor<128x1x1xbf16>, %arg359: tensor<256x1x1xf32>, %arg360: tensor<256x1x1xf32>, %arg361: tensor<256x1x1xbf16>, %arg362: tensor<256x1x1xbf16>, %arg363: tensor<128x1x1xf32>, %arg364: tensor<128x1x1xf32>, %arg365: tensor<128x1x1xbf16>, %arg366: tensor<128x1x1xbf16>, %arg367: tensor<256x1x1xf32>, %arg368: tensor<256x1x1xf32>, %arg369: tensor<256x1x1xbf16>, %arg370: tensor<256x1x1xbf16>) -> (tensor<1x255x16x16xbf16>, tensor<1x255x32x32xbf16>, tensor<1x255x64x64xbf16>) {
-    %c = stablehlo.constant dense<0> : tensor<i64>
-    %cst = arith.constant dense<1.000000e-01> : tensor<1xf64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x512x512xbf16>, tensor<32x3x3x3xbf16>) -> tensor<1x32x512x512xbf16>
-    %1 = stablehlo.convert %0 : (tensor<1x32x512x512xbf16>) -> tensor<1x32x512x512xf32>
-    %2 = stablehlo.broadcast_in_dim %1, dims = [0, 1, 2, 3] : (tensor<1x32x512x512xf32>) -> tensor<1x32x512x512xf32>
-    %3 = stablehlo.broadcast_in_dim %arg79, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x512x512xf32>
-    %4 = stablehlo.subtract %2, %3 : tensor<1x32x512x512xf32>
-    %5 = stablehlo.broadcast_in_dim %4, dims = [0, 1, 2, 3] : (tensor<1x32x512x512xf32>) -> tensor<1x32x512x512xf32>
-    %6 = stablehlo.broadcast_in_dim %arg80, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x512x512xf32>
-    %7 = stablehlo.multiply %5, %6 : tensor<1x32x512x512xf32>
-    %8 = stablehlo.convert %arg81 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %9 = stablehlo.broadcast_in_dim %7, dims = [0, 1, 2, 3] : (tensor<1x32x512x512xf32>) -> tensor<1x32x512x512xf32>
-    %10 = stablehlo.broadcast_in_dim %8, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x512x512xf32>
-    %11 = stablehlo.multiply %9, %10 : tensor<1x32x512x512xf32>
-    %12 = stablehlo.convert %arg82 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %13 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2, 3] : (tensor<1x32x512x512xf32>) -> tensor<1x32x512x512xf32>
-    %14 = stablehlo.broadcast_in_dim %12, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x512x512xf32>
-    %15 = stablehlo.add %13, %14 : tensor<1x32x512x512xf32>
-    %16 = stablehlo.convert %15 : (tensor<1x32x512x512xf32>) -> tensor<1x32x512x512xbf16>
-    %17 = stablehlo.convert %c : (tensor<i64>) -> tensor<bf16>
-    %18 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x32x512x512xbf16>
-    %19 = stablehlo.broadcast_in_dim %16, dims = [0, 1, 2, 3] : (tensor<1x32x512x512xbf16>) -> tensor<1x32x512x512xbf16>
-    %20 = stablehlo.maximum %18, %19 : tensor<1x32x512x512xbf16>
-    %21 = stablehlo.minimum %18, %19 : tensor<1x32x512x512xbf16>
-    %22 = stablehlo.convert %cst : (tensor<1xf64>) -> tensor<1xbf16>
-    %23 = stablehlo.reshape %22 : (tensor<1xbf16>) -> tensor<bf16>
-    %24 = stablehlo.broadcast_in_dim %21, dims = [0, 1, 2, 3] : (tensor<1x32x512x512xbf16>) -> tensor<1x32x512x512xbf16>
-    %25 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x32x512x512xbf16>
-    %26 = stablehlo.multiply %24, %25 : tensor<1x32x512x512xbf16>
-    %27 = stablehlo.add %20, %26 : tensor<1x32x512x512xbf16>
-    %28 = stablehlo.convolution(%27, %arg2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x512x512xbf16>, tensor<64x32x3x3xbf16>) -> tensor<1x64x256x256xbf16>
-    %29 = stablehlo.convert %28 : (tensor<1x64x256x256xbf16>) -> tensor<1x64x256x256xf32>
-    %30 = stablehlo.broadcast_in_dim %29, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %31 = stablehlo.broadcast_in_dim %arg83, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %32 = stablehlo.subtract %30, %31 : tensor<1x64x256x256xf32>
-    %33 = stablehlo.broadcast_in_dim %32, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %34 = stablehlo.broadcast_in_dim %arg84, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %35 = stablehlo.multiply %33, %34 : tensor<1x64x256x256xf32>
-    %36 = stablehlo.convert %arg85 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %37 = stablehlo.broadcast_in_dim %35, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %38 = stablehlo.broadcast_in_dim %36, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %39 = stablehlo.multiply %37, %38 : tensor<1x64x256x256xf32>
-    %40 = stablehlo.convert %arg86 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %41 = stablehlo.broadcast_in_dim %39, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %42 = stablehlo.broadcast_in_dim %40, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %43 = stablehlo.add %41, %42 : tensor<1x64x256x256xf32>
-    %44 = stablehlo.convert %43 : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xbf16>
-    %45 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x64x256x256xbf16>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xbf16>) -> tensor<1x64x256x256xbf16>
-    %47 = stablehlo.maximum %45, %46 : tensor<1x64x256x256xbf16>
-    %48 = stablehlo.minimum %45, %46 : tensor<1x64x256x256xbf16>
-    %49 = stablehlo.broadcast_in_dim %48, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xbf16>) -> tensor<1x64x256x256xbf16>
-    %50 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x64x256x256xbf16>
-    %51 = stablehlo.multiply %49, %50 : tensor<1x64x256x256xbf16>
-    %52 = stablehlo.add %47, %51 : tensor<1x64x256x256xbf16>
-    %53 = stablehlo.convolution(%52, %arg3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x256x256xbf16>, tensor<32x64x1x1xbf16>) -> tensor<1x32x256x256xbf16>
-    %54 = stablehlo.convert %53 : (tensor<1x32x256x256xbf16>) -> tensor<1x32x256x256xf32>
-    %55 = stablehlo.broadcast_in_dim %54, dims = [0, 1, 2, 3] : (tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xf32>
-    %56 = stablehlo.broadcast_in_dim %arg87, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x256x256xf32>
-    %57 = stablehlo.subtract %55, %56 : tensor<1x32x256x256xf32>
-    %58 = stablehlo.broadcast_in_dim %57, dims = [0, 1, 2, 3] : (tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xf32>
-    %59 = stablehlo.broadcast_in_dim %arg88, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x256x256xf32>
-    %60 = stablehlo.multiply %58, %59 : tensor<1x32x256x256xf32>
-    %61 = stablehlo.convert %arg89 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %62 = stablehlo.broadcast_in_dim %60, dims = [0, 1, 2, 3] : (tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xf32>
-    %63 = stablehlo.broadcast_in_dim %61, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x256x256xf32>
-    %64 = stablehlo.multiply %62, %63 : tensor<1x32x256x256xf32>
-    %65 = stablehlo.convert %arg90 : (tensor<32x1x1xbf16>) -> tensor<32x1x1xf32>
-    %66 = stablehlo.broadcast_in_dim %64, dims = [0, 1, 2, 3] : (tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xf32>
-    %67 = stablehlo.broadcast_in_dim %65, dims = [1, 2, 3] : (tensor<32x1x1xf32>) -> tensor<1x32x256x256xf32>
-    %68 = stablehlo.add %66, %67 : tensor<1x32x256x256xf32>
-    %69 = stablehlo.convert %68 : (tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xbf16>
-    %70 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x32x256x256xbf16>
-    %71 = stablehlo.broadcast_in_dim %69, dims = [0, 1, 2, 3] : (tensor<1x32x256x256xbf16>) -> tensor<1x32x256x256xbf16>
-    %72 = stablehlo.maximum %70, %71 : tensor<1x32x256x256xbf16>
-    %73 = stablehlo.minimum %70, %71 : tensor<1x32x256x256xbf16>
-    %74 = stablehlo.broadcast_in_dim %73, dims = [0, 1, 2, 3] : (tensor<1x32x256x256xbf16>) -> tensor<1x32x256x256xbf16>
-    %75 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x32x256x256xbf16>
-    %76 = stablehlo.multiply %74, %75 : tensor<1x32x256x256xbf16>
-    %77 = stablehlo.add %72, %76 : tensor<1x32x256x256xbf16>
-    %78 = stablehlo.convolution(%77, %arg4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x256x256xbf16>, tensor<64x32x3x3xbf16>) -> tensor<1x64x256x256xbf16>
-    %79 = stablehlo.convert %78 : (tensor<1x64x256x256xbf16>) -> tensor<1x64x256x256xf32>
-    %80 = stablehlo.broadcast_in_dim %79, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %81 = stablehlo.broadcast_in_dim %arg91, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %82 = stablehlo.subtract %80, %81 : tensor<1x64x256x256xf32>
-    %83 = stablehlo.broadcast_in_dim %82, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %84 = stablehlo.broadcast_in_dim %arg92, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %85 = stablehlo.multiply %83, %84 : tensor<1x64x256x256xf32>
-    %86 = stablehlo.convert %arg93 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %87 = stablehlo.broadcast_in_dim %85, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %88 = stablehlo.broadcast_in_dim %86, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %89 = stablehlo.multiply %87, %88 : tensor<1x64x256x256xf32>
-    %90 = stablehlo.convert %arg94 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %91 = stablehlo.broadcast_in_dim %89, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xf32>
-    %92 = stablehlo.broadcast_in_dim %90, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x256x256xf32>
-    %93 = stablehlo.add %91, %92 : tensor<1x64x256x256xf32>
-    %94 = stablehlo.convert %93 : (tensor<1x64x256x256xf32>) -> tensor<1x64x256x256xbf16>
-    %95 = stablehlo.broadcast_in_dim %94, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xbf16>) -> tensor<1x64x256x256xbf16>
-    %96 = stablehlo.maximum %45, %95 : tensor<1x64x256x256xbf16>
-    %97 = stablehlo.minimum %45, %95 : tensor<1x64x256x256xbf16>
-    %98 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x64x256x256xbf16>) -> tensor<1x64x256x256xbf16>
-    %99 = stablehlo.multiply %98, %50 : tensor<1x64x256x256xbf16>
-    %100 = stablehlo.add %96, %99 : tensor<1x64x256x256xbf16>
-    %101 = stablehlo.add %100, %52 : tensor<1x64x256x256xbf16>
-    %102 = stablehlo.convolution(%101, %arg5) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x256x256xbf16>, tensor<128x64x3x3xbf16>) -> tensor<1x128x128x128xbf16>
-    %103 = stablehlo.convert %102 : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xf32>
-    %104 = stablehlo.broadcast_in_dim %103, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %105 = stablehlo.broadcast_in_dim %arg95, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %106 = stablehlo.subtract %104, %105 : tensor<1x128x128x128xf32>
-    %107 = stablehlo.broadcast_in_dim %106, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %108 = stablehlo.broadcast_in_dim %arg96, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %109 = stablehlo.multiply %107, %108 : tensor<1x128x128x128xf32>
-    %110 = stablehlo.convert %arg97 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %111 = stablehlo.broadcast_in_dim %109, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %112 = stablehlo.broadcast_in_dim %110, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %113 = stablehlo.multiply %111, %112 : tensor<1x128x128x128xf32>
-    %114 = stablehlo.convert %arg98 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %115 = stablehlo.broadcast_in_dim %113, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %116 = stablehlo.broadcast_in_dim %114, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %117 = stablehlo.add %115, %116 : tensor<1x128x128x128xf32>
-    %118 = stablehlo.convert %117 : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xbf16>
-    %119 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x128x128x128xbf16>
-    %120 = stablehlo.broadcast_in_dim %118, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %121 = stablehlo.maximum %119, %120 : tensor<1x128x128x128xbf16>
-    %122 = stablehlo.minimum %119, %120 : tensor<1x128x128x128xbf16>
-    %123 = stablehlo.broadcast_in_dim %122, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %124 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x128x128x128xbf16>
-    %125 = stablehlo.multiply %123, %124 : tensor<1x128x128x128xbf16>
-    %126 = stablehlo.add %121, %125 : tensor<1x128x128x128xbf16>
-    %127 = stablehlo.convolution(%126, %arg6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x128x128xbf16>, tensor<64x128x1x1xbf16>) -> tensor<1x64x128x128xbf16>
-    %128 = stablehlo.convert %127 : (tensor<1x64x128x128xbf16>) -> tensor<1x64x128x128xf32>
-    %129 = stablehlo.broadcast_in_dim %128, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %130 = stablehlo.broadcast_in_dim %arg99, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %131 = stablehlo.subtract %129, %130 : tensor<1x64x128x128xf32>
-    %132 = stablehlo.broadcast_in_dim %131, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %133 = stablehlo.broadcast_in_dim %arg100, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %134 = stablehlo.multiply %132, %133 : tensor<1x64x128x128xf32>
-    %135 = stablehlo.convert %arg101 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %136 = stablehlo.broadcast_in_dim %134, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %137 = stablehlo.broadcast_in_dim %135, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %138 = stablehlo.multiply %136, %137 : tensor<1x64x128x128xf32>
-    %139 = stablehlo.convert %arg102 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %140 = stablehlo.broadcast_in_dim %138, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %141 = stablehlo.broadcast_in_dim %139, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %142 = stablehlo.add %140, %141 : tensor<1x64x128x128xf32>
-    %143 = stablehlo.convert %142 : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xbf16>
-    %144 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x64x128x128xbf16>
-    %145 = stablehlo.broadcast_in_dim %143, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xbf16>) -> tensor<1x64x128x128xbf16>
-    %146 = stablehlo.maximum %144, %145 : tensor<1x64x128x128xbf16>
-    %147 = stablehlo.minimum %144, %145 : tensor<1x64x128x128xbf16>
-    %148 = stablehlo.broadcast_in_dim %147, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xbf16>) -> tensor<1x64x128x128xbf16>
-    %149 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x64x128x128xbf16>
-    %150 = stablehlo.multiply %148, %149 : tensor<1x64x128x128xbf16>
-    %151 = stablehlo.add %146, %150 : tensor<1x64x128x128xbf16>
-    %152 = stablehlo.convolution(%151, %arg7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x128x128xbf16>, tensor<128x64x3x3xbf16>) -> tensor<1x128x128x128xbf16>
-    %153 = stablehlo.convert %152 : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xf32>
-    %154 = stablehlo.broadcast_in_dim %153, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %155 = stablehlo.broadcast_in_dim %arg103, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %156 = stablehlo.subtract %154, %155 : tensor<1x128x128x128xf32>
-    %157 = stablehlo.broadcast_in_dim %156, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %158 = stablehlo.broadcast_in_dim %arg104, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %159 = stablehlo.multiply %157, %158 : tensor<1x128x128x128xf32>
-    %160 = stablehlo.convert %arg105 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %161 = stablehlo.broadcast_in_dim %159, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %162 = stablehlo.broadcast_in_dim %160, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %163 = stablehlo.multiply %161, %162 : tensor<1x128x128x128xf32>
-    %164 = stablehlo.convert %arg106 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %165 = stablehlo.broadcast_in_dim %163, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %166 = stablehlo.broadcast_in_dim %164, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %167 = stablehlo.add %165, %166 : tensor<1x128x128x128xf32>
-    %168 = stablehlo.convert %167 : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xbf16>
-    %169 = stablehlo.broadcast_in_dim %168, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %170 = stablehlo.maximum %119, %169 : tensor<1x128x128x128xbf16>
-    %171 = stablehlo.minimum %119, %169 : tensor<1x128x128x128xbf16>
-    %172 = stablehlo.broadcast_in_dim %171, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %173 = stablehlo.multiply %172, %124 : tensor<1x128x128x128xbf16>
-    %174 = stablehlo.add %170, %173 : tensor<1x128x128x128xbf16>
-    %175 = stablehlo.add %174, %126 : tensor<1x128x128x128xbf16>
-    %176 = stablehlo.convolution(%175, %arg8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x128x128xbf16>, tensor<64x128x1x1xbf16>) -> tensor<1x64x128x128xbf16>
-    %177 = stablehlo.convert %176 : (tensor<1x64x128x128xbf16>) -> tensor<1x64x128x128xf32>
-    %178 = stablehlo.broadcast_in_dim %177, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %179 = stablehlo.broadcast_in_dim %arg107, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %180 = stablehlo.subtract %178, %179 : tensor<1x64x128x128xf32>
-    %181 = stablehlo.broadcast_in_dim %180, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %182 = stablehlo.broadcast_in_dim %arg108, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %183 = stablehlo.multiply %181, %182 : tensor<1x64x128x128xf32>
-    %184 = stablehlo.convert %arg109 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %185 = stablehlo.broadcast_in_dim %183, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %186 = stablehlo.broadcast_in_dim %184, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %187 = stablehlo.multiply %185, %186 : tensor<1x64x128x128xf32>
-    %188 = stablehlo.convert %arg110 : (tensor<64x1x1xbf16>) -> tensor<64x1x1xf32>
-    %189 = stablehlo.broadcast_in_dim %187, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xf32>
-    %190 = stablehlo.broadcast_in_dim %188, dims = [1, 2, 3] : (tensor<64x1x1xf32>) -> tensor<1x64x128x128xf32>
-    %191 = stablehlo.add %189, %190 : tensor<1x64x128x128xf32>
-    %192 = stablehlo.convert %191 : (tensor<1x64x128x128xf32>) -> tensor<1x64x128x128xbf16>
-    %193 = stablehlo.broadcast_in_dim %192, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xbf16>) -> tensor<1x64x128x128xbf16>
-    %194 = stablehlo.maximum %144, %193 : tensor<1x64x128x128xbf16>
-    %195 = stablehlo.minimum %144, %193 : tensor<1x64x128x128xbf16>
-    %196 = stablehlo.broadcast_in_dim %195, dims = [0, 1, 2, 3] : (tensor<1x64x128x128xbf16>) -> tensor<1x64x128x128xbf16>
-    %197 = stablehlo.multiply %196, %149 : tensor<1x64x128x128xbf16>
-    %198 = stablehlo.add %194, %197 : tensor<1x64x128x128xbf16>
-    %199 = stablehlo.convolution(%198, %arg9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x64x128x128xbf16>, tensor<128x64x3x3xbf16>) -> tensor<1x128x128x128xbf16>
-    %200 = stablehlo.convert %199 : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xf32>
-    %201 = stablehlo.broadcast_in_dim %200, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %202 = stablehlo.broadcast_in_dim %arg111, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %203 = stablehlo.subtract %201, %202 : tensor<1x128x128x128xf32>
-    %204 = stablehlo.broadcast_in_dim %203, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %205 = stablehlo.broadcast_in_dim %arg112, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %206 = stablehlo.multiply %204, %205 : tensor<1x128x128x128xf32>
-    %207 = stablehlo.convert %arg113 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %208 = stablehlo.broadcast_in_dim %206, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %209 = stablehlo.broadcast_in_dim %207, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %210 = stablehlo.multiply %208, %209 : tensor<1x128x128x128xf32>
-    %211 = stablehlo.convert %arg114 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %212 = stablehlo.broadcast_in_dim %210, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xf32>
-    %213 = stablehlo.broadcast_in_dim %211, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x128x128xf32>
-    %214 = stablehlo.add %212, %213 : tensor<1x128x128x128xf32>
-    %215 = stablehlo.convert %214 : (tensor<1x128x128x128xf32>) -> tensor<1x128x128x128xbf16>
-    %216 = stablehlo.broadcast_in_dim %215, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %217 = stablehlo.maximum %119, %216 : tensor<1x128x128x128xbf16>
-    %218 = stablehlo.minimum %119, %216 : tensor<1x128x128x128xbf16>
-    %219 = stablehlo.broadcast_in_dim %218, dims = [0, 1, 2, 3] : (tensor<1x128x128x128xbf16>) -> tensor<1x128x128x128xbf16>
-    %220 = stablehlo.multiply %219, %124 : tensor<1x128x128x128xbf16>
-    %221 = stablehlo.add %217, %220 : tensor<1x128x128x128xbf16>
-    %222 = stablehlo.add %221, %175 : tensor<1x128x128x128xbf16>
-    %223 = stablehlo.convolution(%222, %arg10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x128x128xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %224 = stablehlo.convert %223 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %225 = stablehlo.broadcast_in_dim %224, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %226 = stablehlo.broadcast_in_dim %arg115, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %227 = stablehlo.subtract %225, %226 : tensor<1x256x64x64xf32>
-    %228 = stablehlo.broadcast_in_dim %227, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %229 = stablehlo.broadcast_in_dim %arg116, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %230 = stablehlo.multiply %228, %229 : tensor<1x256x64x64xf32>
-    %231 = stablehlo.convert %arg117 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %232 = stablehlo.broadcast_in_dim %230, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %233 = stablehlo.broadcast_in_dim %231, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %234 = stablehlo.multiply %232, %233 : tensor<1x256x64x64xf32>
-    %235 = stablehlo.convert %arg118 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %236 = stablehlo.broadcast_in_dim %234, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %237 = stablehlo.broadcast_in_dim %235, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %238 = stablehlo.add %236, %237 : tensor<1x256x64x64xf32>
-    %239 = stablehlo.convert %238 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %240 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x256x64x64xbf16>
-    %241 = stablehlo.broadcast_in_dim %239, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %242 = stablehlo.maximum %240, %241 : tensor<1x256x64x64xbf16>
-    %243 = stablehlo.minimum %240, %241 : tensor<1x256x64x64xbf16>
-    %244 = stablehlo.broadcast_in_dim %243, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %245 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x256x64x64xbf16>
-    %246 = stablehlo.multiply %244, %245 : tensor<1x256x64x64xbf16>
-    %247 = stablehlo.add %242, %246 : tensor<1x256x64x64xbf16>
-    %248 = stablehlo.convolution(%247, %arg11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %249 = stablehlo.convert %248 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %250 = stablehlo.broadcast_in_dim %249, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %251 = stablehlo.broadcast_in_dim %arg119, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %252 = stablehlo.subtract %250, %251 : tensor<1x128x64x64xf32>
-    %253 = stablehlo.broadcast_in_dim %252, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %254 = stablehlo.broadcast_in_dim %arg120, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %255 = stablehlo.multiply %253, %254 : tensor<1x128x64x64xf32>
-    %256 = stablehlo.convert %arg121 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %257 = stablehlo.broadcast_in_dim %255, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %258 = stablehlo.broadcast_in_dim %256, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %259 = stablehlo.multiply %257, %258 : tensor<1x128x64x64xf32>
-    %260 = stablehlo.convert %arg122 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %261 = stablehlo.broadcast_in_dim %259, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %262 = stablehlo.broadcast_in_dim %260, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %263 = stablehlo.add %261, %262 : tensor<1x128x64x64xf32>
-    %264 = stablehlo.convert %263 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %265 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x128x64x64xbf16>
-    %266 = stablehlo.broadcast_in_dim %264, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %267 = stablehlo.maximum %265, %266 : tensor<1x128x64x64xbf16>
-    %268 = stablehlo.minimum %265, %266 : tensor<1x128x64x64xbf16>
-    %269 = stablehlo.broadcast_in_dim %268, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %270 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x128x64x64xbf16>
-    %271 = stablehlo.multiply %269, %270 : tensor<1x128x64x64xbf16>
-    %272 = stablehlo.add %267, %271 : tensor<1x128x64x64xbf16>
-    %273 = stablehlo.convolution(%272, %arg12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %274 = stablehlo.convert %273 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %275 = stablehlo.broadcast_in_dim %274, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %276 = stablehlo.broadcast_in_dim %arg123, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %277 = stablehlo.subtract %275, %276 : tensor<1x256x64x64xf32>
-    %278 = stablehlo.broadcast_in_dim %277, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %279 = stablehlo.broadcast_in_dim %arg124, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %280 = stablehlo.multiply %278, %279 : tensor<1x256x64x64xf32>
-    %281 = stablehlo.convert %arg125 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %282 = stablehlo.broadcast_in_dim %280, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %283 = stablehlo.broadcast_in_dim %281, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %284 = stablehlo.multiply %282, %283 : tensor<1x256x64x64xf32>
-    %285 = stablehlo.convert %arg126 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %286 = stablehlo.broadcast_in_dim %284, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %287 = stablehlo.broadcast_in_dim %285, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %288 = stablehlo.add %286, %287 : tensor<1x256x64x64xf32>
-    %289 = stablehlo.convert %288 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %290 = stablehlo.broadcast_in_dim %289, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %291 = stablehlo.maximum %240, %290 : tensor<1x256x64x64xbf16>
-    %292 = stablehlo.minimum %240, %290 : tensor<1x256x64x64xbf16>
-    %293 = stablehlo.broadcast_in_dim %292, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %294 = stablehlo.multiply %293, %245 : tensor<1x256x64x64xbf16>
-    %295 = stablehlo.add %291, %294 : tensor<1x256x64x64xbf16>
-    %296 = stablehlo.add %295, %247 : tensor<1x256x64x64xbf16>
-    %297 = stablehlo.convolution(%296, %arg13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %298 = stablehlo.convert %297 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %299 = stablehlo.broadcast_in_dim %298, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %300 = stablehlo.broadcast_in_dim %arg127, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %301 = stablehlo.subtract %299, %300 : tensor<1x128x64x64xf32>
-    %302 = stablehlo.broadcast_in_dim %301, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %303 = stablehlo.broadcast_in_dim %arg128, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %304 = stablehlo.multiply %302, %303 : tensor<1x128x64x64xf32>
-    %305 = stablehlo.convert %arg129 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %306 = stablehlo.broadcast_in_dim %304, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %307 = stablehlo.broadcast_in_dim %305, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %308 = stablehlo.multiply %306, %307 : tensor<1x128x64x64xf32>
-    %309 = stablehlo.convert %arg130 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %310 = stablehlo.broadcast_in_dim %308, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %311 = stablehlo.broadcast_in_dim %309, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %312 = stablehlo.add %310, %311 : tensor<1x128x64x64xf32>
-    %313 = stablehlo.convert %312 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %314 = stablehlo.broadcast_in_dim %313, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %315 = stablehlo.maximum %265, %314 : tensor<1x128x64x64xbf16>
-    %316 = stablehlo.minimum %265, %314 : tensor<1x128x64x64xbf16>
-    %317 = stablehlo.broadcast_in_dim %316, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %318 = stablehlo.multiply %317, %270 : tensor<1x128x64x64xbf16>
-    %319 = stablehlo.add %315, %318 : tensor<1x128x64x64xbf16>
-    %320 = stablehlo.convolution(%319, %arg14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %321 = stablehlo.convert %320 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %322 = stablehlo.broadcast_in_dim %321, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %323 = stablehlo.broadcast_in_dim %arg131, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %324 = stablehlo.subtract %322, %323 : tensor<1x256x64x64xf32>
-    %325 = stablehlo.broadcast_in_dim %324, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %326 = stablehlo.broadcast_in_dim %arg132, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %327 = stablehlo.multiply %325, %326 : tensor<1x256x64x64xf32>
-    %328 = stablehlo.convert %arg133 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %329 = stablehlo.broadcast_in_dim %327, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %330 = stablehlo.broadcast_in_dim %328, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %331 = stablehlo.multiply %329, %330 : tensor<1x256x64x64xf32>
-    %332 = stablehlo.convert %arg134 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %333 = stablehlo.broadcast_in_dim %331, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %334 = stablehlo.broadcast_in_dim %332, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %335 = stablehlo.add %333, %334 : tensor<1x256x64x64xf32>
-    %336 = stablehlo.convert %335 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %337 = stablehlo.broadcast_in_dim %336, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %338 = stablehlo.maximum %240, %337 : tensor<1x256x64x64xbf16>
-    %339 = stablehlo.minimum %240, %337 : tensor<1x256x64x64xbf16>
-    %340 = stablehlo.broadcast_in_dim %339, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %341 = stablehlo.multiply %340, %245 : tensor<1x256x64x64xbf16>
-    %342 = stablehlo.add %338, %341 : tensor<1x256x64x64xbf16>
-    %343 = stablehlo.add %342, %296 : tensor<1x256x64x64xbf16>
-    %344 = stablehlo.convolution(%343, %arg15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %345 = stablehlo.convert %344 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %346 = stablehlo.broadcast_in_dim %345, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %347 = stablehlo.broadcast_in_dim %arg135, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %348 = stablehlo.subtract %346, %347 : tensor<1x128x64x64xf32>
-    %349 = stablehlo.broadcast_in_dim %348, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %350 = stablehlo.broadcast_in_dim %arg136, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %351 = stablehlo.multiply %349, %350 : tensor<1x128x64x64xf32>
-    %352 = stablehlo.convert %arg137 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %353 = stablehlo.broadcast_in_dim %351, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %354 = stablehlo.broadcast_in_dim %352, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %355 = stablehlo.multiply %353, %354 : tensor<1x128x64x64xf32>
-    %356 = stablehlo.convert %arg138 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %357 = stablehlo.broadcast_in_dim %355, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %358 = stablehlo.broadcast_in_dim %356, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %359 = stablehlo.add %357, %358 : tensor<1x128x64x64xf32>
-    %360 = stablehlo.convert %359 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %361 = stablehlo.broadcast_in_dim %360, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %362 = stablehlo.maximum %265, %361 : tensor<1x128x64x64xbf16>
-    %363 = stablehlo.minimum %265, %361 : tensor<1x128x64x64xbf16>
-    %364 = stablehlo.broadcast_in_dim %363, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %365 = stablehlo.multiply %364, %270 : tensor<1x128x64x64xbf16>
-    %366 = stablehlo.add %362, %365 : tensor<1x128x64x64xbf16>
-    %367 = stablehlo.convolution(%366, %arg16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %368 = stablehlo.convert %367 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %369 = stablehlo.broadcast_in_dim %368, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %370 = stablehlo.broadcast_in_dim %arg139, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %371 = stablehlo.subtract %369, %370 : tensor<1x256x64x64xf32>
-    %372 = stablehlo.broadcast_in_dim %371, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %373 = stablehlo.broadcast_in_dim %arg140, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %374 = stablehlo.multiply %372, %373 : tensor<1x256x64x64xf32>
-    %375 = stablehlo.convert %arg141 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %376 = stablehlo.broadcast_in_dim %374, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %377 = stablehlo.broadcast_in_dim %375, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %378 = stablehlo.multiply %376, %377 : tensor<1x256x64x64xf32>
-    %379 = stablehlo.convert %arg142 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %380 = stablehlo.broadcast_in_dim %378, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %381 = stablehlo.broadcast_in_dim %379, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %382 = stablehlo.add %380, %381 : tensor<1x256x64x64xf32>
-    %383 = stablehlo.convert %382 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %384 = stablehlo.broadcast_in_dim %383, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %385 = stablehlo.maximum %240, %384 : tensor<1x256x64x64xbf16>
-    %386 = stablehlo.minimum %240, %384 : tensor<1x256x64x64xbf16>
-    %387 = stablehlo.broadcast_in_dim %386, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %388 = stablehlo.multiply %387, %245 : tensor<1x256x64x64xbf16>
-    %389 = stablehlo.add %385, %388 : tensor<1x256x64x64xbf16>
-    %390 = stablehlo.add %389, %343 : tensor<1x256x64x64xbf16>
-    %391 = stablehlo.convolution(%390, %arg17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %392 = stablehlo.convert %391 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %393 = stablehlo.broadcast_in_dim %392, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %394 = stablehlo.broadcast_in_dim %arg143, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %395 = stablehlo.subtract %393, %394 : tensor<1x128x64x64xf32>
-    %396 = stablehlo.broadcast_in_dim %395, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %397 = stablehlo.broadcast_in_dim %arg144, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %398 = stablehlo.multiply %396, %397 : tensor<1x128x64x64xf32>
-    %399 = stablehlo.convert %arg145 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %400 = stablehlo.broadcast_in_dim %398, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %401 = stablehlo.broadcast_in_dim %399, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %402 = stablehlo.multiply %400, %401 : tensor<1x128x64x64xf32>
-    %403 = stablehlo.convert %arg146 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %404 = stablehlo.broadcast_in_dim %402, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %405 = stablehlo.broadcast_in_dim %403, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %406 = stablehlo.add %404, %405 : tensor<1x128x64x64xf32>
-    %407 = stablehlo.convert %406 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %408 = stablehlo.broadcast_in_dim %407, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %409 = stablehlo.maximum %265, %408 : tensor<1x128x64x64xbf16>
-    %410 = stablehlo.minimum %265, %408 : tensor<1x128x64x64xbf16>
-    %411 = stablehlo.broadcast_in_dim %410, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %412 = stablehlo.multiply %411, %270 : tensor<1x128x64x64xbf16>
-    %413 = stablehlo.add %409, %412 : tensor<1x128x64x64xbf16>
-    %414 = stablehlo.convolution(%413, %arg18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %415 = stablehlo.convert %414 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %416 = stablehlo.broadcast_in_dim %415, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %417 = stablehlo.broadcast_in_dim %arg147, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %418 = stablehlo.subtract %416, %417 : tensor<1x256x64x64xf32>
-    %419 = stablehlo.broadcast_in_dim %418, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %420 = stablehlo.broadcast_in_dim %arg148, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %421 = stablehlo.multiply %419, %420 : tensor<1x256x64x64xf32>
-    %422 = stablehlo.convert %arg149 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %423 = stablehlo.broadcast_in_dim %421, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %424 = stablehlo.broadcast_in_dim %422, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %425 = stablehlo.multiply %423, %424 : tensor<1x256x64x64xf32>
-    %426 = stablehlo.convert %arg150 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %427 = stablehlo.broadcast_in_dim %425, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %428 = stablehlo.broadcast_in_dim %426, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %429 = stablehlo.add %427, %428 : tensor<1x256x64x64xf32>
-    %430 = stablehlo.convert %429 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %431 = stablehlo.broadcast_in_dim %430, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %432 = stablehlo.maximum %240, %431 : tensor<1x256x64x64xbf16>
-    %433 = stablehlo.minimum %240, %431 : tensor<1x256x64x64xbf16>
-    %434 = stablehlo.broadcast_in_dim %433, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %435 = stablehlo.multiply %434, %245 : tensor<1x256x64x64xbf16>
-    %436 = stablehlo.add %432, %435 : tensor<1x256x64x64xbf16>
-    %437 = stablehlo.add %436, %390 : tensor<1x256x64x64xbf16>
-    %438 = stablehlo.convolution(%437, %arg19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %439 = stablehlo.convert %438 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %440 = stablehlo.broadcast_in_dim %439, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %441 = stablehlo.broadcast_in_dim %arg151, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %442 = stablehlo.subtract %440, %441 : tensor<1x128x64x64xf32>
-    %443 = stablehlo.broadcast_in_dim %442, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %444 = stablehlo.broadcast_in_dim %arg152, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %445 = stablehlo.multiply %443, %444 : tensor<1x128x64x64xf32>
-    %446 = stablehlo.convert %arg153 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %447 = stablehlo.broadcast_in_dim %445, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %448 = stablehlo.broadcast_in_dim %446, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %449 = stablehlo.multiply %447, %448 : tensor<1x128x64x64xf32>
-    %450 = stablehlo.convert %arg154 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %451 = stablehlo.broadcast_in_dim %449, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %452 = stablehlo.broadcast_in_dim %450, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %453 = stablehlo.add %451, %452 : tensor<1x128x64x64xf32>
-    %454 = stablehlo.convert %453 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %455 = stablehlo.broadcast_in_dim %454, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %456 = stablehlo.maximum %265, %455 : tensor<1x128x64x64xbf16>
-    %457 = stablehlo.minimum %265, %455 : tensor<1x128x64x64xbf16>
-    %458 = stablehlo.broadcast_in_dim %457, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %459 = stablehlo.multiply %458, %270 : tensor<1x128x64x64xbf16>
-    %460 = stablehlo.add %456, %459 : tensor<1x128x64x64xbf16>
-    %461 = stablehlo.convolution(%460, %arg20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %462 = stablehlo.convert %461 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %463 = stablehlo.broadcast_in_dim %462, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %464 = stablehlo.broadcast_in_dim %arg155, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %465 = stablehlo.subtract %463, %464 : tensor<1x256x64x64xf32>
-    %466 = stablehlo.broadcast_in_dim %465, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %467 = stablehlo.broadcast_in_dim %arg156, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %468 = stablehlo.multiply %466, %467 : tensor<1x256x64x64xf32>
-    %469 = stablehlo.convert %arg157 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %470 = stablehlo.broadcast_in_dim %468, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %471 = stablehlo.broadcast_in_dim %469, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %472 = stablehlo.multiply %470, %471 : tensor<1x256x64x64xf32>
-    %473 = stablehlo.convert %arg158 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %474 = stablehlo.broadcast_in_dim %472, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %475 = stablehlo.broadcast_in_dim %473, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %476 = stablehlo.add %474, %475 : tensor<1x256x64x64xf32>
-    %477 = stablehlo.convert %476 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %478 = stablehlo.broadcast_in_dim %477, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %479 = stablehlo.maximum %240, %478 : tensor<1x256x64x64xbf16>
-    %480 = stablehlo.minimum %240, %478 : tensor<1x256x64x64xbf16>
-    %481 = stablehlo.broadcast_in_dim %480, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %482 = stablehlo.multiply %481, %245 : tensor<1x256x64x64xbf16>
-    %483 = stablehlo.add %479, %482 : tensor<1x256x64x64xbf16>
-    %484 = stablehlo.add %483, %437 : tensor<1x256x64x64xbf16>
-    %485 = stablehlo.convolution(%484, %arg21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %486 = stablehlo.convert %485 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %487 = stablehlo.broadcast_in_dim %486, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %488 = stablehlo.broadcast_in_dim %arg159, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %489 = stablehlo.subtract %487, %488 : tensor<1x128x64x64xf32>
-    %490 = stablehlo.broadcast_in_dim %489, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %491 = stablehlo.broadcast_in_dim %arg160, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %492 = stablehlo.multiply %490, %491 : tensor<1x128x64x64xf32>
-    %493 = stablehlo.convert %arg161 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %494 = stablehlo.broadcast_in_dim %492, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %495 = stablehlo.broadcast_in_dim %493, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %496 = stablehlo.multiply %494, %495 : tensor<1x128x64x64xf32>
-    %497 = stablehlo.convert %arg162 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %498 = stablehlo.broadcast_in_dim %496, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %499 = stablehlo.broadcast_in_dim %497, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %500 = stablehlo.add %498, %499 : tensor<1x128x64x64xf32>
-    %501 = stablehlo.convert %500 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %502 = stablehlo.broadcast_in_dim %501, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %503 = stablehlo.maximum %265, %502 : tensor<1x128x64x64xbf16>
-    %504 = stablehlo.minimum %265, %502 : tensor<1x128x64x64xbf16>
-    %505 = stablehlo.broadcast_in_dim %504, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %506 = stablehlo.multiply %505, %270 : tensor<1x128x64x64xbf16>
-    %507 = stablehlo.add %503, %506 : tensor<1x128x64x64xbf16>
-    %508 = stablehlo.convolution(%507, %arg22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %509 = stablehlo.convert %508 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %510 = stablehlo.broadcast_in_dim %509, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %511 = stablehlo.broadcast_in_dim %arg163, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %512 = stablehlo.subtract %510, %511 : tensor<1x256x64x64xf32>
-    %513 = stablehlo.broadcast_in_dim %512, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %514 = stablehlo.broadcast_in_dim %arg164, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %515 = stablehlo.multiply %513, %514 : tensor<1x256x64x64xf32>
-    %516 = stablehlo.convert %arg165 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %517 = stablehlo.broadcast_in_dim %515, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %518 = stablehlo.broadcast_in_dim %516, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %519 = stablehlo.multiply %517, %518 : tensor<1x256x64x64xf32>
-    %520 = stablehlo.convert %arg166 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %521 = stablehlo.broadcast_in_dim %519, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %522 = stablehlo.broadcast_in_dim %520, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %523 = stablehlo.add %521, %522 : tensor<1x256x64x64xf32>
-    %524 = stablehlo.convert %523 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %525 = stablehlo.broadcast_in_dim %524, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %526 = stablehlo.maximum %240, %525 : tensor<1x256x64x64xbf16>
-    %527 = stablehlo.minimum %240, %525 : tensor<1x256x64x64xbf16>
-    %528 = stablehlo.broadcast_in_dim %527, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %529 = stablehlo.multiply %528, %245 : tensor<1x256x64x64xbf16>
-    %530 = stablehlo.add %526, %529 : tensor<1x256x64x64xbf16>
-    %531 = stablehlo.add %530, %484 : tensor<1x256x64x64xbf16>
-    %532 = stablehlo.convolution(%531, %arg23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %533 = stablehlo.convert %532 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %534 = stablehlo.broadcast_in_dim %533, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %535 = stablehlo.broadcast_in_dim %arg167, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %536 = stablehlo.subtract %534, %535 : tensor<1x128x64x64xf32>
-    %537 = stablehlo.broadcast_in_dim %536, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %538 = stablehlo.broadcast_in_dim %arg168, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %539 = stablehlo.multiply %537, %538 : tensor<1x128x64x64xf32>
-    %540 = stablehlo.convert %arg169 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %541 = stablehlo.broadcast_in_dim %539, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %542 = stablehlo.broadcast_in_dim %540, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %543 = stablehlo.multiply %541, %542 : tensor<1x128x64x64xf32>
-    %544 = stablehlo.convert %arg170 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %545 = stablehlo.broadcast_in_dim %543, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %546 = stablehlo.broadcast_in_dim %544, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %547 = stablehlo.add %545, %546 : tensor<1x128x64x64xf32>
-    %548 = stablehlo.convert %547 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %549 = stablehlo.broadcast_in_dim %548, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %550 = stablehlo.maximum %265, %549 : tensor<1x128x64x64xbf16>
-    %551 = stablehlo.minimum %265, %549 : tensor<1x128x64x64xbf16>
-    %552 = stablehlo.broadcast_in_dim %551, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %553 = stablehlo.multiply %552, %270 : tensor<1x128x64x64xbf16>
-    %554 = stablehlo.add %550, %553 : tensor<1x128x64x64xbf16>
-    %555 = stablehlo.convolution(%554, %arg24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %556 = stablehlo.convert %555 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %557 = stablehlo.broadcast_in_dim %556, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %558 = stablehlo.broadcast_in_dim %arg171, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %559 = stablehlo.subtract %557, %558 : tensor<1x256x64x64xf32>
-    %560 = stablehlo.broadcast_in_dim %559, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %561 = stablehlo.broadcast_in_dim %arg172, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %562 = stablehlo.multiply %560, %561 : tensor<1x256x64x64xf32>
-    %563 = stablehlo.convert %arg173 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %564 = stablehlo.broadcast_in_dim %562, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %565 = stablehlo.broadcast_in_dim %563, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %566 = stablehlo.multiply %564, %565 : tensor<1x256x64x64xf32>
-    %567 = stablehlo.convert %arg174 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %568 = stablehlo.broadcast_in_dim %566, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %569 = stablehlo.broadcast_in_dim %567, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %570 = stablehlo.add %568, %569 : tensor<1x256x64x64xf32>
-    %571 = stablehlo.convert %570 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %572 = stablehlo.broadcast_in_dim %571, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %573 = stablehlo.maximum %240, %572 : tensor<1x256x64x64xbf16>
-    %574 = stablehlo.minimum %240, %572 : tensor<1x256x64x64xbf16>
-    %575 = stablehlo.broadcast_in_dim %574, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %576 = stablehlo.multiply %575, %245 : tensor<1x256x64x64xbf16>
-    %577 = stablehlo.add %573, %576 : tensor<1x256x64x64xbf16>
-    %578 = stablehlo.add %577, %531 : tensor<1x256x64x64xbf16>
-    %579 = stablehlo.convolution(%578, %arg25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %580 = stablehlo.convert %579 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %581 = stablehlo.broadcast_in_dim %580, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %582 = stablehlo.broadcast_in_dim %arg175, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %583 = stablehlo.subtract %581, %582 : tensor<1x128x64x64xf32>
-    %584 = stablehlo.broadcast_in_dim %583, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %585 = stablehlo.broadcast_in_dim %arg176, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %586 = stablehlo.multiply %584, %585 : tensor<1x128x64x64xf32>
-    %587 = stablehlo.convert %arg177 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %588 = stablehlo.broadcast_in_dim %586, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %589 = stablehlo.broadcast_in_dim %587, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %590 = stablehlo.multiply %588, %589 : tensor<1x128x64x64xf32>
-    %591 = stablehlo.convert %arg178 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %592 = stablehlo.broadcast_in_dim %590, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %593 = stablehlo.broadcast_in_dim %591, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %594 = stablehlo.add %592, %593 : tensor<1x128x64x64xf32>
-    %595 = stablehlo.convert %594 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %596 = stablehlo.broadcast_in_dim %595, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %597 = stablehlo.maximum %265, %596 : tensor<1x128x64x64xbf16>
-    %598 = stablehlo.minimum %265, %596 : tensor<1x128x64x64xbf16>
-    %599 = stablehlo.broadcast_in_dim %598, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %600 = stablehlo.multiply %599, %270 : tensor<1x128x64x64xbf16>
-    %601 = stablehlo.add %597, %600 : tensor<1x128x64x64xbf16>
-    %602 = stablehlo.convolution(%601, %arg26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %603 = stablehlo.convert %602 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %604 = stablehlo.broadcast_in_dim %603, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %605 = stablehlo.broadcast_in_dim %arg179, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %606 = stablehlo.subtract %604, %605 : tensor<1x256x64x64xf32>
-    %607 = stablehlo.broadcast_in_dim %606, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %608 = stablehlo.broadcast_in_dim %arg180, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %609 = stablehlo.multiply %607, %608 : tensor<1x256x64x64xf32>
-    %610 = stablehlo.convert %arg181 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %611 = stablehlo.broadcast_in_dim %609, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %612 = stablehlo.broadcast_in_dim %610, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %613 = stablehlo.multiply %611, %612 : tensor<1x256x64x64xf32>
-    %614 = stablehlo.convert %arg182 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %615 = stablehlo.broadcast_in_dim %613, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %616 = stablehlo.broadcast_in_dim %614, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %617 = stablehlo.add %615, %616 : tensor<1x256x64x64xf32>
-    %618 = stablehlo.convert %617 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %619 = stablehlo.broadcast_in_dim %618, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %620 = stablehlo.maximum %240, %619 : tensor<1x256x64x64xbf16>
-    %621 = stablehlo.minimum %240, %619 : tensor<1x256x64x64xbf16>
-    %622 = stablehlo.broadcast_in_dim %621, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %623 = stablehlo.multiply %622, %245 : tensor<1x256x64x64xbf16>
-    %624 = stablehlo.add %620, %623 : tensor<1x256x64x64xbf16>
-    %625 = stablehlo.add %624, %578 : tensor<1x256x64x64xbf16>
-    %626 = stablehlo.convolution(%625, %arg27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %627 = stablehlo.convert %626 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %628 = stablehlo.broadcast_in_dim %627, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %629 = stablehlo.broadcast_in_dim %arg183, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %630 = stablehlo.subtract %628, %629 : tensor<1x512x32x32xf32>
-    %631 = stablehlo.broadcast_in_dim %630, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %632 = stablehlo.broadcast_in_dim %arg184, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %633 = stablehlo.multiply %631, %632 : tensor<1x512x32x32xf32>
-    %634 = stablehlo.convert %arg185 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %635 = stablehlo.broadcast_in_dim %633, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %636 = stablehlo.broadcast_in_dim %634, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %637 = stablehlo.multiply %635, %636 : tensor<1x512x32x32xf32>
-    %638 = stablehlo.convert %arg186 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %639 = stablehlo.broadcast_in_dim %637, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %640 = stablehlo.broadcast_in_dim %638, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %641 = stablehlo.add %639, %640 : tensor<1x512x32x32xf32>
-    %642 = stablehlo.convert %641 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %643 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x512x32x32xbf16>
-    %644 = stablehlo.broadcast_in_dim %642, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %645 = stablehlo.maximum %643, %644 : tensor<1x512x32x32xbf16>
-    %646 = stablehlo.minimum %643, %644 : tensor<1x512x32x32xbf16>
-    %647 = stablehlo.broadcast_in_dim %646, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %648 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x512x32x32xbf16>
-    %649 = stablehlo.multiply %647, %648 : tensor<1x512x32x32xbf16>
-    %650 = stablehlo.add %645, %649 : tensor<1x512x32x32xbf16>
-    %651 = stablehlo.convolution(%650, %arg28) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %652 = stablehlo.convert %651 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %653 = stablehlo.broadcast_in_dim %652, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %654 = stablehlo.broadcast_in_dim %arg187, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %655 = stablehlo.subtract %653, %654 : tensor<1x256x32x32xf32>
-    %656 = stablehlo.broadcast_in_dim %655, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %657 = stablehlo.broadcast_in_dim %arg188, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %658 = stablehlo.multiply %656, %657 : tensor<1x256x32x32xf32>
-    %659 = stablehlo.convert %arg189 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %660 = stablehlo.broadcast_in_dim %658, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %661 = stablehlo.broadcast_in_dim %659, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %662 = stablehlo.multiply %660, %661 : tensor<1x256x32x32xf32>
-    %663 = stablehlo.convert %arg190 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %664 = stablehlo.broadcast_in_dim %662, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %665 = stablehlo.broadcast_in_dim %663, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %666 = stablehlo.add %664, %665 : tensor<1x256x32x32xf32>
-    %667 = stablehlo.convert %666 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %668 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x256x32x32xbf16>
-    %669 = stablehlo.broadcast_in_dim %667, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %670 = stablehlo.maximum %668, %669 : tensor<1x256x32x32xbf16>
-    %671 = stablehlo.minimum %668, %669 : tensor<1x256x32x32xbf16>
-    %672 = stablehlo.broadcast_in_dim %671, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %673 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x256x32x32xbf16>
-    %674 = stablehlo.multiply %672, %673 : tensor<1x256x32x32xbf16>
-    %675 = stablehlo.add %670, %674 : tensor<1x256x32x32xbf16>
-    %676 = stablehlo.convolution(%675, %arg29) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %677 = stablehlo.convert %676 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %678 = stablehlo.broadcast_in_dim %677, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %679 = stablehlo.broadcast_in_dim %arg191, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %680 = stablehlo.subtract %678, %679 : tensor<1x512x32x32xf32>
-    %681 = stablehlo.broadcast_in_dim %680, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %682 = stablehlo.broadcast_in_dim %arg192, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %683 = stablehlo.multiply %681, %682 : tensor<1x512x32x32xf32>
-    %684 = stablehlo.convert %arg193 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %685 = stablehlo.broadcast_in_dim %683, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %686 = stablehlo.broadcast_in_dim %684, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %687 = stablehlo.multiply %685, %686 : tensor<1x512x32x32xf32>
-    %688 = stablehlo.convert %arg194 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %689 = stablehlo.broadcast_in_dim %687, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %690 = stablehlo.broadcast_in_dim %688, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %691 = stablehlo.add %689, %690 : tensor<1x512x32x32xf32>
-    %692 = stablehlo.convert %691 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %693 = stablehlo.broadcast_in_dim %692, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %694 = stablehlo.maximum %643, %693 : tensor<1x512x32x32xbf16>
-    %695 = stablehlo.minimum %643, %693 : tensor<1x512x32x32xbf16>
-    %696 = stablehlo.broadcast_in_dim %695, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %697 = stablehlo.multiply %696, %648 : tensor<1x512x32x32xbf16>
-    %698 = stablehlo.add %694, %697 : tensor<1x512x32x32xbf16>
-    %699 = stablehlo.add %698, %650 : tensor<1x512x32x32xbf16>
-    %700 = stablehlo.convolution(%699, %arg30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %701 = stablehlo.convert %700 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %702 = stablehlo.broadcast_in_dim %701, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %703 = stablehlo.broadcast_in_dim %arg195, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %704 = stablehlo.subtract %702, %703 : tensor<1x256x32x32xf32>
-    %705 = stablehlo.broadcast_in_dim %704, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %706 = stablehlo.broadcast_in_dim %arg196, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %707 = stablehlo.multiply %705, %706 : tensor<1x256x32x32xf32>
-    %708 = stablehlo.convert %arg197 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %709 = stablehlo.broadcast_in_dim %707, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %710 = stablehlo.broadcast_in_dim %708, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %711 = stablehlo.multiply %709, %710 : tensor<1x256x32x32xf32>
-    %712 = stablehlo.convert %arg198 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %713 = stablehlo.broadcast_in_dim %711, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %714 = stablehlo.broadcast_in_dim %712, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %715 = stablehlo.add %713, %714 : tensor<1x256x32x32xf32>
-    %716 = stablehlo.convert %715 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %717 = stablehlo.broadcast_in_dim %716, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %718 = stablehlo.maximum %668, %717 : tensor<1x256x32x32xbf16>
-    %719 = stablehlo.minimum %668, %717 : tensor<1x256x32x32xbf16>
-    %720 = stablehlo.broadcast_in_dim %719, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %721 = stablehlo.multiply %720, %673 : tensor<1x256x32x32xbf16>
-    %722 = stablehlo.add %718, %721 : tensor<1x256x32x32xbf16>
-    %723 = stablehlo.convolution(%722, %arg31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %724 = stablehlo.convert %723 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %725 = stablehlo.broadcast_in_dim %724, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %726 = stablehlo.broadcast_in_dim %arg199, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %727 = stablehlo.subtract %725, %726 : tensor<1x512x32x32xf32>
-    %728 = stablehlo.broadcast_in_dim %727, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %729 = stablehlo.broadcast_in_dim %arg200, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %730 = stablehlo.multiply %728, %729 : tensor<1x512x32x32xf32>
-    %731 = stablehlo.convert %arg201 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %732 = stablehlo.broadcast_in_dim %730, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %733 = stablehlo.broadcast_in_dim %731, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %734 = stablehlo.multiply %732, %733 : tensor<1x512x32x32xf32>
-    %735 = stablehlo.convert %arg202 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %736 = stablehlo.broadcast_in_dim %734, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %737 = stablehlo.broadcast_in_dim %735, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %738 = stablehlo.add %736, %737 : tensor<1x512x32x32xf32>
-    %739 = stablehlo.convert %738 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %740 = stablehlo.broadcast_in_dim %739, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %741 = stablehlo.maximum %643, %740 : tensor<1x512x32x32xbf16>
-    %742 = stablehlo.minimum %643, %740 : tensor<1x512x32x32xbf16>
-    %743 = stablehlo.broadcast_in_dim %742, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %744 = stablehlo.multiply %743, %648 : tensor<1x512x32x32xbf16>
-    %745 = stablehlo.add %741, %744 : tensor<1x512x32x32xbf16>
-    %746 = stablehlo.add %745, %699 : tensor<1x512x32x32xbf16>
-    %747 = stablehlo.convolution(%746, %arg32) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %748 = stablehlo.convert %747 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %749 = stablehlo.broadcast_in_dim %748, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %750 = stablehlo.broadcast_in_dim %arg203, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %751 = stablehlo.subtract %749, %750 : tensor<1x256x32x32xf32>
-    %752 = stablehlo.broadcast_in_dim %751, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %753 = stablehlo.broadcast_in_dim %arg204, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %754 = stablehlo.multiply %752, %753 : tensor<1x256x32x32xf32>
-    %755 = stablehlo.convert %arg205 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %756 = stablehlo.broadcast_in_dim %754, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %757 = stablehlo.broadcast_in_dim %755, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %758 = stablehlo.multiply %756, %757 : tensor<1x256x32x32xf32>
-    %759 = stablehlo.convert %arg206 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %760 = stablehlo.broadcast_in_dim %758, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %761 = stablehlo.broadcast_in_dim %759, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %762 = stablehlo.add %760, %761 : tensor<1x256x32x32xf32>
-    %763 = stablehlo.convert %762 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %764 = stablehlo.broadcast_in_dim %763, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %765 = stablehlo.maximum %668, %764 : tensor<1x256x32x32xbf16>
-    %766 = stablehlo.minimum %668, %764 : tensor<1x256x32x32xbf16>
-    %767 = stablehlo.broadcast_in_dim %766, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %768 = stablehlo.multiply %767, %673 : tensor<1x256x32x32xbf16>
-    %769 = stablehlo.add %765, %768 : tensor<1x256x32x32xbf16>
-    %770 = stablehlo.convolution(%769, %arg33) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %771 = stablehlo.convert %770 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %772 = stablehlo.broadcast_in_dim %771, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %773 = stablehlo.broadcast_in_dim %arg207, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %774 = stablehlo.subtract %772, %773 : tensor<1x512x32x32xf32>
-    %775 = stablehlo.broadcast_in_dim %774, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %776 = stablehlo.broadcast_in_dim %arg208, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %777 = stablehlo.multiply %775, %776 : tensor<1x512x32x32xf32>
-    %778 = stablehlo.convert %arg209 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %779 = stablehlo.broadcast_in_dim %777, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %780 = stablehlo.broadcast_in_dim %778, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %781 = stablehlo.multiply %779, %780 : tensor<1x512x32x32xf32>
-    %782 = stablehlo.convert %arg210 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %783 = stablehlo.broadcast_in_dim %781, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %784 = stablehlo.broadcast_in_dim %782, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %785 = stablehlo.add %783, %784 : tensor<1x512x32x32xf32>
-    %786 = stablehlo.convert %785 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %787 = stablehlo.broadcast_in_dim %786, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %788 = stablehlo.maximum %643, %787 : tensor<1x512x32x32xbf16>
-    %789 = stablehlo.minimum %643, %787 : tensor<1x512x32x32xbf16>
-    %790 = stablehlo.broadcast_in_dim %789, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %791 = stablehlo.multiply %790, %648 : tensor<1x512x32x32xbf16>
-    %792 = stablehlo.add %788, %791 : tensor<1x512x32x32xbf16>
-    %793 = stablehlo.add %792, %746 : tensor<1x512x32x32xbf16>
-    %794 = stablehlo.convolution(%793, %arg34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %795 = stablehlo.convert %794 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %796 = stablehlo.broadcast_in_dim %795, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %797 = stablehlo.broadcast_in_dim %arg211, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %798 = stablehlo.subtract %796, %797 : tensor<1x256x32x32xf32>
-    %799 = stablehlo.broadcast_in_dim %798, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %800 = stablehlo.broadcast_in_dim %arg212, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %801 = stablehlo.multiply %799, %800 : tensor<1x256x32x32xf32>
-    %802 = stablehlo.convert %arg213 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %803 = stablehlo.broadcast_in_dim %801, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %804 = stablehlo.broadcast_in_dim %802, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %805 = stablehlo.multiply %803, %804 : tensor<1x256x32x32xf32>
-    %806 = stablehlo.convert %arg214 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %807 = stablehlo.broadcast_in_dim %805, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %808 = stablehlo.broadcast_in_dim %806, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %809 = stablehlo.add %807, %808 : tensor<1x256x32x32xf32>
-    %810 = stablehlo.convert %809 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %811 = stablehlo.broadcast_in_dim %810, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %812 = stablehlo.maximum %668, %811 : tensor<1x256x32x32xbf16>
-    %813 = stablehlo.minimum %668, %811 : tensor<1x256x32x32xbf16>
-    %814 = stablehlo.broadcast_in_dim %813, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %815 = stablehlo.multiply %814, %673 : tensor<1x256x32x32xbf16>
-    %816 = stablehlo.add %812, %815 : tensor<1x256x32x32xbf16>
-    %817 = stablehlo.convolution(%816, %arg35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %818 = stablehlo.convert %817 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %819 = stablehlo.broadcast_in_dim %818, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %820 = stablehlo.broadcast_in_dim %arg215, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %821 = stablehlo.subtract %819, %820 : tensor<1x512x32x32xf32>
-    %822 = stablehlo.broadcast_in_dim %821, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %823 = stablehlo.broadcast_in_dim %arg216, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %824 = stablehlo.multiply %822, %823 : tensor<1x512x32x32xf32>
-    %825 = stablehlo.convert %arg217 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %826 = stablehlo.broadcast_in_dim %824, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %827 = stablehlo.broadcast_in_dim %825, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %828 = stablehlo.multiply %826, %827 : tensor<1x512x32x32xf32>
-    %829 = stablehlo.convert %arg218 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %830 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %831 = stablehlo.broadcast_in_dim %829, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %832 = stablehlo.add %830, %831 : tensor<1x512x32x32xf32>
-    %833 = stablehlo.convert %832 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %834 = stablehlo.broadcast_in_dim %833, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %835 = stablehlo.maximum %643, %834 : tensor<1x512x32x32xbf16>
-    %836 = stablehlo.minimum %643, %834 : tensor<1x512x32x32xbf16>
-    %837 = stablehlo.broadcast_in_dim %836, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %838 = stablehlo.multiply %837, %648 : tensor<1x512x32x32xbf16>
-    %839 = stablehlo.add %835, %838 : tensor<1x512x32x32xbf16>
-    %840 = stablehlo.add %839, %793 : tensor<1x512x32x32xbf16>
-    %841 = stablehlo.convolution(%840, %arg36) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %842 = stablehlo.convert %841 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %843 = stablehlo.broadcast_in_dim %842, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %844 = stablehlo.broadcast_in_dim %arg219, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %845 = stablehlo.subtract %843, %844 : tensor<1x256x32x32xf32>
-    %846 = stablehlo.broadcast_in_dim %845, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %847 = stablehlo.broadcast_in_dim %arg220, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %848 = stablehlo.multiply %846, %847 : tensor<1x256x32x32xf32>
-    %849 = stablehlo.convert %arg221 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %850 = stablehlo.broadcast_in_dim %848, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %851 = stablehlo.broadcast_in_dim %849, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %852 = stablehlo.multiply %850, %851 : tensor<1x256x32x32xf32>
-    %853 = stablehlo.convert %arg222 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %854 = stablehlo.broadcast_in_dim %852, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %855 = stablehlo.broadcast_in_dim %853, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %856 = stablehlo.add %854, %855 : tensor<1x256x32x32xf32>
-    %857 = stablehlo.convert %856 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %858 = stablehlo.broadcast_in_dim %857, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %859 = stablehlo.maximum %668, %858 : tensor<1x256x32x32xbf16>
-    %860 = stablehlo.minimum %668, %858 : tensor<1x256x32x32xbf16>
-    %861 = stablehlo.broadcast_in_dim %860, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %862 = stablehlo.multiply %861, %673 : tensor<1x256x32x32xbf16>
-    %863 = stablehlo.add %859, %862 : tensor<1x256x32x32xbf16>
-    %864 = stablehlo.convolution(%863, %arg37) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %865 = stablehlo.convert %864 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %866 = stablehlo.broadcast_in_dim %865, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %867 = stablehlo.broadcast_in_dim %arg223, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %868 = stablehlo.subtract %866, %867 : tensor<1x512x32x32xf32>
-    %869 = stablehlo.broadcast_in_dim %868, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %870 = stablehlo.broadcast_in_dim %arg224, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %871 = stablehlo.multiply %869, %870 : tensor<1x512x32x32xf32>
-    %872 = stablehlo.convert %arg225 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %873 = stablehlo.broadcast_in_dim %871, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %874 = stablehlo.broadcast_in_dim %872, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %875 = stablehlo.multiply %873, %874 : tensor<1x512x32x32xf32>
-    %876 = stablehlo.convert %arg226 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %877 = stablehlo.broadcast_in_dim %875, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %878 = stablehlo.broadcast_in_dim %876, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %879 = stablehlo.add %877, %878 : tensor<1x512x32x32xf32>
-    %880 = stablehlo.convert %879 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %881 = stablehlo.broadcast_in_dim %880, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %882 = stablehlo.maximum %643, %881 : tensor<1x512x32x32xbf16>
-    %883 = stablehlo.minimum %643, %881 : tensor<1x512x32x32xbf16>
-    %884 = stablehlo.broadcast_in_dim %883, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %885 = stablehlo.multiply %884, %648 : tensor<1x512x32x32xbf16>
-    %886 = stablehlo.add %882, %885 : tensor<1x512x32x32xbf16>
-    %887 = stablehlo.add %886, %840 : tensor<1x512x32x32xbf16>
-    %888 = stablehlo.convolution(%887, %arg38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %889 = stablehlo.convert %888 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %890 = stablehlo.broadcast_in_dim %889, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %891 = stablehlo.broadcast_in_dim %arg227, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %892 = stablehlo.subtract %890, %891 : tensor<1x256x32x32xf32>
-    %893 = stablehlo.broadcast_in_dim %892, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %894 = stablehlo.broadcast_in_dim %arg228, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %895 = stablehlo.multiply %893, %894 : tensor<1x256x32x32xf32>
-    %896 = stablehlo.convert %arg229 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %897 = stablehlo.broadcast_in_dim %895, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %898 = stablehlo.broadcast_in_dim %896, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %899 = stablehlo.multiply %897, %898 : tensor<1x256x32x32xf32>
-    %900 = stablehlo.convert %arg230 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %901 = stablehlo.broadcast_in_dim %899, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %902 = stablehlo.broadcast_in_dim %900, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %903 = stablehlo.add %901, %902 : tensor<1x256x32x32xf32>
-    %904 = stablehlo.convert %903 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %905 = stablehlo.broadcast_in_dim %904, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %906 = stablehlo.maximum %668, %905 : tensor<1x256x32x32xbf16>
-    %907 = stablehlo.minimum %668, %905 : tensor<1x256x32x32xbf16>
-    %908 = stablehlo.broadcast_in_dim %907, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %909 = stablehlo.multiply %908, %673 : tensor<1x256x32x32xbf16>
-    %910 = stablehlo.add %906, %909 : tensor<1x256x32x32xbf16>
-    %911 = stablehlo.convolution(%910, %arg39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %912 = stablehlo.convert %911 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %913 = stablehlo.broadcast_in_dim %912, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %914 = stablehlo.broadcast_in_dim %arg231, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %915 = stablehlo.subtract %913, %914 : tensor<1x512x32x32xf32>
-    %916 = stablehlo.broadcast_in_dim %915, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %917 = stablehlo.broadcast_in_dim %arg232, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %918 = stablehlo.multiply %916, %917 : tensor<1x512x32x32xf32>
-    %919 = stablehlo.convert %arg233 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %920 = stablehlo.broadcast_in_dim %918, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %921 = stablehlo.broadcast_in_dim %919, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %922 = stablehlo.multiply %920, %921 : tensor<1x512x32x32xf32>
-    %923 = stablehlo.convert %arg234 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %924 = stablehlo.broadcast_in_dim %922, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %925 = stablehlo.broadcast_in_dim %923, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %926 = stablehlo.add %924, %925 : tensor<1x512x32x32xf32>
-    %927 = stablehlo.convert %926 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %928 = stablehlo.broadcast_in_dim %927, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %929 = stablehlo.maximum %643, %928 : tensor<1x512x32x32xbf16>
-    %930 = stablehlo.minimum %643, %928 : tensor<1x512x32x32xbf16>
-    %931 = stablehlo.broadcast_in_dim %930, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %932 = stablehlo.multiply %931, %648 : tensor<1x512x32x32xbf16>
-    %933 = stablehlo.add %929, %932 : tensor<1x512x32x32xbf16>
-    %934 = stablehlo.add %933, %887 : tensor<1x512x32x32xbf16>
-    %935 = stablehlo.convolution(%934, %arg40) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %936 = stablehlo.convert %935 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %937 = stablehlo.broadcast_in_dim %936, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %938 = stablehlo.broadcast_in_dim %arg235, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %939 = stablehlo.subtract %937, %938 : tensor<1x256x32x32xf32>
-    %940 = stablehlo.broadcast_in_dim %939, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %941 = stablehlo.broadcast_in_dim %arg236, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %942 = stablehlo.multiply %940, %941 : tensor<1x256x32x32xf32>
-    %943 = stablehlo.convert %arg237 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %944 = stablehlo.broadcast_in_dim %942, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %945 = stablehlo.broadcast_in_dim %943, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %946 = stablehlo.multiply %944, %945 : tensor<1x256x32x32xf32>
-    %947 = stablehlo.convert %arg238 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %948 = stablehlo.broadcast_in_dim %946, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %949 = stablehlo.broadcast_in_dim %947, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %950 = stablehlo.add %948, %949 : tensor<1x256x32x32xf32>
-    %951 = stablehlo.convert %950 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %952 = stablehlo.broadcast_in_dim %951, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %953 = stablehlo.maximum %668, %952 : tensor<1x256x32x32xbf16>
-    %954 = stablehlo.minimum %668, %952 : tensor<1x256x32x32xbf16>
-    %955 = stablehlo.broadcast_in_dim %954, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %956 = stablehlo.multiply %955, %673 : tensor<1x256x32x32xbf16>
-    %957 = stablehlo.add %953, %956 : tensor<1x256x32x32xbf16>
-    %958 = stablehlo.convolution(%957, %arg41) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %959 = stablehlo.convert %958 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %960 = stablehlo.broadcast_in_dim %959, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %961 = stablehlo.broadcast_in_dim %arg239, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %962 = stablehlo.subtract %960, %961 : tensor<1x512x32x32xf32>
-    %963 = stablehlo.broadcast_in_dim %962, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %964 = stablehlo.broadcast_in_dim %arg240, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %965 = stablehlo.multiply %963, %964 : tensor<1x512x32x32xf32>
-    %966 = stablehlo.convert %arg241 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %967 = stablehlo.broadcast_in_dim %965, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %968 = stablehlo.broadcast_in_dim %966, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %969 = stablehlo.multiply %967, %968 : tensor<1x512x32x32xf32>
-    %970 = stablehlo.convert %arg242 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %971 = stablehlo.broadcast_in_dim %969, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %972 = stablehlo.broadcast_in_dim %970, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %973 = stablehlo.add %971, %972 : tensor<1x512x32x32xf32>
-    %974 = stablehlo.convert %973 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %975 = stablehlo.broadcast_in_dim %974, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %976 = stablehlo.maximum %643, %975 : tensor<1x512x32x32xbf16>
-    %977 = stablehlo.minimum %643, %975 : tensor<1x512x32x32xbf16>
-    %978 = stablehlo.broadcast_in_dim %977, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %979 = stablehlo.multiply %978, %648 : tensor<1x512x32x32xbf16>
-    %980 = stablehlo.add %976, %979 : tensor<1x512x32x32xbf16>
-    %981 = stablehlo.add %980, %934 : tensor<1x512x32x32xbf16>
-    %982 = stablehlo.convolution(%981, %arg42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %983 = stablehlo.convert %982 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %984 = stablehlo.broadcast_in_dim %983, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %985 = stablehlo.broadcast_in_dim %arg243, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %986 = stablehlo.subtract %984, %985 : tensor<1x256x32x32xf32>
-    %987 = stablehlo.broadcast_in_dim %986, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %988 = stablehlo.broadcast_in_dim %arg244, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %989 = stablehlo.multiply %987, %988 : tensor<1x256x32x32xf32>
-    %990 = stablehlo.convert %arg245 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %991 = stablehlo.broadcast_in_dim %989, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %992 = stablehlo.broadcast_in_dim %990, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %993 = stablehlo.multiply %991, %992 : tensor<1x256x32x32xf32>
-    %994 = stablehlo.convert %arg246 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %995 = stablehlo.broadcast_in_dim %993, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %996 = stablehlo.broadcast_in_dim %994, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %997 = stablehlo.add %995, %996 : tensor<1x256x32x32xf32>
-    %998 = stablehlo.convert %997 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %999 = stablehlo.broadcast_in_dim %998, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1000 = stablehlo.maximum %668, %999 : tensor<1x256x32x32xbf16>
-    %1001 = stablehlo.minimum %668, %999 : tensor<1x256x32x32xbf16>
-    %1002 = stablehlo.broadcast_in_dim %1001, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1003 = stablehlo.multiply %1002, %673 : tensor<1x256x32x32xbf16>
-    %1004 = stablehlo.add %1000, %1003 : tensor<1x256x32x32xbf16>
-    %1005 = stablehlo.convolution(%1004, %arg43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %1006 = stablehlo.convert %1005 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %1007 = stablehlo.broadcast_in_dim %1006, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1008 = stablehlo.broadcast_in_dim %arg247, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1009 = stablehlo.subtract %1007, %1008 : tensor<1x512x32x32xf32>
-    %1010 = stablehlo.broadcast_in_dim %1009, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1011 = stablehlo.broadcast_in_dim %arg248, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1012 = stablehlo.multiply %1010, %1011 : tensor<1x512x32x32xf32>
-    %1013 = stablehlo.convert %arg249 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1014 = stablehlo.broadcast_in_dim %1012, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1015 = stablehlo.broadcast_in_dim %1013, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1016 = stablehlo.multiply %1014, %1015 : tensor<1x512x32x32xf32>
-    %1017 = stablehlo.convert %arg250 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1018 = stablehlo.broadcast_in_dim %1016, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1019 = stablehlo.broadcast_in_dim %1017, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1020 = stablehlo.add %1018, %1019 : tensor<1x512x32x32xf32>
-    %1021 = stablehlo.convert %1020 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %1022 = stablehlo.broadcast_in_dim %1021, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1023 = stablehlo.maximum %643, %1022 : tensor<1x512x32x32xbf16>
-    %1024 = stablehlo.minimum %643, %1022 : tensor<1x512x32x32xbf16>
-    %1025 = stablehlo.broadcast_in_dim %1024, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1026 = stablehlo.multiply %1025, %648 : tensor<1x512x32x32xbf16>
-    %1027 = stablehlo.add %1023, %1026 : tensor<1x512x32x32xbf16>
-    %1028 = stablehlo.add %1027, %981 : tensor<1x512x32x32xbf16>
-    %1029 = stablehlo.convolution(%1028, %arg44) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1030 = stablehlo.convert %1029 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1031 = stablehlo.broadcast_in_dim %1030, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1032 = stablehlo.broadcast_in_dim %arg251, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1033 = stablehlo.subtract %1031, %1032 : tensor<1x1024x16x16xf32>
-    %1034 = stablehlo.broadcast_in_dim %1033, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1035 = stablehlo.broadcast_in_dim %arg252, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1036 = stablehlo.multiply %1034, %1035 : tensor<1x1024x16x16xf32>
-    %1037 = stablehlo.convert %arg253 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1038 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1039 = stablehlo.broadcast_in_dim %1037, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1040 = stablehlo.multiply %1038, %1039 : tensor<1x1024x16x16xf32>
-    %1041 = stablehlo.convert %arg254 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1042 = stablehlo.broadcast_in_dim %1040, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1043 = stablehlo.broadcast_in_dim %1041, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1044 = stablehlo.add %1042, %1043 : tensor<1x1024x16x16xf32>
-    %1045 = stablehlo.convert %1044 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1046 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x1024x16x16xbf16>
-    %1047 = stablehlo.broadcast_in_dim %1045, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1048 = stablehlo.maximum %1046, %1047 : tensor<1x1024x16x16xbf16>
-    %1049 = stablehlo.minimum %1046, %1047 : tensor<1x1024x16x16xbf16>
-    %1050 = stablehlo.broadcast_in_dim %1049, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1051 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x1024x16x16xbf16>
-    %1052 = stablehlo.multiply %1050, %1051 : tensor<1x1024x16x16xbf16>
-    %1053 = stablehlo.add %1048, %1052 : tensor<1x1024x16x16xbf16>
-    %1054 = stablehlo.convolution(%1053, %arg45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x16x16xbf16>
-    %1055 = stablehlo.convert %1054 : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xf32>
-    %1056 = stablehlo.broadcast_in_dim %1055, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1057 = stablehlo.broadcast_in_dim %arg255, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1058 = stablehlo.subtract %1056, %1057 : tensor<1x512x16x16xf32>
-    %1059 = stablehlo.broadcast_in_dim %1058, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1060 = stablehlo.broadcast_in_dim %arg256, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1061 = stablehlo.multiply %1059, %1060 : tensor<1x512x16x16xf32>
-    %1062 = stablehlo.convert %arg257 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1063 = stablehlo.broadcast_in_dim %1061, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1064 = stablehlo.broadcast_in_dim %1062, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1065 = stablehlo.multiply %1063, %1064 : tensor<1x512x16x16xf32>
-    %1066 = stablehlo.convert %arg258 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1067 = stablehlo.broadcast_in_dim %1065, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1068 = stablehlo.broadcast_in_dim %1066, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1069 = stablehlo.add %1067, %1068 : tensor<1x512x16x16xf32>
-    %1070 = stablehlo.convert %1069 : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xbf16>
-    %1071 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x512x16x16xbf16>
-    %1072 = stablehlo.broadcast_in_dim %1070, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1073 = stablehlo.maximum %1071, %1072 : tensor<1x512x16x16xbf16>
-    %1074 = stablehlo.minimum %1071, %1072 : tensor<1x512x16x16xbf16>
-    %1075 = stablehlo.broadcast_in_dim %1074, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1076 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x512x16x16xbf16>
-    %1077 = stablehlo.multiply %1075, %1076 : tensor<1x512x16x16xbf16>
-    %1078 = stablehlo.add %1073, %1077 : tensor<1x512x16x16xbf16>
-    %1079 = stablehlo.convolution(%1078, %arg46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1080 = stablehlo.convert %1079 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1081 = stablehlo.broadcast_in_dim %1080, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1082 = stablehlo.broadcast_in_dim %arg259, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1083 = stablehlo.subtract %1081, %1082 : tensor<1x1024x16x16xf32>
-    %1084 = stablehlo.broadcast_in_dim %1083, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1085 = stablehlo.broadcast_in_dim %arg260, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1086 = stablehlo.multiply %1084, %1085 : tensor<1x1024x16x16xf32>
-    %1087 = stablehlo.convert %arg261 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1088 = stablehlo.broadcast_in_dim %1086, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1089 = stablehlo.broadcast_in_dim %1087, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1090 = stablehlo.multiply %1088, %1089 : tensor<1x1024x16x16xf32>
-    %1091 = stablehlo.convert %arg262 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1092 = stablehlo.broadcast_in_dim %1090, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1093 = stablehlo.broadcast_in_dim %1091, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1094 = stablehlo.add %1092, %1093 : tensor<1x1024x16x16xf32>
-    %1095 = stablehlo.convert %1094 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1096 = stablehlo.broadcast_in_dim %1095, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1097 = stablehlo.maximum %1046, %1096 : tensor<1x1024x16x16xbf16>
-    %1098 = stablehlo.minimum %1046, %1096 : tensor<1x1024x16x16xbf16>
-    %1099 = stablehlo.broadcast_in_dim %1098, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1100 = stablehlo.multiply %1099, %1051 : tensor<1x1024x16x16xbf16>
-    %1101 = stablehlo.add %1097, %1100 : tensor<1x1024x16x16xbf16>
-    %1102 = stablehlo.add %1101, %1053 : tensor<1x1024x16x16xbf16>
-    %1103 = stablehlo.convolution(%1102, %arg47) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x16x16xbf16>
-    %1104 = stablehlo.convert %1103 : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xf32>
-    %1105 = stablehlo.broadcast_in_dim %1104, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1106 = stablehlo.broadcast_in_dim %arg263, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1107 = stablehlo.subtract %1105, %1106 : tensor<1x512x16x16xf32>
-    %1108 = stablehlo.broadcast_in_dim %1107, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1109 = stablehlo.broadcast_in_dim %arg264, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1110 = stablehlo.multiply %1108, %1109 : tensor<1x512x16x16xf32>
-    %1111 = stablehlo.convert %arg265 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1112 = stablehlo.broadcast_in_dim %1110, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1113 = stablehlo.broadcast_in_dim %1111, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1114 = stablehlo.multiply %1112, %1113 : tensor<1x512x16x16xf32>
-    %1115 = stablehlo.convert %arg266 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1116 = stablehlo.broadcast_in_dim %1114, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1117 = stablehlo.broadcast_in_dim %1115, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1118 = stablehlo.add %1116, %1117 : tensor<1x512x16x16xf32>
-    %1119 = stablehlo.convert %1118 : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xbf16>
-    %1120 = stablehlo.broadcast_in_dim %1119, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1121 = stablehlo.maximum %1071, %1120 : tensor<1x512x16x16xbf16>
-    %1122 = stablehlo.minimum %1071, %1120 : tensor<1x512x16x16xbf16>
-    %1123 = stablehlo.broadcast_in_dim %1122, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1124 = stablehlo.multiply %1123, %1076 : tensor<1x512x16x16xbf16>
-    %1125 = stablehlo.add %1121, %1124 : tensor<1x512x16x16xbf16>
-    %1126 = stablehlo.convolution(%1125, %arg48) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1127 = stablehlo.convert %1126 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1128 = stablehlo.broadcast_in_dim %1127, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1129 = stablehlo.broadcast_in_dim %arg267, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1130 = stablehlo.subtract %1128, %1129 : tensor<1x1024x16x16xf32>
-    %1131 = stablehlo.broadcast_in_dim %1130, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1132 = stablehlo.broadcast_in_dim %arg268, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1133 = stablehlo.multiply %1131, %1132 : tensor<1x1024x16x16xf32>
-    %1134 = stablehlo.convert %arg269 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1135 = stablehlo.broadcast_in_dim %1133, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1136 = stablehlo.broadcast_in_dim %1134, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1137 = stablehlo.multiply %1135, %1136 : tensor<1x1024x16x16xf32>
-    %1138 = stablehlo.convert %arg270 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1139 = stablehlo.broadcast_in_dim %1137, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1140 = stablehlo.broadcast_in_dim %1138, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1141 = stablehlo.add %1139, %1140 : tensor<1x1024x16x16xf32>
-    %1142 = stablehlo.convert %1141 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1143 = stablehlo.broadcast_in_dim %1142, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1144 = stablehlo.maximum %1046, %1143 : tensor<1x1024x16x16xbf16>
-    %1145 = stablehlo.minimum %1046, %1143 : tensor<1x1024x16x16xbf16>
-    %1146 = stablehlo.broadcast_in_dim %1145, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1147 = stablehlo.multiply %1146, %1051 : tensor<1x1024x16x16xbf16>
-    %1148 = stablehlo.add %1144, %1147 : tensor<1x1024x16x16xbf16>
-    %1149 = stablehlo.add %1148, %1102 : tensor<1x1024x16x16xbf16>
-    %1150 = stablehlo.convolution(%1149, %arg49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x16x16xbf16>
-    %1151 = stablehlo.convert %1150 : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xf32>
-    %1152 = stablehlo.broadcast_in_dim %1151, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1153 = stablehlo.broadcast_in_dim %arg271, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1154 = stablehlo.subtract %1152, %1153 : tensor<1x512x16x16xf32>
-    %1155 = stablehlo.broadcast_in_dim %1154, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1156 = stablehlo.broadcast_in_dim %arg272, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1157 = stablehlo.multiply %1155, %1156 : tensor<1x512x16x16xf32>
-    %1158 = stablehlo.convert %arg273 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1159 = stablehlo.broadcast_in_dim %1157, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1160 = stablehlo.broadcast_in_dim %1158, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1161 = stablehlo.multiply %1159, %1160 : tensor<1x512x16x16xf32>
-    %1162 = stablehlo.convert %arg274 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1163 = stablehlo.broadcast_in_dim %1161, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1164 = stablehlo.broadcast_in_dim %1162, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1165 = stablehlo.add %1163, %1164 : tensor<1x512x16x16xf32>
-    %1166 = stablehlo.convert %1165 : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xbf16>
-    %1167 = stablehlo.broadcast_in_dim %1166, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1168 = stablehlo.maximum %1071, %1167 : tensor<1x512x16x16xbf16>
-    %1169 = stablehlo.minimum %1071, %1167 : tensor<1x512x16x16xbf16>
-    %1170 = stablehlo.broadcast_in_dim %1169, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1171 = stablehlo.multiply %1170, %1076 : tensor<1x512x16x16xbf16>
-    %1172 = stablehlo.add %1168, %1171 : tensor<1x512x16x16xbf16>
-    %1173 = stablehlo.convolution(%1172, %arg50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1174 = stablehlo.convert %1173 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1175 = stablehlo.broadcast_in_dim %1174, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1176 = stablehlo.broadcast_in_dim %arg275, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1177 = stablehlo.subtract %1175, %1176 : tensor<1x1024x16x16xf32>
-    %1178 = stablehlo.broadcast_in_dim %1177, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1179 = stablehlo.broadcast_in_dim %arg276, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1180 = stablehlo.multiply %1178, %1179 : tensor<1x1024x16x16xf32>
-    %1181 = stablehlo.convert %arg277 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1182 = stablehlo.broadcast_in_dim %1180, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1183 = stablehlo.broadcast_in_dim %1181, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1184 = stablehlo.multiply %1182, %1183 : tensor<1x1024x16x16xf32>
-    %1185 = stablehlo.convert %arg278 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1186 = stablehlo.broadcast_in_dim %1184, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1187 = stablehlo.broadcast_in_dim %1185, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1188 = stablehlo.add %1186, %1187 : tensor<1x1024x16x16xf32>
-    %1189 = stablehlo.convert %1188 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1190 = stablehlo.broadcast_in_dim %1189, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1191 = stablehlo.maximum %1046, %1190 : tensor<1x1024x16x16xbf16>
-    %1192 = stablehlo.minimum %1046, %1190 : tensor<1x1024x16x16xbf16>
-    %1193 = stablehlo.broadcast_in_dim %1192, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1194 = stablehlo.multiply %1193, %1051 : tensor<1x1024x16x16xbf16>
-    %1195 = stablehlo.add %1191, %1194 : tensor<1x1024x16x16xbf16>
-    %1196 = stablehlo.add %1195, %1149 : tensor<1x1024x16x16xbf16>
-    %1197 = stablehlo.convolution(%1196, %arg51) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x16x16xbf16>
-    %1198 = stablehlo.convert %1197 : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xf32>
-    %1199 = stablehlo.broadcast_in_dim %1198, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1200 = stablehlo.broadcast_in_dim %arg279, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1201 = stablehlo.subtract %1199, %1200 : tensor<1x512x16x16xf32>
-    %1202 = stablehlo.broadcast_in_dim %1201, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1203 = stablehlo.broadcast_in_dim %arg280, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1204 = stablehlo.multiply %1202, %1203 : tensor<1x512x16x16xf32>
-    %1205 = stablehlo.convert %arg281 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1206 = stablehlo.broadcast_in_dim %1204, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1207 = stablehlo.broadcast_in_dim %1205, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1208 = stablehlo.multiply %1206, %1207 : tensor<1x512x16x16xf32>
-    %1209 = stablehlo.convert %arg282 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1210 = stablehlo.broadcast_in_dim %1208, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1211 = stablehlo.broadcast_in_dim %1209, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1212 = stablehlo.add %1210, %1211 : tensor<1x512x16x16xf32>
-    %1213 = stablehlo.convert %1212 : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xbf16>
-    %1214 = stablehlo.broadcast_in_dim %1213, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1215 = stablehlo.maximum %1071, %1214 : tensor<1x512x16x16xbf16>
-    %1216 = stablehlo.minimum %1071, %1214 : tensor<1x512x16x16xbf16>
-    %1217 = stablehlo.broadcast_in_dim %1216, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1218 = stablehlo.multiply %1217, %1076 : tensor<1x512x16x16xbf16>
-    %1219 = stablehlo.add %1215, %1218 : tensor<1x512x16x16xbf16>
-    %1220 = stablehlo.convolution(%1219, %arg52) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1221 = stablehlo.convert %1220 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1222 = stablehlo.broadcast_in_dim %1221, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1223 = stablehlo.broadcast_in_dim %arg283, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1224 = stablehlo.subtract %1222, %1223 : tensor<1x1024x16x16xf32>
-    %1225 = stablehlo.broadcast_in_dim %1224, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1226 = stablehlo.broadcast_in_dim %arg284, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1227 = stablehlo.multiply %1225, %1226 : tensor<1x1024x16x16xf32>
-    %1228 = stablehlo.convert %arg285 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1229 = stablehlo.broadcast_in_dim %1227, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1230 = stablehlo.broadcast_in_dim %1228, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1231 = stablehlo.multiply %1229, %1230 : tensor<1x1024x16x16xf32>
-    %1232 = stablehlo.convert %arg286 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1233 = stablehlo.broadcast_in_dim %1231, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1234 = stablehlo.broadcast_in_dim %1232, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1235 = stablehlo.add %1233, %1234 : tensor<1x1024x16x16xf32>
-    %1236 = stablehlo.convert %1235 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1237 = stablehlo.broadcast_in_dim %1236, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1238 = stablehlo.maximum %1046, %1237 : tensor<1x1024x16x16xbf16>
-    %1239 = stablehlo.minimum %1046, %1237 : tensor<1x1024x16x16xbf16>
-    %1240 = stablehlo.broadcast_in_dim %1239, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1241 = stablehlo.multiply %1240, %1051 : tensor<1x1024x16x16xbf16>
-    %1242 = stablehlo.add %1238, %1241 : tensor<1x1024x16x16xbf16>
-    %1243 = stablehlo.add %1242, %1196 : tensor<1x1024x16x16xbf16>
-    %1244 = stablehlo.convolution(%1243, %arg53) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x16x16xbf16>
-    %1245 = stablehlo.convert %1244 : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xf32>
-    %1246 = stablehlo.broadcast_in_dim %1245, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1247 = stablehlo.broadcast_in_dim %arg287, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1248 = stablehlo.subtract %1246, %1247 : tensor<1x512x16x16xf32>
-    %1249 = stablehlo.broadcast_in_dim %1248, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1250 = stablehlo.broadcast_in_dim %arg288, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1251 = stablehlo.multiply %1249, %1250 : tensor<1x512x16x16xf32>
-    %1252 = stablehlo.convert %arg289 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1253 = stablehlo.broadcast_in_dim %1251, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1254 = stablehlo.broadcast_in_dim %1252, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1255 = stablehlo.multiply %1253, %1254 : tensor<1x512x16x16xf32>
-    %1256 = stablehlo.convert %arg290 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1257 = stablehlo.broadcast_in_dim %1255, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1258 = stablehlo.broadcast_in_dim %1256, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1259 = stablehlo.add %1257, %1258 : tensor<1x512x16x16xf32>
-    %1260 = stablehlo.convert %1259 : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xbf16>
-    %1261 = stablehlo.broadcast_in_dim %1260, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1262 = stablehlo.maximum %1071, %1261 : tensor<1x512x16x16xbf16>
-    %1263 = stablehlo.minimum %1071, %1261 : tensor<1x512x16x16xbf16>
-    %1264 = stablehlo.broadcast_in_dim %1263, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1265 = stablehlo.multiply %1264, %1076 : tensor<1x512x16x16xbf16>
-    %1266 = stablehlo.add %1262, %1265 : tensor<1x512x16x16xbf16>
-    %1267 = stablehlo.convolution(%1266, %arg54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1268 = stablehlo.convert %1267 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1269 = stablehlo.broadcast_in_dim %1268, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1270 = stablehlo.broadcast_in_dim %arg291, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1271 = stablehlo.subtract %1269, %1270 : tensor<1x1024x16x16xf32>
-    %1272 = stablehlo.broadcast_in_dim %1271, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1273 = stablehlo.broadcast_in_dim %arg292, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1274 = stablehlo.multiply %1272, %1273 : tensor<1x1024x16x16xf32>
-    %1275 = stablehlo.convert %arg293 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1276 = stablehlo.broadcast_in_dim %1274, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1277 = stablehlo.broadcast_in_dim %1275, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1278 = stablehlo.multiply %1276, %1277 : tensor<1x1024x16x16xf32>
-    %1279 = stablehlo.convert %arg294 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1280 = stablehlo.broadcast_in_dim %1278, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1281 = stablehlo.broadcast_in_dim %1279, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1282 = stablehlo.add %1280, %1281 : tensor<1x1024x16x16xf32>
-    %1283 = stablehlo.convert %1282 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1284 = stablehlo.broadcast_in_dim %1283, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1285 = stablehlo.maximum %1046, %1284 : tensor<1x1024x16x16xbf16>
-    %1286 = stablehlo.minimum %1046, %1284 : tensor<1x1024x16x16xbf16>
-    %1287 = stablehlo.broadcast_in_dim %1286, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1288 = stablehlo.multiply %1287, %1051 : tensor<1x1024x16x16xbf16>
-    %1289 = stablehlo.add %1285, %1288 : tensor<1x1024x16x16xbf16>
-    %1290 = stablehlo.convolution(%1289, %arg55) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x16x16xbf16>
-    %1291 = stablehlo.convert %1290 : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xf32>
-    %1292 = stablehlo.broadcast_in_dim %1291, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1293 = stablehlo.broadcast_in_dim %arg295, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1294 = stablehlo.subtract %1292, %1293 : tensor<1x512x16x16xf32>
-    %1295 = stablehlo.broadcast_in_dim %1294, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1296 = stablehlo.broadcast_in_dim %arg296, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1297 = stablehlo.multiply %1295, %1296 : tensor<1x512x16x16xf32>
-    %1298 = stablehlo.convert %arg297 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1299 = stablehlo.broadcast_in_dim %1297, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1300 = stablehlo.broadcast_in_dim %1298, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1301 = stablehlo.multiply %1299, %1300 : tensor<1x512x16x16xf32>
-    %1302 = stablehlo.convert %arg298 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1303 = stablehlo.broadcast_in_dim %1301, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1304 = stablehlo.broadcast_in_dim %1302, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1305 = stablehlo.add %1303, %1304 : tensor<1x512x16x16xf32>
-    %1306 = stablehlo.convert %1305 : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xbf16>
-    %1307 = stablehlo.broadcast_in_dim %1306, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1308 = stablehlo.maximum %1071, %1307 : tensor<1x512x16x16xbf16>
-    %1309 = stablehlo.minimum %1071, %1307 : tensor<1x512x16x16xbf16>
-    %1310 = stablehlo.broadcast_in_dim %1309, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1311 = stablehlo.multiply %1310, %1076 : tensor<1x512x16x16xbf16>
-    %1312 = stablehlo.add %1308, %1311 : tensor<1x512x16x16xbf16>
-    %1313 = stablehlo.convolution(%1312, %arg56) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1314 = stablehlo.convert %1313 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1315 = stablehlo.broadcast_in_dim %1314, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1316 = stablehlo.broadcast_in_dim %arg299, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1317 = stablehlo.subtract %1315, %1316 : tensor<1x1024x16x16xf32>
-    %1318 = stablehlo.broadcast_in_dim %1317, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1319 = stablehlo.broadcast_in_dim %arg300, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1320 = stablehlo.multiply %1318, %1319 : tensor<1x1024x16x16xf32>
-    %1321 = stablehlo.convert %arg301 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1322 = stablehlo.broadcast_in_dim %1320, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1323 = stablehlo.broadcast_in_dim %1321, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1324 = stablehlo.multiply %1322, %1323 : tensor<1x1024x16x16xf32>
-    %1325 = stablehlo.convert %arg302 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1326 = stablehlo.broadcast_in_dim %1324, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1327 = stablehlo.broadcast_in_dim %1325, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1328 = stablehlo.add %1326, %1327 : tensor<1x1024x16x16xf32>
-    %1329 = stablehlo.convert %1328 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1330 = stablehlo.broadcast_in_dim %1329, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1331 = stablehlo.maximum %1046, %1330 : tensor<1x1024x16x16xbf16>
-    %1332 = stablehlo.minimum %1046, %1330 : tensor<1x1024x16x16xbf16>
-    %1333 = stablehlo.broadcast_in_dim %1332, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1334 = stablehlo.multiply %1333, %1051 : tensor<1x1024x16x16xbf16>
-    %1335 = stablehlo.add %1331, %1334 : tensor<1x1024x16x16xbf16>
-    %1336 = stablehlo.convolution(%1335, %arg57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<512x1024x1x1xbf16>) -> tensor<1x512x16x16xbf16>
-    %1337 = stablehlo.convert %1336 : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xf32>
-    %1338 = stablehlo.broadcast_in_dim %1337, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1339 = stablehlo.broadcast_in_dim %arg303, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1340 = stablehlo.subtract %1338, %1339 : tensor<1x512x16x16xf32>
-    %1341 = stablehlo.broadcast_in_dim %1340, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1342 = stablehlo.broadcast_in_dim %arg304, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1343 = stablehlo.multiply %1341, %1342 : tensor<1x512x16x16xf32>
-    %1344 = stablehlo.convert %arg305 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1345 = stablehlo.broadcast_in_dim %1343, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1346 = stablehlo.broadcast_in_dim %1344, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1347 = stablehlo.multiply %1345, %1346 : tensor<1x512x16x16xf32>
-    %1348 = stablehlo.convert %arg306 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1349 = stablehlo.broadcast_in_dim %1347, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xf32>
-    %1350 = stablehlo.broadcast_in_dim %1348, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x16x16xf32>
-    %1351 = stablehlo.add %1349, %1350 : tensor<1x512x16x16xf32>
-    %1352 = stablehlo.convert %1351 : (tensor<1x512x16x16xf32>) -> tensor<1x512x16x16xbf16>
-    %1353 = stablehlo.broadcast_in_dim %1352, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1354 = stablehlo.maximum %1071, %1353 : tensor<1x512x16x16xbf16>
-    %1355 = stablehlo.minimum %1071, %1353 : tensor<1x512x16x16xbf16>
-    %1356 = stablehlo.broadcast_in_dim %1355, dims = [0, 1, 2, 3] : (tensor<1x512x16x16xbf16>) -> tensor<1x512x16x16xbf16>
-    %1357 = stablehlo.multiply %1356, %1076 : tensor<1x512x16x16xbf16>
-    %1358 = stablehlo.add %1354, %1357 : tensor<1x512x16x16xbf16>
-    %1359 = stablehlo.convolution(%1358, %arg58) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<1024x512x3x3xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1360 = stablehlo.convert %1359 : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xf32>
-    %1361 = stablehlo.broadcast_in_dim %1360, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1362 = stablehlo.broadcast_in_dim %arg307, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1363 = stablehlo.subtract %1361, %1362 : tensor<1x1024x16x16xf32>
-    %1364 = stablehlo.broadcast_in_dim %1363, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1365 = stablehlo.broadcast_in_dim %arg308, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1366 = stablehlo.multiply %1364, %1365 : tensor<1x1024x16x16xf32>
-    %1367 = stablehlo.convert %arg309 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1368 = stablehlo.broadcast_in_dim %1366, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1369 = stablehlo.broadcast_in_dim %1367, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1370 = stablehlo.multiply %1368, %1369 : tensor<1x1024x16x16xf32>
-    %1371 = stablehlo.convert %arg310 : (tensor<1024x1x1xbf16>) -> tensor<1024x1x1xf32>
-    %1372 = stablehlo.broadcast_in_dim %1370, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xf32>
-    %1373 = stablehlo.broadcast_in_dim %1371, dims = [1, 2, 3] : (tensor<1024x1x1xf32>) -> tensor<1x1024x16x16xf32>
-    %1374 = stablehlo.add %1372, %1373 : tensor<1x1024x16x16xf32>
-    %1375 = stablehlo.convert %1374 : (tensor<1x1024x16x16xf32>) -> tensor<1x1024x16x16xbf16>
-    %1376 = stablehlo.broadcast_in_dim %1375, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1377 = stablehlo.maximum %1046, %1376 : tensor<1x1024x16x16xbf16>
-    %1378 = stablehlo.minimum %1046, %1376 : tensor<1x1024x16x16xbf16>
-    %1379 = stablehlo.broadcast_in_dim %1378, dims = [0, 1, 2, 3] : (tensor<1x1024x16x16xbf16>) -> tensor<1x1024x16x16xbf16>
-    %1380 = stablehlo.multiply %1379, %1051 : tensor<1x1024x16x16xbf16>
-    %1381 = stablehlo.add %1377, %1380 : tensor<1x1024x16x16xbf16>
-    %1382 = stablehlo.convolution(%1381, %arg59) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x1024x16x16xbf16>, tensor<255x1024x1x1xbf16>) -> tensor<1x255x16x16xbf16>
-    %1383 = stablehlo.reshape %arg60 : (tensor<255xbf16>) -> tensor<255x1x1xbf16>
-    %1384 = stablehlo.broadcast_in_dim %1382, dims = [0, 1, 2, 3] : (tensor<1x255x16x16xbf16>) -> tensor<1x255x16x16xbf16>
-    %1385 = stablehlo.broadcast_in_dim %1383, dims = [1, 2, 3] : (tensor<255x1x1xbf16>) -> tensor<1x255x16x16xbf16>
-    %1386 = stablehlo.add %1384, %1385 : tensor<1x255x16x16xbf16>
-    %1387 = stablehlo.convolution(%1358, %arg61) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x16x16xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x16x16xbf16>
-    %1388 = stablehlo.convert %1387 : (tensor<1x256x16x16xbf16>) -> tensor<1x256x16x16xf32>
-    %1389 = stablehlo.broadcast_in_dim %1388, dims = [0, 1, 2, 3] : (tensor<1x256x16x16xf32>) -> tensor<1x256x16x16xf32>
-    %1390 = stablehlo.broadcast_in_dim %arg311, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x16x16xf32>
-    %1391 = stablehlo.subtract %1389, %1390 : tensor<1x256x16x16xf32>
-    %1392 = stablehlo.broadcast_in_dim %1391, dims = [0, 1, 2, 3] : (tensor<1x256x16x16xf32>) -> tensor<1x256x16x16xf32>
-    %1393 = stablehlo.broadcast_in_dim %arg312, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x16x16xf32>
-    %1394 = stablehlo.multiply %1392, %1393 : tensor<1x256x16x16xf32>
-    %1395 = stablehlo.convert %arg313 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1396 = stablehlo.broadcast_in_dim %1394, dims = [0, 1, 2, 3] : (tensor<1x256x16x16xf32>) -> tensor<1x256x16x16xf32>
-    %1397 = stablehlo.broadcast_in_dim %1395, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x16x16xf32>
-    %1398 = stablehlo.multiply %1396, %1397 : tensor<1x256x16x16xf32>
-    %1399 = stablehlo.convert %arg314 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1400 = stablehlo.broadcast_in_dim %1398, dims = [0, 1, 2, 3] : (tensor<1x256x16x16xf32>) -> tensor<1x256x16x16xf32>
-    %1401 = stablehlo.broadcast_in_dim %1399, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x16x16xf32>
-    %1402 = stablehlo.add %1400, %1401 : tensor<1x256x16x16xf32>
-    %1403 = stablehlo.convert %1402 : (tensor<1x256x16x16xf32>) -> tensor<1x256x16x16xbf16>
-    %1404 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x256x16x16xbf16>
-    %1405 = stablehlo.broadcast_in_dim %1403, dims = [0, 1, 2, 3] : (tensor<1x256x16x16xbf16>) -> tensor<1x256x16x16xbf16>
-    %1406 = stablehlo.maximum %1404, %1405 : tensor<1x256x16x16xbf16>
-    %1407 = stablehlo.minimum %1404, %1405 : tensor<1x256x16x16xbf16>
-    %1408 = stablehlo.broadcast_in_dim %1407, dims = [0, 1, 2, 3] : (tensor<1x256x16x16xbf16>) -> tensor<1x256x16x16xbf16>
-    %1409 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x256x16x16xbf16>
-    %1410 = stablehlo.multiply %1408, %1409 : tensor<1x256x16x16xbf16>
-    %1411 = stablehlo.add %1406, %1410 : tensor<1x256x16x16xbf16>
-    %1412 = stablehlo.transpose %1411, dims = [0, 1, 3, 2] : (tensor<1x256x16x16xbf16>) -> tensor<1x256x16x16xbf16>
-    %1413 = stablehlo.reshape %1412 : (tensor<1x256x16x16xbf16>) -> tensor<256x16x16xbf16>
-    %1414 = stablehlo.broadcast_in_dim %arg315, dims = [0, 1, 2] : (tensor<256x16x32xbf16>) -> tensor<256x16x32xbf16>
-    %1415 = stablehlo.dot_general %1413, %1414, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x16x16xbf16>, tensor<256x16x32xbf16>) -> tensor<256x16x32xbf16>
-    %1416 = stablehlo.reshape %1415 : (tensor<256x16x32xbf16>) -> tensor<1x256x16x32xbf16>
-    %1417 = stablehlo.transpose %1416, dims = [0, 1, 3, 2] : (tensor<1x256x16x32xbf16>) -> tensor<1x256x32x16xbf16>
-    %1418 = stablehlo.reshape %1417 : (tensor<1x256x32x16xbf16>) -> tensor<256x32x16xbf16>
-    %1419 = stablehlo.broadcast_in_dim %arg316, dims = [0, 1, 2] : (tensor<256x16x32xbf16>) -> tensor<256x16x32xbf16>
-    %1420 = stablehlo.dot_general %1418, %1419, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<256x32x16xbf16>, tensor<256x16x32xbf16>) -> tensor<256x32x32xbf16>
-    %1421 = stablehlo.reshape %1420 : (tensor<256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1422 = stablehlo.concatenate %1421, %1028, dim = 1 : (tensor<1x256x32x32xbf16>, tensor<1x512x32x32xbf16>) -> tensor<1x768x32x32xbf16>
-    %1423 = stablehlo.convolution(%1422, %arg62) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x768x32x32xbf16>, tensor<256x768x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %1424 = stablehlo.convert %1423 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %1425 = stablehlo.broadcast_in_dim %1424, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1426 = stablehlo.broadcast_in_dim %arg317, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1427 = stablehlo.subtract %1425, %1426 : tensor<1x256x32x32xf32>
-    %1428 = stablehlo.broadcast_in_dim %1427, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1429 = stablehlo.broadcast_in_dim %arg318, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1430 = stablehlo.multiply %1428, %1429 : tensor<1x256x32x32xf32>
-    %1431 = stablehlo.convert %arg319 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1432 = stablehlo.broadcast_in_dim %1430, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1433 = stablehlo.broadcast_in_dim %1431, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1434 = stablehlo.multiply %1432, %1433 : tensor<1x256x32x32xf32>
-    %1435 = stablehlo.convert %arg320 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1436 = stablehlo.broadcast_in_dim %1434, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1437 = stablehlo.broadcast_in_dim %1435, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1438 = stablehlo.add %1436, %1437 : tensor<1x256x32x32xf32>
-    %1439 = stablehlo.convert %1438 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %1440 = stablehlo.broadcast_in_dim %1439, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1441 = stablehlo.maximum %668, %1440 : tensor<1x256x32x32xbf16>
-    %1442 = stablehlo.minimum %668, %1440 : tensor<1x256x32x32xbf16>
-    %1443 = stablehlo.broadcast_in_dim %1442, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1444 = stablehlo.multiply %1443, %673 : tensor<1x256x32x32xbf16>
-    %1445 = stablehlo.add %1441, %1444 : tensor<1x256x32x32xbf16>
-    %1446 = stablehlo.convolution(%1445, %arg63) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %1447 = stablehlo.convert %1446 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %1448 = stablehlo.broadcast_in_dim %1447, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1449 = stablehlo.broadcast_in_dim %arg321, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1450 = stablehlo.subtract %1448, %1449 : tensor<1x512x32x32xf32>
-    %1451 = stablehlo.broadcast_in_dim %1450, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1452 = stablehlo.broadcast_in_dim %arg322, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1453 = stablehlo.multiply %1451, %1452 : tensor<1x512x32x32xf32>
-    %1454 = stablehlo.convert %arg323 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1455 = stablehlo.broadcast_in_dim %1453, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1456 = stablehlo.broadcast_in_dim %1454, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1457 = stablehlo.multiply %1455, %1456 : tensor<1x512x32x32xf32>
-    %1458 = stablehlo.convert %arg324 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1459 = stablehlo.broadcast_in_dim %1457, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1460 = stablehlo.broadcast_in_dim %1458, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1461 = stablehlo.add %1459, %1460 : tensor<1x512x32x32xf32>
-    %1462 = stablehlo.convert %1461 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %1463 = stablehlo.broadcast_in_dim %1462, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1464 = stablehlo.maximum %643, %1463 : tensor<1x512x32x32xbf16>
-    %1465 = stablehlo.minimum %643, %1463 : tensor<1x512x32x32xbf16>
-    %1466 = stablehlo.broadcast_in_dim %1465, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1467 = stablehlo.multiply %1466, %648 : tensor<1x512x32x32xbf16>
-    %1468 = stablehlo.add %1464, %1467 : tensor<1x512x32x32xbf16>
-    %1469 = stablehlo.convolution(%1468, %arg64) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %1470 = stablehlo.convert %1469 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %1471 = stablehlo.broadcast_in_dim %1470, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1472 = stablehlo.broadcast_in_dim %arg325, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1473 = stablehlo.subtract %1471, %1472 : tensor<1x256x32x32xf32>
-    %1474 = stablehlo.broadcast_in_dim %1473, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1475 = stablehlo.broadcast_in_dim %arg326, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1476 = stablehlo.multiply %1474, %1475 : tensor<1x256x32x32xf32>
-    %1477 = stablehlo.convert %arg327 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1478 = stablehlo.broadcast_in_dim %1476, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1479 = stablehlo.broadcast_in_dim %1477, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1480 = stablehlo.multiply %1478, %1479 : tensor<1x256x32x32xf32>
-    %1481 = stablehlo.convert %arg328 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1482 = stablehlo.broadcast_in_dim %1480, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1483 = stablehlo.broadcast_in_dim %1481, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1484 = stablehlo.add %1482, %1483 : tensor<1x256x32x32xf32>
-    %1485 = stablehlo.convert %1484 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %1486 = stablehlo.broadcast_in_dim %1485, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1487 = stablehlo.maximum %668, %1486 : tensor<1x256x32x32xbf16>
-    %1488 = stablehlo.minimum %668, %1486 : tensor<1x256x32x32xbf16>
-    %1489 = stablehlo.broadcast_in_dim %1488, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1490 = stablehlo.multiply %1489, %673 : tensor<1x256x32x32xbf16>
-    %1491 = stablehlo.add %1487, %1490 : tensor<1x256x32x32xbf16>
-    %1492 = stablehlo.convolution(%1491, %arg65) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %1493 = stablehlo.convert %1492 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %1494 = stablehlo.broadcast_in_dim %1493, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1495 = stablehlo.broadcast_in_dim %arg329, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1496 = stablehlo.subtract %1494, %1495 : tensor<1x512x32x32xf32>
-    %1497 = stablehlo.broadcast_in_dim %1496, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1498 = stablehlo.broadcast_in_dim %arg330, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1499 = stablehlo.multiply %1497, %1498 : tensor<1x512x32x32xf32>
-    %1500 = stablehlo.convert %arg331 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1501 = stablehlo.broadcast_in_dim %1499, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1502 = stablehlo.broadcast_in_dim %1500, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1503 = stablehlo.multiply %1501, %1502 : tensor<1x512x32x32xf32>
-    %1504 = stablehlo.convert %arg332 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1505 = stablehlo.broadcast_in_dim %1503, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1506 = stablehlo.broadcast_in_dim %1504, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1507 = stablehlo.add %1505, %1506 : tensor<1x512x32x32xf32>
-    %1508 = stablehlo.convert %1507 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %1509 = stablehlo.broadcast_in_dim %1508, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1510 = stablehlo.maximum %643, %1509 : tensor<1x512x32x32xbf16>
-    %1511 = stablehlo.minimum %643, %1509 : tensor<1x512x32x32xbf16>
-    %1512 = stablehlo.broadcast_in_dim %1511, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1513 = stablehlo.multiply %1512, %648 : tensor<1x512x32x32xbf16>
-    %1514 = stablehlo.add %1510, %1513 : tensor<1x512x32x32xbf16>
-    %1515 = stablehlo.convolution(%1514, %arg66) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<256x512x1x1xbf16>) -> tensor<1x256x32x32xbf16>
-    %1516 = stablehlo.convert %1515 : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xf32>
-    %1517 = stablehlo.broadcast_in_dim %1516, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1518 = stablehlo.broadcast_in_dim %arg333, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1519 = stablehlo.subtract %1517, %1518 : tensor<1x256x32x32xf32>
-    %1520 = stablehlo.broadcast_in_dim %1519, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1521 = stablehlo.broadcast_in_dim %arg334, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1522 = stablehlo.multiply %1520, %1521 : tensor<1x256x32x32xf32>
-    %1523 = stablehlo.convert %arg335 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1524 = stablehlo.broadcast_in_dim %1522, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1525 = stablehlo.broadcast_in_dim %1523, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1526 = stablehlo.multiply %1524, %1525 : tensor<1x256x32x32xf32>
-    %1527 = stablehlo.convert %arg336 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1528 = stablehlo.broadcast_in_dim %1526, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xf32>
-    %1529 = stablehlo.broadcast_in_dim %1527, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x32x32xf32>
-    %1530 = stablehlo.add %1528, %1529 : tensor<1x256x32x32xf32>
-    %1531 = stablehlo.convert %1530 : (tensor<1x256x32x32xf32>) -> tensor<1x256x32x32xbf16>
-    %1532 = stablehlo.broadcast_in_dim %1531, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1533 = stablehlo.maximum %668, %1532 : tensor<1x256x32x32xbf16>
-    %1534 = stablehlo.minimum %668, %1532 : tensor<1x256x32x32xbf16>
-    %1535 = stablehlo.broadcast_in_dim %1534, dims = [0, 1, 2, 3] : (tensor<1x256x32x32xbf16>) -> tensor<1x256x32x32xbf16>
-    %1536 = stablehlo.multiply %1535, %673 : tensor<1x256x32x32xbf16>
-    %1537 = stablehlo.add %1533, %1536 : tensor<1x256x32x32xbf16>
-    %1538 = stablehlo.convolution(%1537, %arg67) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<512x256x3x3xbf16>) -> tensor<1x512x32x32xbf16>
-    %1539 = stablehlo.convert %1538 : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xf32>
-    %1540 = stablehlo.broadcast_in_dim %1539, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1541 = stablehlo.broadcast_in_dim %arg337, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1542 = stablehlo.subtract %1540, %1541 : tensor<1x512x32x32xf32>
-    %1543 = stablehlo.broadcast_in_dim %1542, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1544 = stablehlo.broadcast_in_dim %arg338, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1545 = stablehlo.multiply %1543, %1544 : tensor<1x512x32x32xf32>
-    %1546 = stablehlo.convert %arg339 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1547 = stablehlo.broadcast_in_dim %1545, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1548 = stablehlo.broadcast_in_dim %1546, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1549 = stablehlo.multiply %1547, %1548 : tensor<1x512x32x32xf32>
-    %1550 = stablehlo.convert %arg340 : (tensor<512x1x1xbf16>) -> tensor<512x1x1xf32>
-    %1551 = stablehlo.broadcast_in_dim %1549, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xf32>
-    %1552 = stablehlo.broadcast_in_dim %1550, dims = [1, 2, 3] : (tensor<512x1x1xf32>) -> tensor<1x512x32x32xf32>
-    %1553 = stablehlo.add %1551, %1552 : tensor<1x512x32x32xf32>
-    %1554 = stablehlo.convert %1553 : (tensor<1x512x32x32xf32>) -> tensor<1x512x32x32xbf16>
-    %1555 = stablehlo.broadcast_in_dim %1554, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1556 = stablehlo.maximum %643, %1555 : tensor<1x512x32x32xbf16>
-    %1557 = stablehlo.minimum %643, %1555 : tensor<1x512x32x32xbf16>
-    %1558 = stablehlo.broadcast_in_dim %1557, dims = [0, 1, 2, 3] : (tensor<1x512x32x32xbf16>) -> tensor<1x512x32x32xbf16>
-    %1559 = stablehlo.multiply %1558, %648 : tensor<1x512x32x32xbf16>
-    %1560 = stablehlo.add %1556, %1559 : tensor<1x512x32x32xbf16>
-    %1561 = stablehlo.convolution(%1560, %arg68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x512x32x32xbf16>, tensor<255x512x1x1xbf16>) -> tensor<1x255x32x32xbf16>
-    %1562 = stablehlo.reshape %arg69 : (tensor<255xbf16>) -> tensor<255x1x1xbf16>
-    %1563 = stablehlo.broadcast_in_dim %1561, dims = [0, 1, 2, 3] : (tensor<1x255x32x32xbf16>) -> tensor<1x255x32x32xbf16>
-    %1564 = stablehlo.broadcast_in_dim %1562, dims = [1, 2, 3] : (tensor<255x1x1xbf16>) -> tensor<1x255x32x32xbf16>
-    %1565 = stablehlo.add %1563, %1564 : tensor<1x255x32x32xbf16>
-    %1566 = stablehlo.convolution(%1537, %arg70) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x32x32xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x32x32xbf16>
-    %1567 = stablehlo.convert %1566 : (tensor<1x128x32x32xbf16>) -> tensor<1x128x32x32xf32>
-    %1568 = stablehlo.broadcast_in_dim %1567, dims = [0, 1, 2, 3] : (tensor<1x128x32x32xf32>) -> tensor<1x128x32x32xf32>
-    %1569 = stablehlo.broadcast_in_dim %arg341, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x32x32xf32>
-    %1570 = stablehlo.subtract %1568, %1569 : tensor<1x128x32x32xf32>
-    %1571 = stablehlo.broadcast_in_dim %1570, dims = [0, 1, 2, 3] : (tensor<1x128x32x32xf32>) -> tensor<1x128x32x32xf32>
-    %1572 = stablehlo.broadcast_in_dim %arg342, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x32x32xf32>
-    %1573 = stablehlo.multiply %1571, %1572 : tensor<1x128x32x32xf32>
-    %1574 = stablehlo.convert %arg343 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1575 = stablehlo.broadcast_in_dim %1573, dims = [0, 1, 2, 3] : (tensor<1x128x32x32xf32>) -> tensor<1x128x32x32xf32>
-    %1576 = stablehlo.broadcast_in_dim %1574, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x32x32xf32>
-    %1577 = stablehlo.multiply %1575, %1576 : tensor<1x128x32x32xf32>
-    %1578 = stablehlo.convert %arg344 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1579 = stablehlo.broadcast_in_dim %1577, dims = [0, 1, 2, 3] : (tensor<1x128x32x32xf32>) -> tensor<1x128x32x32xf32>
-    %1580 = stablehlo.broadcast_in_dim %1578, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x32x32xf32>
-    %1581 = stablehlo.add %1579, %1580 : tensor<1x128x32x32xf32>
-    %1582 = stablehlo.convert %1581 : (tensor<1x128x32x32xf32>) -> tensor<1x128x32x32xbf16>
-    %1583 = stablehlo.broadcast_in_dim %17, dims = [] : (tensor<bf16>) -> tensor<1x128x32x32xbf16>
-    %1584 = stablehlo.broadcast_in_dim %1582, dims = [0, 1, 2, 3] : (tensor<1x128x32x32xbf16>) -> tensor<1x128x32x32xbf16>
-    %1585 = stablehlo.maximum %1583, %1584 : tensor<1x128x32x32xbf16>
-    %1586 = stablehlo.minimum %1583, %1584 : tensor<1x128x32x32xbf16>
-    %1587 = stablehlo.broadcast_in_dim %1586, dims = [0, 1, 2, 3] : (tensor<1x128x32x32xbf16>) -> tensor<1x128x32x32xbf16>
-    %1588 = stablehlo.broadcast_in_dim %23, dims = [] : (tensor<bf16>) -> tensor<1x128x32x32xbf16>
-    %1589 = stablehlo.multiply %1587, %1588 : tensor<1x128x32x32xbf16>
-    %1590 = stablehlo.add %1585, %1589 : tensor<1x128x32x32xbf16>
-    %1591 = stablehlo.transpose %1590, dims = [0, 1, 3, 2] : (tensor<1x128x32x32xbf16>) -> tensor<1x128x32x32xbf16>
-    %1592 = stablehlo.reshape %1591 : (tensor<1x128x32x32xbf16>) -> tensor<128x32x32xbf16>
-    %1593 = stablehlo.broadcast_in_dim %arg345, dims = [0, 1, 2] : (tensor<128x32x64xbf16>) -> tensor<128x32x64xbf16>
-    %1594 = stablehlo.dot_general %1592, %1593, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<128x32x32xbf16>, tensor<128x32x64xbf16>) -> tensor<128x32x64xbf16>
-    %1595 = stablehlo.reshape %1594 : (tensor<128x32x64xbf16>) -> tensor<1x128x32x64xbf16>
-    %1596 = stablehlo.transpose %1595, dims = [0, 1, 3, 2] : (tensor<1x128x32x64xbf16>) -> tensor<1x128x64x32xbf16>
-    %1597 = stablehlo.reshape %1596 : (tensor<1x128x64x32xbf16>) -> tensor<128x64x32xbf16>
-    %1598 = stablehlo.broadcast_in_dim %arg346, dims = [0, 1, 2] : (tensor<128x32x64xbf16>) -> tensor<128x32x64xbf16>
-    %1599 = stablehlo.dot_general %1597, %1598, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<128x64x32xbf16>, tensor<128x32x64xbf16>) -> tensor<128x64x64xbf16>
-    %1600 = stablehlo.reshape %1599 : (tensor<128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %1601 = stablehlo.concatenate %1600, %625, dim = 1 : (tensor<1x128x64x64xbf16>, tensor<1x256x64x64xbf16>) -> tensor<1x384x64x64xbf16>
-    %1602 = stablehlo.convolution(%1601, %arg71) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x384x64x64xbf16>, tensor<128x384x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %1603 = stablehlo.convert %1602 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %1604 = stablehlo.broadcast_in_dim %1603, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1605 = stablehlo.broadcast_in_dim %arg347, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1606 = stablehlo.subtract %1604, %1605 : tensor<1x128x64x64xf32>
-    %1607 = stablehlo.broadcast_in_dim %1606, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1608 = stablehlo.broadcast_in_dim %arg348, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1609 = stablehlo.multiply %1607, %1608 : tensor<1x128x64x64xf32>
-    %1610 = stablehlo.convert %arg349 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1611 = stablehlo.broadcast_in_dim %1609, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1612 = stablehlo.broadcast_in_dim %1610, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1613 = stablehlo.multiply %1611, %1612 : tensor<1x128x64x64xf32>
-    %1614 = stablehlo.convert %arg350 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1615 = stablehlo.broadcast_in_dim %1613, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1616 = stablehlo.broadcast_in_dim %1614, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1617 = stablehlo.add %1615, %1616 : tensor<1x128x64x64xf32>
-    %1618 = stablehlo.convert %1617 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %1619 = stablehlo.broadcast_in_dim %1618, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %1620 = stablehlo.maximum %265, %1619 : tensor<1x128x64x64xbf16>
-    %1621 = stablehlo.minimum %265, %1619 : tensor<1x128x64x64xbf16>
-    %1622 = stablehlo.broadcast_in_dim %1621, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %1623 = stablehlo.multiply %1622, %270 : tensor<1x128x64x64xbf16>
-    %1624 = stablehlo.add %1620, %1623 : tensor<1x128x64x64xbf16>
-    %1625 = stablehlo.convolution(%1624, %arg72) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %1626 = stablehlo.convert %1625 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %1627 = stablehlo.broadcast_in_dim %1626, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1628 = stablehlo.broadcast_in_dim %arg351, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1629 = stablehlo.subtract %1627, %1628 : tensor<1x256x64x64xf32>
-    %1630 = stablehlo.broadcast_in_dim %1629, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1631 = stablehlo.broadcast_in_dim %arg352, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1632 = stablehlo.multiply %1630, %1631 : tensor<1x256x64x64xf32>
-    %1633 = stablehlo.convert %arg353 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1634 = stablehlo.broadcast_in_dim %1632, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1635 = stablehlo.broadcast_in_dim %1633, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1636 = stablehlo.multiply %1634, %1635 : tensor<1x256x64x64xf32>
-    %1637 = stablehlo.convert %arg354 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1638 = stablehlo.broadcast_in_dim %1636, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1639 = stablehlo.broadcast_in_dim %1637, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1640 = stablehlo.add %1638, %1639 : tensor<1x256x64x64xf32>
-    %1641 = stablehlo.convert %1640 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %1642 = stablehlo.broadcast_in_dim %1641, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %1643 = stablehlo.maximum %240, %1642 : tensor<1x256x64x64xbf16>
-    %1644 = stablehlo.minimum %240, %1642 : tensor<1x256x64x64xbf16>
-    %1645 = stablehlo.broadcast_in_dim %1644, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %1646 = stablehlo.multiply %1645, %245 : tensor<1x256x64x64xbf16>
-    %1647 = stablehlo.add %1643, %1646 : tensor<1x256x64x64xbf16>
-    %1648 = stablehlo.convolution(%1647, %arg73) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %1649 = stablehlo.convert %1648 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %1650 = stablehlo.broadcast_in_dim %1649, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1651 = stablehlo.broadcast_in_dim %arg355, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1652 = stablehlo.subtract %1650, %1651 : tensor<1x128x64x64xf32>
-    %1653 = stablehlo.broadcast_in_dim %1652, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1654 = stablehlo.broadcast_in_dim %arg356, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1655 = stablehlo.multiply %1653, %1654 : tensor<1x128x64x64xf32>
-    %1656 = stablehlo.convert %arg357 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1657 = stablehlo.broadcast_in_dim %1655, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1658 = stablehlo.broadcast_in_dim %1656, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1659 = stablehlo.multiply %1657, %1658 : tensor<1x128x64x64xf32>
-    %1660 = stablehlo.convert %arg358 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1661 = stablehlo.broadcast_in_dim %1659, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1662 = stablehlo.broadcast_in_dim %1660, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1663 = stablehlo.add %1661, %1662 : tensor<1x128x64x64xf32>
-    %1664 = stablehlo.convert %1663 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %1665 = stablehlo.broadcast_in_dim %1664, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %1666 = stablehlo.maximum %265, %1665 : tensor<1x128x64x64xbf16>
-    %1667 = stablehlo.minimum %265, %1665 : tensor<1x128x64x64xbf16>
-    %1668 = stablehlo.broadcast_in_dim %1667, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %1669 = stablehlo.multiply %1668, %270 : tensor<1x128x64x64xbf16>
-    %1670 = stablehlo.add %1666, %1669 : tensor<1x128x64x64xbf16>
-    %1671 = stablehlo.convolution(%1670, %arg74) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %1672 = stablehlo.convert %1671 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %1673 = stablehlo.broadcast_in_dim %1672, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1674 = stablehlo.broadcast_in_dim %arg359, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1675 = stablehlo.subtract %1673, %1674 : tensor<1x256x64x64xf32>
-    %1676 = stablehlo.broadcast_in_dim %1675, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1677 = stablehlo.broadcast_in_dim %arg360, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1678 = stablehlo.multiply %1676, %1677 : tensor<1x256x64x64xf32>
-    %1679 = stablehlo.convert %arg361 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1680 = stablehlo.broadcast_in_dim %1678, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1681 = stablehlo.broadcast_in_dim %1679, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1682 = stablehlo.multiply %1680, %1681 : tensor<1x256x64x64xf32>
-    %1683 = stablehlo.convert %arg362 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1684 = stablehlo.broadcast_in_dim %1682, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1685 = stablehlo.broadcast_in_dim %1683, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1686 = stablehlo.add %1684, %1685 : tensor<1x256x64x64xf32>
-    %1687 = stablehlo.convert %1686 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %1688 = stablehlo.broadcast_in_dim %1687, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %1689 = stablehlo.maximum %240, %1688 : tensor<1x256x64x64xbf16>
-    %1690 = stablehlo.minimum %240, %1688 : tensor<1x256x64x64xbf16>
-    %1691 = stablehlo.broadcast_in_dim %1690, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %1692 = stablehlo.multiply %1691, %245 : tensor<1x256x64x64xbf16>
-    %1693 = stablehlo.add %1689, %1692 : tensor<1x256x64x64xbf16>
-    %1694 = stablehlo.convolution(%1693, %arg75) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<128x256x1x1xbf16>) -> tensor<1x128x64x64xbf16>
-    %1695 = stablehlo.convert %1694 : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xf32>
-    %1696 = stablehlo.broadcast_in_dim %1695, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1697 = stablehlo.broadcast_in_dim %arg363, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1698 = stablehlo.subtract %1696, %1697 : tensor<1x128x64x64xf32>
-    %1699 = stablehlo.broadcast_in_dim %1698, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1700 = stablehlo.broadcast_in_dim %arg364, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1701 = stablehlo.multiply %1699, %1700 : tensor<1x128x64x64xf32>
-    %1702 = stablehlo.convert %arg365 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1703 = stablehlo.broadcast_in_dim %1701, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1704 = stablehlo.broadcast_in_dim %1702, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1705 = stablehlo.multiply %1703, %1704 : tensor<1x128x64x64xf32>
-    %1706 = stablehlo.convert %arg366 : (tensor<128x1x1xbf16>) -> tensor<128x1x1xf32>
-    %1707 = stablehlo.broadcast_in_dim %1705, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xf32>
-    %1708 = stablehlo.broadcast_in_dim %1706, dims = [1, 2, 3] : (tensor<128x1x1xf32>) -> tensor<1x128x64x64xf32>
-    %1709 = stablehlo.add %1707, %1708 : tensor<1x128x64x64xf32>
-    %1710 = stablehlo.convert %1709 : (tensor<1x128x64x64xf32>) -> tensor<1x128x64x64xbf16>
-    %1711 = stablehlo.broadcast_in_dim %1710, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %1712 = stablehlo.maximum %265, %1711 : tensor<1x128x64x64xbf16>
-    %1713 = stablehlo.minimum %265, %1711 : tensor<1x128x64x64xbf16>
-    %1714 = stablehlo.broadcast_in_dim %1713, dims = [0, 1, 2, 3] : (tensor<1x128x64x64xbf16>) -> tensor<1x128x64x64xbf16>
-    %1715 = stablehlo.multiply %1714, %270 : tensor<1x128x64x64xbf16>
-    %1716 = stablehlo.add %1712, %1715 : tensor<1x128x64x64xbf16>
-    %1717 = stablehlo.convolution(%1716, %arg76) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x128x64x64xbf16>, tensor<256x128x3x3xbf16>) -> tensor<1x256x64x64xbf16>
-    %1718 = stablehlo.convert %1717 : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xf32>
-    %1719 = stablehlo.broadcast_in_dim %1718, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1720 = stablehlo.broadcast_in_dim %arg367, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1721 = stablehlo.subtract %1719, %1720 : tensor<1x256x64x64xf32>
-    %1722 = stablehlo.broadcast_in_dim %1721, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1723 = stablehlo.broadcast_in_dim %arg368, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1724 = stablehlo.multiply %1722, %1723 : tensor<1x256x64x64xf32>
-    %1725 = stablehlo.convert %arg369 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1726 = stablehlo.broadcast_in_dim %1724, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1727 = stablehlo.broadcast_in_dim %1725, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1728 = stablehlo.multiply %1726, %1727 : tensor<1x256x64x64xf32>
-    %1729 = stablehlo.convert %arg370 : (tensor<256x1x1xbf16>) -> tensor<256x1x1xf32>
-    %1730 = stablehlo.broadcast_in_dim %1728, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xf32>
-    %1731 = stablehlo.broadcast_in_dim %1729, dims = [1, 2, 3] : (tensor<256x1x1xf32>) -> tensor<1x256x64x64xf32>
-    %1732 = stablehlo.add %1730, %1731 : tensor<1x256x64x64xf32>
-    %1733 = stablehlo.convert %1732 : (tensor<1x256x64x64xf32>) -> tensor<1x256x64x64xbf16>
-    %1734 = stablehlo.broadcast_in_dim %1733, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %1735 = stablehlo.maximum %240, %1734 : tensor<1x256x64x64xbf16>
-    %1736 = stablehlo.minimum %240, %1734 : tensor<1x256x64x64xbf16>
-    %1737 = stablehlo.broadcast_in_dim %1736, dims = [0, 1, 2, 3] : (tensor<1x256x64x64xbf16>) -> tensor<1x256x64x64xbf16>
-    %1738 = stablehlo.multiply %1737, %245 : tensor<1x256x64x64xbf16>
-    %1739 = stablehlo.add %1735, %1738 : tensor<1x256x64x64xbf16>
-    %1740 = stablehlo.convolution(%1739, %arg77) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x256x64x64xbf16>, tensor<255x256x1x1xbf16>) -> tensor<1x255x64x64xbf16>
-    %1741 = stablehlo.reshape %arg78 : (tensor<255xbf16>) -> tensor<255x1x1xbf16>
-    %1742 = stablehlo.broadcast_in_dim %1740, dims = [0, 1, 2, 3] : (tensor<1x255x64x64xbf16>) -> tensor<1x255x64x64xbf16>
-    %1743 = stablehlo.broadcast_in_dim %1741, dims = [1, 2, 3] : (tensor<255x1x1xbf16>) -> tensor<1x255x64x64xbf16>
-    %1744 = stablehlo.add %1742, %1743 : tensor<1x255x64x64xbf16>
-    return %1386, %1565, %1744 : tensor<1x255x16x16xbf16>, tensor<1x255x32x32xbf16>, tensor<1x255x64x64xbf16>
-  }
-}
diff --git a/mlir_tests/alibaba-damomgp-str-base.mlir b/mlir_tests/alibaba-damomgp-str-base.mlir
deleted file mode 100644
index 7c5b7ce1..00000000
--- a/mlir_tests/alibaba-damomgp-str-base.mlir
+++ /dev/null
@@ -1,2556 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x32x128xbf16>, %arg1: tensor<768x3x4x4xbf16>, %arg2: tensor<768xbf16>, %arg3: tensor<1x257x768xbf16>, %arg4: tensor<768xbf16>, %arg5: tensor<768xbf16>, %arg6: tensor<768xbf16>, %arg7: tensor<768xbf16>, %arg8: tensor<768xbf16>, %arg9: tensor<768xbf16>, %arg10: tensor<768xbf16>, %arg11: tensor<768xbf16>, %arg12: tensor<768xbf16>, %arg13: tensor<768xbf16>, %arg14: tensor<768xbf16>, %arg15: tensor<768xbf16>, %arg16: tensor<768xbf16>, %arg17: tensor<768xbf16>, %arg18: tensor<768xbf16>, %arg19: tensor<768xbf16>, %arg20: tensor<768xbf16>, %arg21: tensor<768xbf16>, %arg22: tensor<768xbf16>, %arg23: tensor<768xbf16>, %arg24: tensor<768xbf16>, %arg25: tensor<768xbf16>, %arg26: tensor<768xbf16>, %arg27: tensor<768xbf16>, %arg28: tensor<768xbf16>, %arg29: tensor<768xbf16>, %arg30: tensor<768xbf16>, %arg31: tensor<768xbf16>, %arg32: tensor<768xbf16>, %arg33: tensor<768xbf16>, %arg34: tensor<768xbf16>, %arg35: tensor<768xbf16>, %arg36: tensor<768xbf16>, %arg37: tensor<768xbf16>, %arg38: tensor<768xbf16>, %arg39: tensor<768xbf16>, %arg40: tensor<768xbf16>, %arg41: tensor<768xbf16>, %arg42: tensor<768xbf16>, %arg43: tensor<768xbf16>, %arg44: tensor<768xbf16>, %arg45: tensor<768xbf16>, %arg46: tensor<768xbf16>, %arg47: tensor<768xbf16>, %arg48: tensor<768xbf16>, %arg49: tensor<768xbf16>, %arg50: tensor<768xbf16>, %arg51: tensor<768xbf16>, %arg52: tensor<768xbf16>, %arg53: tensor<768xbf16>, %arg54: tensor<768x96x1x1xbf16>, %arg55: tensor<27x768x1x1xbf16>, %arg56: tensor<768x96x1x1xbf16>, %arg57: tensor<768xbf16>, %arg58: tensor<768xbf16>, %arg59: tensor<768xbf16>, %arg60: tensor<768xbf16>, %arg61: tensor<768x96x1x1xbf16>, %arg62: tensor<27x768x1x1xbf16>, %arg63: tensor<768x96x1x1xbf16>, %arg64: tensor<768xbf16>, %arg65: tensor<768xbf16>, %arg66: tensor<768xbf16>, %arg67: tensor<768xbf16>, %arg68: tensor<768x96x1x1xbf16>, %arg69: tensor<27x768x1x1xbf16>, %arg70: tensor<768x96x1x1xbf16>, %arg71: tensor<768xbf16>, %arg72: tensor<768xbf16>, %arg73: tensor<1x1x768xbf16>, %arg74: tensor<768x2304xf32>, %arg75: tensor<2304xf32>, %arg76: tensor<768x768xf32>, %arg77: tensor<768xf32>, %arg78: tensor<768x3072xf32>, %arg79: tensor<3072xf32>, %arg80: tensor<3072x768xf32>, %arg81: tensor<768xf32>, %arg82: tensor<768x2304xf32>, %arg83: tensor<2304xf32>, %arg84: tensor<768x768xf32>, %arg85: tensor<768xf32>, %arg86: tensor<768x3072xf32>, %arg87: tensor<3072xf32>, %arg88: tensor<3072x768xf32>, %arg89: tensor<768xf32>, %arg90: tensor<768x2304xf32>, %arg91: tensor<2304xf32>, %arg92: tensor<768x768xf32>, %arg93: tensor<768xf32>, %arg94: tensor<768x3072xf32>, %arg95: tensor<3072xf32>, %arg96: tensor<3072x768xf32>, %arg97: tensor<768xf32>, %arg98: tensor<768x2304xf32>, %arg99: tensor<2304xf32>, %arg100: tensor<768x768xf32>, %arg101: tensor<768xf32>, %arg102: tensor<768x3072xf32>, %arg103: tensor<3072xf32>, %arg104: tensor<3072x768xf32>, %arg105: tensor<768xf32>, %arg106: tensor<768x2304xf32>, %arg107: tensor<2304xf32>, %arg108: tensor<768x768xf32>, %arg109: tensor<768xf32>, %arg110: tensor<768x3072xf32>, %arg111: tensor<3072xf32>, %arg112: tensor<3072x768xf32>, %arg113: tensor<768xf32>, %arg114: tensor<768x2304xf32>, %arg115: tensor<2304xf32>, %arg116: tensor<768x768xf32>, %arg117: tensor<768xf32>, %arg118: tensor<768x3072xf32>, %arg119: tensor<3072xf32>, %arg120: tensor<3072x768xf32>, %arg121: tensor<768xf32>, %arg122: tensor<768x2304xf32>, %arg123: tensor<2304xf32>, %arg124: tensor<768x768xf32>, %arg125: tensor<768xf32>, %arg126: tensor<768x3072xf32>, %arg127: tensor<3072xf32>, %arg128: tensor<3072x768xf32>, %arg129: tensor<768xf32>, %arg130: tensor<768x2304xf32>, %arg131: tensor<2304xf32>, %arg132: tensor<768x768xf32>, %arg133: tensor<768xf32>, %arg134: tensor<768x3072xf32>, %arg135: tensor<3072xf32>, %arg136: tensor<3072x768xf32>, %arg137: tensor<768xf32>, %arg138: tensor<768x2304xf32>, %arg139: tensor<2304xf32>, %arg140: tensor<768x768xf32>, %arg141: tensor<768xf32>, %arg142: tensor<768x3072xf32>, %arg143: tensor<3072xf32>, %arg144: tensor<3072x768xf32>, %arg145: tensor<768xf32>, %arg146: tensor<768x2304xf32>, %arg147: tensor<2304xf32>, %arg148: tensor<768x768xf32>, %arg149: tensor<768xf32>, %arg150: tensor<768x3072xf32>, %arg151: tensor<3072xf32>, %arg152: tensor<3072x768xf32>, %arg153: tensor<768xf32>, %arg154: tensor<768x2304xf32>, %arg155: tensor<2304xf32>, %arg156: tensor<768x768xf32>, %arg157: tensor<768xf32>, %arg158: tensor<768x3072xf32>, %arg159: tensor<3072xf32>, %arg160: tensor<3072x768xf32>, %arg161: tensor<768xf32>, %arg162: tensor<768x2304xf32>, %arg163: tensor<2304xf32>, %arg164: tensor<768x768xf32>, %arg165: tensor<768xf32>, %arg166: tensor<768x3072xf32>, %arg167: tensor<3072xf32>, %arg168: tensor<3072x768xf32>, %arg169: tensor<768xf32>, %arg170: tensor<768x38xf32>, %arg171: tensor<38xf32>, %arg172: tensor<768x50257xf32>, %arg173: tensor<50257xf32>, %arg174: tensor<768x30522xf32>, %arg175: tensor<30522xf32>) -> (tensor<1x27x38xbf16>, tensor<1x27x50257xbf16>, tensor<1x27x30522xbf16>) {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_2 = stablehlo.constant dense<1.000000e+00> : tensor<1x257x3072xbf16>
-    %cst_3 = stablehlo.constant dense<2.000000e+00> : tensor<1x257x3072xbf16>
-    %cst_4 = stablehlo.constant dense<5.000000e-01> : tensor<1x257x3072xbf16>
-    %cst_5 = stablehlo.constant dense<-4.000000e+00> : tensor<1x257x3072xf32>
-    %cst_6 = stablehlo.constant dense<4.000000e+00> : tensor<1x257x3072xf32>
-    %cst_7 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x257x3072xf32>
-    %cst_8 = stablehlo.constant dense<2.77068146E-8> : tensor<1x257x3072xf32>
-    %cst_9 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x257x3072xf32>
-    %cst_10 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x257x3072xf32>
-    %cst_11 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x257x3072xf32>
-    %cst_12 = stablehlo.constant dense<-2.954600e-03> : tensor<1x257x3072xf32>
-    %cst_13 = stablehlo.constant dense<-0.0160960332> : tensor<1x257x3072xf32>
-    %cst_14 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x257x3072xf32>
-    %cst_15 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x257x3072xf32>
-    %cst_16 = stablehlo.constant dense<-0.00168282702> : tensor<1x257x3072xf32>
-    %cst_17 = stablehlo.constant dense<-0.00737332925> : tensor<1x257x3072xf32>
-    %cst_18 = stablehlo.constant dense<-0.0142647391> : tensor<1x257x3072xf32>
-    %cst_19 = stablehlo.constant dense<-1.000000e+00> : tensor<1x257x3072xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x257x3072xf32>
-    %cst_21 = arith.constant dense<768> : tensor<1xi64>
-    %cst_22 = arith.constant dense<1.000000e-05> : tensor<1xf64>
-    %cst_23 = arith.constant dense<1> : tensor<1xi64>
-    %cst_24 = arith.constant dense<1.250000e-01> : tensor<1xf64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [4, 4], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x32x128xbf16>, tensor<768x3x4x4xbf16>) -> tensor<1x768x8x32xbf16>
-    %1 = stablehlo.reshape %arg2 : (tensor<768xbf16>) -> tensor<768x1x1xbf16>
-    %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2, 3] : (tensor<1x768x8x32xbf16>) -> tensor<1x768x8x32xbf16>
-    %3 = stablehlo.broadcast_in_dim %1, dims = [1, 2, 3] : (tensor<768x1x1xbf16>) -> tensor<1x768x8x32xbf16>
-    %4 = stablehlo.add %2, %3 : tensor<1x768x8x32xbf16>
-    %5 = stablehlo.reshape %4 : (tensor<1x768x8x32xbf16>) -> tensor<1x768x256xbf16>
-    %6 = stablehlo.transpose %5, dims = [0, 2, 1] : (tensor<1x768x256xbf16>) -> tensor<1x256x768xbf16>
-    %7 = stablehlo.concatenate %arg73, %6, dim = 1 : (tensor<1x1x768xbf16>, tensor<1x256x768xbf16>) -> tensor<1x257x768xbf16>
-    %8 = stablehlo.add %7, %arg3 : tensor<1x257x768xbf16>
-    %9 = stablehlo.convert %8 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %10 = stablehlo.convert %9 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %11 = stablehlo.reduce(%10 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %12 = stablehlo.reshape %11 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %13 = stablehlo.convert %cst_21 : (tensor<1xi64>) -> tensor<1xf64>
-    %14 = stablehlo.reshape %13 : (tensor<1xf64>) -> tensor<f64>
-    %15 = stablehlo.broadcast_in_dim %12, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %16 = stablehlo.broadcast_in_dim %14, dims = [] : (tensor<f64>) -> tensor<1x257x1xf64>
-    %17 = stablehlo.divide %15, %16 : tensor<1x257x1xf64>
-    %18 = stablehlo.broadcast_in_dim %10, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %19 = stablehlo.broadcast_in_dim %17, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %20 = stablehlo.subtract %18, %19 : tensor<1x257x768xf64>
-    %21 = stablehlo.multiply %20, %20 : tensor<1x257x768xf64>
-    %22 = stablehlo.reduce(%21 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %23 = stablehlo.reshape %22 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %24 = stablehlo.broadcast_in_dim %23, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %25 = stablehlo.divide %24, %16 : tensor<1x257x1xf64>
-    %26 = stablehlo.convert %25 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %27 = stablehlo.reduce(%9 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %28 = stablehlo.reshape %27 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %29 = stablehlo.convert %cst_21 : (tensor<1xi64>) -> tensor<1xf32>
-    %30 = stablehlo.reshape %29 : (tensor<1xf32>) -> tensor<f32>
-    %31 = stablehlo.broadcast_in_dim %28, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %32 = stablehlo.broadcast_in_dim %30, dims = [] : (tensor<f32>) -> tensor<1x257x1xf32>
-    %33 = stablehlo.divide %31, %32 : tensor<1x257x1xf32>
-    %34 = stablehlo.convert %cst_22 : (tensor<1xf64>) -> tensor<1xf32>
-    %35 = stablehlo.reshape %34 : (tensor<1xf32>) -> tensor<f32>
-    %36 = stablehlo.broadcast_in_dim %26, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %37 = stablehlo.broadcast_in_dim %35, dims = [] : (tensor<f32>) -> tensor<1x257x1xf32>
-    %38 = stablehlo.add %36, %37 : tensor<1x257x1xf32>
-    %39 = stablehlo.rsqrt %38 : tensor<1x257x1xf32>
-    %40 = stablehlo.broadcast_in_dim %9, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %41 = stablehlo.broadcast_in_dim %33, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %42 = stablehlo.subtract %40, %41 : tensor<1x257x768xf32>
-    %43 = stablehlo.broadcast_in_dim %42, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %44 = stablehlo.broadcast_in_dim %39, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %45 = stablehlo.multiply %43, %44 : tensor<1x257x768xf32>
-    %46 = stablehlo.convert %arg4 : (tensor<768xbf16>) -> tensor<768xf32>
-    %47 = stablehlo.broadcast_in_dim %45, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %48 = stablehlo.broadcast_in_dim %46, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %49 = stablehlo.multiply %47, %48 : tensor<1x257x768xf32>
-    %50 = stablehlo.convert %arg5 : (tensor<768xbf16>) -> tensor<768xf32>
-    %51 = stablehlo.broadcast_in_dim %49, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %52 = stablehlo.broadcast_in_dim %50, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %53 = stablehlo.add %51, %52 : tensor<1x257x768xf32>
-    %54 = stablehlo.convert %53 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %55 = stablehlo.reshape %54 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %56 = stablehlo.convert %55 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %57 = stablehlo.dot_general %56, %arg74, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %58 = stablehlo.convert %cst_23 : (tensor<1xi64>) -> tensor<1xf32>
-    %59 = stablehlo.reshape %58 : (tensor<1xf32>) -> tensor<f32>
-    %60 = stablehlo.broadcast_in_dim %57, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %61 = stablehlo.broadcast_in_dim %59, dims = [] : (tensor<f32>) -> tensor<257x2304xf32>
-    %62 = stablehlo.multiply %60, %61 : tensor<257x2304xf32>
-    %63 = stablehlo.broadcast_in_dim %62, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %64 = stablehlo.broadcast_in_dim %arg75, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %65 = stablehlo.add %63, %64 : tensor<257x2304xf32>
-    %66 = stablehlo.convert %65 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %67 = stablehlo.reshape %66 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %68 = stablehlo.reshape %67 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %69 = stablehlo.transpose %68, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %70 = stablehlo.slice %69 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %71 = stablehlo.reshape %70 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %72 = stablehlo.slice %69 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %73 = stablehlo.reshape %72 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %74 = stablehlo.slice %69 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %75 = stablehlo.reshape %74 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %76 = stablehlo.transpose %73, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %77 = stablehlo.reshape %71 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %78 = stablehlo.reshape %76 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %79 = stablehlo.broadcast_in_dim %78, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %80 = stablehlo.dot_general %77, %79, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %81 = stablehlo.reshape %80 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %82 = stablehlo.convert %cst_24 : (tensor<1xf64>) -> tensor<1xbf16>
-    %83 = stablehlo.reshape %82 : (tensor<1xbf16>) -> tensor<bf16>
-    %84 = stablehlo.broadcast_in_dim %81, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %85 = stablehlo.broadcast_in_dim %83, dims = [] : (tensor<bf16>) -> tensor<1x12x257x257xbf16>
-    %86 = stablehlo.multiply %84, %85 : tensor<1x12x257x257xbf16>
-    %87 = stablehlo.convert %86 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %88 = stablehlo.reduce(%87 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %89 = stablehlo.reshape %88 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %90 = stablehlo.broadcast_in_dim %87, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %91 = stablehlo.broadcast_in_dim %89, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %92 = stablehlo.subtract %90, %91 : tensor<1x12x257x257xf32>
-    %93 = stablehlo.exponential %92 : tensor<1x12x257x257xf32>
-    %94 = stablehlo.reduce(%93 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %95 = stablehlo.reshape %94 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %96 = stablehlo.broadcast_in_dim %93, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %97 = stablehlo.broadcast_in_dim %95, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %98 = stablehlo.divide %96, %97 : tensor<1x12x257x257xf32>
-    %99 = stablehlo.convert %98 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %100 = stablehlo.reshape %99 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %101 = stablehlo.reshape %75 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %102 = stablehlo.broadcast_in_dim %101, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %103 = stablehlo.dot_general %100, %102, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %104 = stablehlo.reshape %103 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %105 = stablehlo.transpose %104, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %106 = stablehlo.reshape %105 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %107 = stablehlo.reshape %106 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %108 = stablehlo.convert %107 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %109 = stablehlo.dot_general %108, %arg76, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %110 = stablehlo.broadcast_in_dim %109, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %111 = stablehlo.broadcast_in_dim %59, dims = [] : (tensor<f32>) -> tensor<257x768xf32>
-    %112 = stablehlo.multiply %110, %111 : tensor<257x768xf32>
-    %113 = stablehlo.broadcast_in_dim %112, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %114 = stablehlo.broadcast_in_dim %arg77, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %115 = stablehlo.add %113, %114 : tensor<257x768xf32>
-    %116 = stablehlo.convert %115 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %117 = stablehlo.reshape %116 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %118 = stablehlo.add %117, %8 : tensor<1x257x768xbf16>
-    %119 = stablehlo.convert %118 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %120 = stablehlo.convert %119 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %121 = stablehlo.reduce(%120 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %122 = stablehlo.reshape %121 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %123 = stablehlo.broadcast_in_dim %122, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %124 = stablehlo.divide %123, %16 : tensor<1x257x1xf64>
-    %125 = stablehlo.broadcast_in_dim %120, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %126 = stablehlo.broadcast_in_dim %124, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %127 = stablehlo.subtract %125, %126 : tensor<1x257x768xf64>
-    %128 = stablehlo.multiply %127, %127 : tensor<1x257x768xf64>
-    %129 = stablehlo.reduce(%128 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %130 = stablehlo.reshape %129 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %131 = stablehlo.broadcast_in_dim %130, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %132 = stablehlo.divide %131, %16 : tensor<1x257x1xf64>
-    %133 = stablehlo.convert %132 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %134 = stablehlo.reduce(%119 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %135 = stablehlo.reshape %134 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %136 = stablehlo.broadcast_in_dim %135, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %137 = stablehlo.divide %136, %32 : tensor<1x257x1xf32>
-    %138 = stablehlo.broadcast_in_dim %133, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %139 = stablehlo.add %138, %37 : tensor<1x257x1xf32>
-    %140 = stablehlo.rsqrt %139 : tensor<1x257x1xf32>
-    %141 = stablehlo.broadcast_in_dim %119, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %142 = stablehlo.broadcast_in_dim %137, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %143 = stablehlo.subtract %141, %142 : tensor<1x257x768xf32>
-    %144 = stablehlo.broadcast_in_dim %143, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %145 = stablehlo.broadcast_in_dim %140, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %146 = stablehlo.multiply %144, %145 : tensor<1x257x768xf32>
-    %147 = stablehlo.convert %arg6 : (tensor<768xbf16>) -> tensor<768xf32>
-    %148 = stablehlo.broadcast_in_dim %146, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %149 = stablehlo.broadcast_in_dim %147, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %150 = stablehlo.multiply %148, %149 : tensor<1x257x768xf32>
-    %151 = stablehlo.convert %arg7 : (tensor<768xbf16>) -> tensor<768xf32>
-    %152 = stablehlo.broadcast_in_dim %150, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %153 = stablehlo.broadcast_in_dim %151, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %154 = stablehlo.add %152, %153 : tensor<1x257x768xf32>
-    %155 = stablehlo.convert %154 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %156 = stablehlo.reshape %155 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %157 = stablehlo.convert %156 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %158 = stablehlo.dot_general %157, %arg78, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %159 = stablehlo.broadcast_in_dim %158, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %160 = stablehlo.broadcast_in_dim %59, dims = [] : (tensor<f32>) -> tensor<257x3072xf32>
-    %161 = stablehlo.multiply %159, %160 : tensor<257x3072xf32>
-    %162 = stablehlo.broadcast_in_dim %161, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %163 = stablehlo.broadcast_in_dim %arg79, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %164 = stablehlo.add %162, %163 : tensor<257x3072xf32>
-    %165 = stablehlo.convert %164 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %166 = stablehlo.reshape %165 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %167 = stablehlo.multiply %166, %cst_4 : tensor<1x257x3072xbf16>
-    %168 = stablehlo.rsqrt %cst_3 : tensor<1x257x3072xbf16>
-    %169 = stablehlo.multiply %166, %168 : tensor<1x257x3072xbf16>
-    %170 = stablehlo.convert %169 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %171 = stablehlo.clamp %cst_5, %170, %cst_6 : tensor<1x257x3072xf32>
-    %172 = stablehlo.multiply %171, %171 : tensor<1x257x3072xf32>
-    %173 = stablehlo.multiply %cst_7, %172 : tensor<1x257x3072xf32>
-    %174 = stablehlo.add %173, %cst_8 : tensor<1x257x3072xf32>
-    %175 = stablehlo.multiply %174, %172 : tensor<1x257x3072xf32>
-    %176 = stablehlo.add %175, %cst_9 : tensor<1x257x3072xf32>
-    %177 = stablehlo.multiply %176, %172 : tensor<1x257x3072xf32>
-    %178 = stablehlo.add %177, %cst_10 : tensor<1x257x3072xf32>
-    %179 = stablehlo.multiply %178, %172 : tensor<1x257x3072xf32>
-    %180 = stablehlo.add %179, %cst_11 : tensor<1x257x3072xf32>
-    %181 = stablehlo.multiply %180, %172 : tensor<1x257x3072xf32>
-    %182 = stablehlo.add %181, %cst_12 : tensor<1x257x3072xf32>
-    %183 = stablehlo.multiply %182, %172 : tensor<1x257x3072xf32>
-    %184 = stablehlo.add %183, %cst_13 : tensor<1x257x3072xf32>
-    %185 = stablehlo.multiply %cst_14, %172 : tensor<1x257x3072xf32>
-    %186 = stablehlo.add %185, %cst_15 : tensor<1x257x3072xf32>
-    %187 = stablehlo.multiply %186, %172 : tensor<1x257x3072xf32>
-    %188 = stablehlo.add %187, %cst_16 : tensor<1x257x3072xf32>
-    %189 = stablehlo.multiply %188, %172 : tensor<1x257x3072xf32>
-    %190 = stablehlo.add %189, %cst_17 : tensor<1x257x3072xf32>
-    %191 = stablehlo.multiply %190, %172 : tensor<1x257x3072xf32>
-    %192 = stablehlo.add %191, %cst_18 : tensor<1x257x3072xf32>
-    %193 = stablehlo.multiply %171, %184 : tensor<1x257x3072xf32>
-    %194 = stablehlo.divide %193, %192 : tensor<1x257x3072xf32>
-    %195 = stablehlo.clamp %cst_19, %194, %cst_20 : tensor<1x257x3072xf32>
-    %196 = stablehlo.convert %195 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %197 = stablehlo.add %196, %cst_2 : tensor<1x257x3072xbf16>
-    %198 = stablehlo.multiply %197, %167 : tensor<1x257x3072xbf16>
-    %199 = stablehlo.reshape %198 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %200 = stablehlo.convert %199 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %201 = stablehlo.dot_general %200, %arg80, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %202 = stablehlo.broadcast_in_dim %201, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %203 = stablehlo.multiply %202, %111 : tensor<257x768xf32>
-    %204 = stablehlo.broadcast_in_dim %203, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %205 = stablehlo.broadcast_in_dim %arg81, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %206 = stablehlo.add %204, %205 : tensor<257x768xf32>
-    %207 = stablehlo.convert %206 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %208 = stablehlo.reshape %207 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %209 = stablehlo.add %118, %208 : tensor<1x257x768xbf16>
-    %210 = stablehlo.convert %209 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %211 = stablehlo.convert %210 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %212 = stablehlo.reduce(%211 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %213 = stablehlo.reshape %212 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %214 = stablehlo.broadcast_in_dim %213, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %215 = stablehlo.divide %214, %16 : tensor<1x257x1xf64>
-    %216 = stablehlo.broadcast_in_dim %211, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %217 = stablehlo.broadcast_in_dim %215, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %218 = stablehlo.subtract %216, %217 : tensor<1x257x768xf64>
-    %219 = stablehlo.multiply %218, %218 : tensor<1x257x768xf64>
-    %220 = stablehlo.reduce(%219 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %221 = stablehlo.reshape %220 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %222 = stablehlo.broadcast_in_dim %221, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %223 = stablehlo.divide %222, %16 : tensor<1x257x1xf64>
-    %224 = stablehlo.convert %223 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %225 = stablehlo.reduce(%210 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %226 = stablehlo.reshape %225 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %227 = stablehlo.broadcast_in_dim %226, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %228 = stablehlo.divide %227, %32 : tensor<1x257x1xf32>
-    %229 = stablehlo.broadcast_in_dim %224, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %230 = stablehlo.add %229, %37 : tensor<1x257x1xf32>
-    %231 = stablehlo.rsqrt %230 : tensor<1x257x1xf32>
-    %232 = stablehlo.broadcast_in_dim %210, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %233 = stablehlo.broadcast_in_dim %228, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %234 = stablehlo.subtract %232, %233 : tensor<1x257x768xf32>
-    %235 = stablehlo.broadcast_in_dim %234, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %236 = stablehlo.broadcast_in_dim %231, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %237 = stablehlo.multiply %235, %236 : tensor<1x257x768xf32>
-    %238 = stablehlo.convert %arg8 : (tensor<768xbf16>) -> tensor<768xf32>
-    %239 = stablehlo.broadcast_in_dim %237, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %240 = stablehlo.broadcast_in_dim %238, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %241 = stablehlo.multiply %239, %240 : tensor<1x257x768xf32>
-    %242 = stablehlo.convert %arg9 : (tensor<768xbf16>) -> tensor<768xf32>
-    %243 = stablehlo.broadcast_in_dim %241, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %244 = stablehlo.broadcast_in_dim %242, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %245 = stablehlo.add %243, %244 : tensor<1x257x768xf32>
-    %246 = stablehlo.convert %245 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %247 = stablehlo.reshape %246 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %248 = stablehlo.convert %247 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %249 = stablehlo.dot_general %248, %arg82, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %250 = stablehlo.broadcast_in_dim %249, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %251 = stablehlo.multiply %250, %61 : tensor<257x2304xf32>
-    %252 = stablehlo.broadcast_in_dim %251, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %253 = stablehlo.broadcast_in_dim %arg83, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %254 = stablehlo.add %252, %253 : tensor<257x2304xf32>
-    %255 = stablehlo.convert %254 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %256 = stablehlo.reshape %255 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %257 = stablehlo.reshape %256 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %258 = stablehlo.transpose %257, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %259 = stablehlo.slice %258 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %260 = stablehlo.reshape %259 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %261 = stablehlo.slice %258 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %262 = stablehlo.reshape %261 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %263 = stablehlo.slice %258 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %264 = stablehlo.reshape %263 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %265 = stablehlo.transpose %262, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %266 = stablehlo.reshape %260 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %267 = stablehlo.reshape %265 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %268 = stablehlo.broadcast_in_dim %267, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %269 = stablehlo.dot_general %266, %268, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %270 = stablehlo.reshape %269 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %271 = stablehlo.broadcast_in_dim %270, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %272 = stablehlo.multiply %271, %85 : tensor<1x12x257x257xbf16>
-    %273 = stablehlo.convert %272 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %274 = stablehlo.reduce(%273 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %275 = stablehlo.reshape %274 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %276 = stablehlo.broadcast_in_dim %273, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %277 = stablehlo.broadcast_in_dim %275, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %278 = stablehlo.subtract %276, %277 : tensor<1x12x257x257xf32>
-    %279 = stablehlo.exponential %278 : tensor<1x12x257x257xf32>
-    %280 = stablehlo.reduce(%279 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %281 = stablehlo.reshape %280 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %282 = stablehlo.broadcast_in_dim %279, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %283 = stablehlo.broadcast_in_dim %281, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %284 = stablehlo.divide %282, %283 : tensor<1x12x257x257xf32>
-    %285 = stablehlo.convert %284 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %286 = stablehlo.reshape %285 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %287 = stablehlo.reshape %264 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %288 = stablehlo.broadcast_in_dim %287, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %289 = stablehlo.dot_general %286, %288, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %290 = stablehlo.reshape %289 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %291 = stablehlo.transpose %290, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %292 = stablehlo.reshape %291 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %293 = stablehlo.reshape %292 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %294 = stablehlo.convert %293 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %295 = stablehlo.dot_general %294, %arg84, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %296 = stablehlo.broadcast_in_dim %295, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %297 = stablehlo.multiply %296, %111 : tensor<257x768xf32>
-    %298 = stablehlo.broadcast_in_dim %297, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %299 = stablehlo.broadcast_in_dim %arg85, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %300 = stablehlo.add %298, %299 : tensor<257x768xf32>
-    %301 = stablehlo.convert %300 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %302 = stablehlo.reshape %301 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %303 = stablehlo.add %302, %209 : tensor<1x257x768xbf16>
-    %304 = stablehlo.convert %303 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %305 = stablehlo.convert %304 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %306 = stablehlo.reduce(%305 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %307 = stablehlo.reshape %306 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %308 = stablehlo.broadcast_in_dim %307, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %309 = stablehlo.divide %308, %16 : tensor<1x257x1xf64>
-    %310 = stablehlo.broadcast_in_dim %305, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %311 = stablehlo.broadcast_in_dim %309, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %312 = stablehlo.subtract %310, %311 : tensor<1x257x768xf64>
-    %313 = stablehlo.multiply %312, %312 : tensor<1x257x768xf64>
-    %314 = stablehlo.reduce(%313 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %315 = stablehlo.reshape %314 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %316 = stablehlo.broadcast_in_dim %315, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %317 = stablehlo.divide %316, %16 : tensor<1x257x1xf64>
-    %318 = stablehlo.convert %317 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %319 = stablehlo.reduce(%304 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %320 = stablehlo.reshape %319 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %321 = stablehlo.broadcast_in_dim %320, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %322 = stablehlo.divide %321, %32 : tensor<1x257x1xf32>
-    %323 = stablehlo.broadcast_in_dim %318, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %324 = stablehlo.add %323, %37 : tensor<1x257x1xf32>
-    %325 = stablehlo.rsqrt %324 : tensor<1x257x1xf32>
-    %326 = stablehlo.broadcast_in_dim %304, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %327 = stablehlo.broadcast_in_dim %322, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %328 = stablehlo.subtract %326, %327 : tensor<1x257x768xf32>
-    %329 = stablehlo.broadcast_in_dim %328, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %330 = stablehlo.broadcast_in_dim %325, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %331 = stablehlo.multiply %329, %330 : tensor<1x257x768xf32>
-    %332 = stablehlo.convert %arg10 : (tensor<768xbf16>) -> tensor<768xf32>
-    %333 = stablehlo.broadcast_in_dim %331, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %334 = stablehlo.broadcast_in_dim %332, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %335 = stablehlo.multiply %333, %334 : tensor<1x257x768xf32>
-    %336 = stablehlo.convert %arg11 : (tensor<768xbf16>) -> tensor<768xf32>
-    %337 = stablehlo.broadcast_in_dim %335, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %338 = stablehlo.broadcast_in_dim %336, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %339 = stablehlo.add %337, %338 : tensor<1x257x768xf32>
-    %340 = stablehlo.convert %339 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %341 = stablehlo.reshape %340 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %342 = stablehlo.convert %341 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %343 = stablehlo.dot_general %342, %arg86, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %344 = stablehlo.broadcast_in_dim %343, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %345 = stablehlo.multiply %344, %160 : tensor<257x3072xf32>
-    %346 = stablehlo.broadcast_in_dim %345, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %347 = stablehlo.broadcast_in_dim %arg87, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %348 = stablehlo.add %346, %347 : tensor<257x3072xf32>
-    %349 = stablehlo.convert %348 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %350 = stablehlo.reshape %349 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %351 = stablehlo.multiply %350, %cst_4 : tensor<1x257x3072xbf16>
-    %352 = stablehlo.multiply %350, %168 : tensor<1x257x3072xbf16>
-    %353 = stablehlo.convert %352 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %354 = stablehlo.clamp %cst_5, %353, %cst_6 : tensor<1x257x3072xf32>
-    %355 = stablehlo.multiply %354, %354 : tensor<1x257x3072xf32>
-    %356 = stablehlo.multiply %cst_7, %355 : tensor<1x257x3072xf32>
-    %357 = stablehlo.add %356, %cst_8 : tensor<1x257x3072xf32>
-    %358 = stablehlo.multiply %357, %355 : tensor<1x257x3072xf32>
-    %359 = stablehlo.add %358, %cst_9 : tensor<1x257x3072xf32>
-    %360 = stablehlo.multiply %359, %355 : tensor<1x257x3072xf32>
-    %361 = stablehlo.add %360, %cst_10 : tensor<1x257x3072xf32>
-    %362 = stablehlo.multiply %361, %355 : tensor<1x257x3072xf32>
-    %363 = stablehlo.add %362, %cst_11 : tensor<1x257x3072xf32>
-    %364 = stablehlo.multiply %363, %355 : tensor<1x257x3072xf32>
-    %365 = stablehlo.add %364, %cst_12 : tensor<1x257x3072xf32>
-    %366 = stablehlo.multiply %365, %355 : tensor<1x257x3072xf32>
-    %367 = stablehlo.add %366, %cst_13 : tensor<1x257x3072xf32>
-    %368 = stablehlo.multiply %cst_14, %355 : tensor<1x257x3072xf32>
-    %369 = stablehlo.add %368, %cst_15 : tensor<1x257x3072xf32>
-    %370 = stablehlo.multiply %369, %355 : tensor<1x257x3072xf32>
-    %371 = stablehlo.add %370, %cst_16 : tensor<1x257x3072xf32>
-    %372 = stablehlo.multiply %371, %355 : tensor<1x257x3072xf32>
-    %373 = stablehlo.add %372, %cst_17 : tensor<1x257x3072xf32>
-    %374 = stablehlo.multiply %373, %355 : tensor<1x257x3072xf32>
-    %375 = stablehlo.add %374, %cst_18 : tensor<1x257x3072xf32>
-    %376 = stablehlo.multiply %354, %367 : tensor<1x257x3072xf32>
-    %377 = stablehlo.divide %376, %375 : tensor<1x257x3072xf32>
-    %378 = stablehlo.clamp %cst_19, %377, %cst_20 : tensor<1x257x3072xf32>
-    %379 = stablehlo.convert %378 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %380 = stablehlo.add %379, %cst_2 : tensor<1x257x3072xbf16>
-    %381 = stablehlo.multiply %380, %351 : tensor<1x257x3072xbf16>
-    %382 = stablehlo.reshape %381 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %383 = stablehlo.convert %382 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %384 = stablehlo.dot_general %383, %arg88, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %385 = stablehlo.broadcast_in_dim %384, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %386 = stablehlo.multiply %385, %111 : tensor<257x768xf32>
-    %387 = stablehlo.broadcast_in_dim %386, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %388 = stablehlo.broadcast_in_dim %arg89, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %389 = stablehlo.add %387, %388 : tensor<257x768xf32>
-    %390 = stablehlo.convert %389 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %391 = stablehlo.reshape %390 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %392 = stablehlo.add %303, %391 : tensor<1x257x768xbf16>
-    %393 = stablehlo.convert %392 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %394 = stablehlo.convert %393 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %395 = stablehlo.reduce(%394 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %396 = stablehlo.reshape %395 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %397 = stablehlo.broadcast_in_dim %396, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %398 = stablehlo.divide %397, %16 : tensor<1x257x1xf64>
-    %399 = stablehlo.broadcast_in_dim %394, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %400 = stablehlo.broadcast_in_dim %398, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %401 = stablehlo.subtract %399, %400 : tensor<1x257x768xf64>
-    %402 = stablehlo.multiply %401, %401 : tensor<1x257x768xf64>
-    %403 = stablehlo.reduce(%402 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %404 = stablehlo.reshape %403 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %405 = stablehlo.broadcast_in_dim %404, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %406 = stablehlo.divide %405, %16 : tensor<1x257x1xf64>
-    %407 = stablehlo.convert %406 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %408 = stablehlo.reduce(%393 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %409 = stablehlo.reshape %408 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %410 = stablehlo.broadcast_in_dim %409, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %411 = stablehlo.divide %410, %32 : tensor<1x257x1xf32>
-    %412 = stablehlo.broadcast_in_dim %407, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %413 = stablehlo.add %412, %37 : tensor<1x257x1xf32>
-    %414 = stablehlo.rsqrt %413 : tensor<1x257x1xf32>
-    %415 = stablehlo.broadcast_in_dim %393, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %416 = stablehlo.broadcast_in_dim %411, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %417 = stablehlo.subtract %415, %416 : tensor<1x257x768xf32>
-    %418 = stablehlo.broadcast_in_dim %417, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %419 = stablehlo.broadcast_in_dim %414, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %420 = stablehlo.multiply %418, %419 : tensor<1x257x768xf32>
-    %421 = stablehlo.convert %arg12 : (tensor<768xbf16>) -> tensor<768xf32>
-    %422 = stablehlo.broadcast_in_dim %420, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %423 = stablehlo.broadcast_in_dim %421, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %424 = stablehlo.multiply %422, %423 : tensor<1x257x768xf32>
-    %425 = stablehlo.convert %arg13 : (tensor<768xbf16>) -> tensor<768xf32>
-    %426 = stablehlo.broadcast_in_dim %424, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %427 = stablehlo.broadcast_in_dim %425, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %428 = stablehlo.add %426, %427 : tensor<1x257x768xf32>
-    %429 = stablehlo.convert %428 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %430 = stablehlo.reshape %429 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %431 = stablehlo.convert %430 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %432 = stablehlo.dot_general %431, %arg90, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %433 = stablehlo.broadcast_in_dim %432, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %434 = stablehlo.multiply %433, %61 : tensor<257x2304xf32>
-    %435 = stablehlo.broadcast_in_dim %434, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %436 = stablehlo.broadcast_in_dim %arg91, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %437 = stablehlo.add %435, %436 : tensor<257x2304xf32>
-    %438 = stablehlo.convert %437 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %439 = stablehlo.reshape %438 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %440 = stablehlo.reshape %439 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %441 = stablehlo.transpose %440, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %442 = stablehlo.slice %441 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %443 = stablehlo.reshape %442 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %444 = stablehlo.slice %441 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %445 = stablehlo.reshape %444 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %446 = stablehlo.slice %441 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %447 = stablehlo.reshape %446 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %448 = stablehlo.transpose %445, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %449 = stablehlo.reshape %443 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %450 = stablehlo.reshape %448 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %451 = stablehlo.broadcast_in_dim %450, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %452 = stablehlo.dot_general %449, %451, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %453 = stablehlo.reshape %452 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %454 = stablehlo.broadcast_in_dim %453, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %455 = stablehlo.multiply %454, %85 : tensor<1x12x257x257xbf16>
-    %456 = stablehlo.convert %455 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %457 = stablehlo.reduce(%456 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %458 = stablehlo.reshape %457 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %459 = stablehlo.broadcast_in_dim %456, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %460 = stablehlo.broadcast_in_dim %458, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %461 = stablehlo.subtract %459, %460 : tensor<1x12x257x257xf32>
-    %462 = stablehlo.exponential %461 : tensor<1x12x257x257xf32>
-    %463 = stablehlo.reduce(%462 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %464 = stablehlo.reshape %463 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %465 = stablehlo.broadcast_in_dim %462, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %466 = stablehlo.broadcast_in_dim %464, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %467 = stablehlo.divide %465, %466 : tensor<1x12x257x257xf32>
-    %468 = stablehlo.convert %467 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %469 = stablehlo.reshape %468 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %470 = stablehlo.reshape %447 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %471 = stablehlo.broadcast_in_dim %470, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %472 = stablehlo.dot_general %469, %471, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %473 = stablehlo.reshape %472 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %474 = stablehlo.transpose %473, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %475 = stablehlo.reshape %474 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %476 = stablehlo.reshape %475 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %477 = stablehlo.convert %476 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %478 = stablehlo.dot_general %477, %arg92, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %479 = stablehlo.broadcast_in_dim %478, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %480 = stablehlo.multiply %479, %111 : tensor<257x768xf32>
-    %481 = stablehlo.broadcast_in_dim %480, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %482 = stablehlo.broadcast_in_dim %arg93, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %483 = stablehlo.add %481, %482 : tensor<257x768xf32>
-    %484 = stablehlo.convert %483 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %485 = stablehlo.reshape %484 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %486 = stablehlo.add %485, %392 : tensor<1x257x768xbf16>
-    %487 = stablehlo.convert %486 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %488 = stablehlo.convert %487 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %489 = stablehlo.reduce(%488 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %490 = stablehlo.reshape %489 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %491 = stablehlo.broadcast_in_dim %490, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %492 = stablehlo.divide %491, %16 : tensor<1x257x1xf64>
-    %493 = stablehlo.broadcast_in_dim %488, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %494 = stablehlo.broadcast_in_dim %492, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %495 = stablehlo.subtract %493, %494 : tensor<1x257x768xf64>
-    %496 = stablehlo.multiply %495, %495 : tensor<1x257x768xf64>
-    %497 = stablehlo.reduce(%496 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %498 = stablehlo.reshape %497 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %499 = stablehlo.broadcast_in_dim %498, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %500 = stablehlo.divide %499, %16 : tensor<1x257x1xf64>
-    %501 = stablehlo.convert %500 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %502 = stablehlo.reduce(%487 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %503 = stablehlo.reshape %502 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %504 = stablehlo.broadcast_in_dim %503, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %505 = stablehlo.divide %504, %32 : tensor<1x257x1xf32>
-    %506 = stablehlo.broadcast_in_dim %501, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %507 = stablehlo.add %506, %37 : tensor<1x257x1xf32>
-    %508 = stablehlo.rsqrt %507 : tensor<1x257x1xf32>
-    %509 = stablehlo.broadcast_in_dim %487, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %510 = stablehlo.broadcast_in_dim %505, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %511 = stablehlo.subtract %509, %510 : tensor<1x257x768xf32>
-    %512 = stablehlo.broadcast_in_dim %511, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %513 = stablehlo.broadcast_in_dim %508, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %514 = stablehlo.multiply %512, %513 : tensor<1x257x768xf32>
-    %515 = stablehlo.convert %arg14 : (tensor<768xbf16>) -> tensor<768xf32>
-    %516 = stablehlo.broadcast_in_dim %514, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %517 = stablehlo.broadcast_in_dim %515, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %518 = stablehlo.multiply %516, %517 : tensor<1x257x768xf32>
-    %519 = stablehlo.convert %arg15 : (tensor<768xbf16>) -> tensor<768xf32>
-    %520 = stablehlo.broadcast_in_dim %518, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %521 = stablehlo.broadcast_in_dim %519, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %522 = stablehlo.add %520, %521 : tensor<1x257x768xf32>
-    %523 = stablehlo.convert %522 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %524 = stablehlo.reshape %523 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %525 = stablehlo.convert %524 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %526 = stablehlo.dot_general %525, %arg94, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %527 = stablehlo.broadcast_in_dim %526, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %528 = stablehlo.multiply %527, %160 : tensor<257x3072xf32>
-    %529 = stablehlo.broadcast_in_dim %528, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %530 = stablehlo.broadcast_in_dim %arg95, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %531 = stablehlo.add %529, %530 : tensor<257x3072xf32>
-    %532 = stablehlo.convert %531 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %533 = stablehlo.reshape %532 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %534 = stablehlo.multiply %533, %cst_4 : tensor<1x257x3072xbf16>
-    %535 = stablehlo.multiply %533, %168 : tensor<1x257x3072xbf16>
-    %536 = stablehlo.convert %535 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %537 = stablehlo.clamp %cst_5, %536, %cst_6 : tensor<1x257x3072xf32>
-    %538 = stablehlo.multiply %537, %537 : tensor<1x257x3072xf32>
-    %539 = stablehlo.multiply %cst_7, %538 : tensor<1x257x3072xf32>
-    %540 = stablehlo.add %539, %cst_8 : tensor<1x257x3072xf32>
-    %541 = stablehlo.multiply %540, %538 : tensor<1x257x3072xf32>
-    %542 = stablehlo.add %541, %cst_9 : tensor<1x257x3072xf32>
-    %543 = stablehlo.multiply %542, %538 : tensor<1x257x3072xf32>
-    %544 = stablehlo.add %543, %cst_10 : tensor<1x257x3072xf32>
-    %545 = stablehlo.multiply %544, %538 : tensor<1x257x3072xf32>
-    %546 = stablehlo.add %545, %cst_11 : tensor<1x257x3072xf32>
-    %547 = stablehlo.multiply %546, %538 : tensor<1x257x3072xf32>
-    %548 = stablehlo.add %547, %cst_12 : tensor<1x257x3072xf32>
-    %549 = stablehlo.multiply %548, %538 : tensor<1x257x3072xf32>
-    %550 = stablehlo.add %549, %cst_13 : tensor<1x257x3072xf32>
-    %551 = stablehlo.multiply %cst_14, %538 : tensor<1x257x3072xf32>
-    %552 = stablehlo.add %551, %cst_15 : tensor<1x257x3072xf32>
-    %553 = stablehlo.multiply %552, %538 : tensor<1x257x3072xf32>
-    %554 = stablehlo.add %553, %cst_16 : tensor<1x257x3072xf32>
-    %555 = stablehlo.multiply %554, %538 : tensor<1x257x3072xf32>
-    %556 = stablehlo.add %555, %cst_17 : tensor<1x257x3072xf32>
-    %557 = stablehlo.multiply %556, %538 : tensor<1x257x3072xf32>
-    %558 = stablehlo.add %557, %cst_18 : tensor<1x257x3072xf32>
-    %559 = stablehlo.multiply %537, %550 : tensor<1x257x3072xf32>
-    %560 = stablehlo.divide %559, %558 : tensor<1x257x3072xf32>
-    %561 = stablehlo.clamp %cst_19, %560, %cst_20 : tensor<1x257x3072xf32>
-    %562 = stablehlo.convert %561 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %563 = stablehlo.add %562, %cst_2 : tensor<1x257x3072xbf16>
-    %564 = stablehlo.multiply %563, %534 : tensor<1x257x3072xbf16>
-    %565 = stablehlo.reshape %564 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %566 = stablehlo.convert %565 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %567 = stablehlo.dot_general %566, %arg96, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %568 = stablehlo.broadcast_in_dim %567, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %569 = stablehlo.multiply %568, %111 : tensor<257x768xf32>
-    %570 = stablehlo.broadcast_in_dim %569, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %571 = stablehlo.broadcast_in_dim %arg97, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %572 = stablehlo.add %570, %571 : tensor<257x768xf32>
-    %573 = stablehlo.convert %572 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %574 = stablehlo.reshape %573 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %575 = stablehlo.add %486, %574 : tensor<1x257x768xbf16>
-    %576 = stablehlo.convert %575 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %577 = stablehlo.convert %576 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %578 = stablehlo.reduce(%577 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %579 = stablehlo.reshape %578 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %580 = stablehlo.broadcast_in_dim %579, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %581 = stablehlo.divide %580, %16 : tensor<1x257x1xf64>
-    %582 = stablehlo.broadcast_in_dim %577, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %583 = stablehlo.broadcast_in_dim %581, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %584 = stablehlo.subtract %582, %583 : tensor<1x257x768xf64>
-    %585 = stablehlo.multiply %584, %584 : tensor<1x257x768xf64>
-    %586 = stablehlo.reduce(%585 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %587 = stablehlo.reshape %586 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %588 = stablehlo.broadcast_in_dim %587, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %589 = stablehlo.divide %588, %16 : tensor<1x257x1xf64>
-    %590 = stablehlo.convert %589 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %591 = stablehlo.reduce(%576 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %592 = stablehlo.reshape %591 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %593 = stablehlo.broadcast_in_dim %592, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %594 = stablehlo.divide %593, %32 : tensor<1x257x1xf32>
-    %595 = stablehlo.broadcast_in_dim %590, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %596 = stablehlo.add %595, %37 : tensor<1x257x1xf32>
-    %597 = stablehlo.rsqrt %596 : tensor<1x257x1xf32>
-    %598 = stablehlo.broadcast_in_dim %576, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %599 = stablehlo.broadcast_in_dim %594, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %600 = stablehlo.subtract %598, %599 : tensor<1x257x768xf32>
-    %601 = stablehlo.broadcast_in_dim %600, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %602 = stablehlo.broadcast_in_dim %597, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %603 = stablehlo.multiply %601, %602 : tensor<1x257x768xf32>
-    %604 = stablehlo.convert %arg16 : (tensor<768xbf16>) -> tensor<768xf32>
-    %605 = stablehlo.broadcast_in_dim %603, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %606 = stablehlo.broadcast_in_dim %604, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %607 = stablehlo.multiply %605, %606 : tensor<1x257x768xf32>
-    %608 = stablehlo.convert %arg17 : (tensor<768xbf16>) -> tensor<768xf32>
-    %609 = stablehlo.broadcast_in_dim %607, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %610 = stablehlo.broadcast_in_dim %608, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %611 = stablehlo.add %609, %610 : tensor<1x257x768xf32>
-    %612 = stablehlo.convert %611 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %613 = stablehlo.reshape %612 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %614 = stablehlo.convert %613 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %615 = stablehlo.dot_general %614, %arg98, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %616 = stablehlo.broadcast_in_dim %615, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %617 = stablehlo.multiply %616, %61 : tensor<257x2304xf32>
-    %618 = stablehlo.broadcast_in_dim %617, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %619 = stablehlo.broadcast_in_dim %arg99, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %620 = stablehlo.add %618, %619 : tensor<257x2304xf32>
-    %621 = stablehlo.convert %620 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %622 = stablehlo.reshape %621 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %623 = stablehlo.reshape %622 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %624 = stablehlo.transpose %623, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %625 = stablehlo.slice %624 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %626 = stablehlo.reshape %625 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %627 = stablehlo.slice %624 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %628 = stablehlo.reshape %627 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %629 = stablehlo.slice %624 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %630 = stablehlo.reshape %629 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %631 = stablehlo.transpose %628, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %632 = stablehlo.reshape %626 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %633 = stablehlo.reshape %631 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %634 = stablehlo.broadcast_in_dim %633, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %635 = stablehlo.dot_general %632, %634, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %636 = stablehlo.reshape %635 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %637 = stablehlo.broadcast_in_dim %636, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %638 = stablehlo.multiply %637, %85 : tensor<1x12x257x257xbf16>
-    %639 = stablehlo.convert %638 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %640 = stablehlo.reduce(%639 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %641 = stablehlo.reshape %640 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %642 = stablehlo.broadcast_in_dim %639, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %643 = stablehlo.broadcast_in_dim %641, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %644 = stablehlo.subtract %642, %643 : tensor<1x12x257x257xf32>
-    %645 = stablehlo.exponential %644 : tensor<1x12x257x257xf32>
-    %646 = stablehlo.reduce(%645 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %647 = stablehlo.reshape %646 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %648 = stablehlo.broadcast_in_dim %645, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %649 = stablehlo.broadcast_in_dim %647, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %650 = stablehlo.divide %648, %649 : tensor<1x12x257x257xf32>
-    %651 = stablehlo.convert %650 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %652 = stablehlo.reshape %651 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %653 = stablehlo.reshape %630 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %654 = stablehlo.broadcast_in_dim %653, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %655 = stablehlo.dot_general %652, %654, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %656 = stablehlo.reshape %655 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %657 = stablehlo.transpose %656, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %658 = stablehlo.reshape %657 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %659 = stablehlo.reshape %658 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %660 = stablehlo.convert %659 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %661 = stablehlo.dot_general %660, %arg100, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %662 = stablehlo.broadcast_in_dim %661, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %663 = stablehlo.multiply %662, %111 : tensor<257x768xf32>
-    %664 = stablehlo.broadcast_in_dim %663, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %665 = stablehlo.broadcast_in_dim %arg101, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %666 = stablehlo.add %664, %665 : tensor<257x768xf32>
-    %667 = stablehlo.convert %666 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %668 = stablehlo.reshape %667 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %669 = stablehlo.add %668, %575 : tensor<1x257x768xbf16>
-    %670 = stablehlo.convert %669 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %671 = stablehlo.convert %670 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %672 = stablehlo.reduce(%671 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %673 = stablehlo.reshape %672 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %674 = stablehlo.broadcast_in_dim %673, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %675 = stablehlo.divide %674, %16 : tensor<1x257x1xf64>
-    %676 = stablehlo.broadcast_in_dim %671, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %677 = stablehlo.broadcast_in_dim %675, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %678 = stablehlo.subtract %676, %677 : tensor<1x257x768xf64>
-    %679 = stablehlo.multiply %678, %678 : tensor<1x257x768xf64>
-    %680 = stablehlo.reduce(%679 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %681 = stablehlo.reshape %680 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %682 = stablehlo.broadcast_in_dim %681, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %683 = stablehlo.divide %682, %16 : tensor<1x257x1xf64>
-    %684 = stablehlo.convert %683 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %685 = stablehlo.reduce(%670 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %686 = stablehlo.reshape %685 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %687 = stablehlo.broadcast_in_dim %686, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %688 = stablehlo.divide %687, %32 : tensor<1x257x1xf32>
-    %689 = stablehlo.broadcast_in_dim %684, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %690 = stablehlo.add %689, %37 : tensor<1x257x1xf32>
-    %691 = stablehlo.rsqrt %690 : tensor<1x257x1xf32>
-    %692 = stablehlo.broadcast_in_dim %670, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %693 = stablehlo.broadcast_in_dim %688, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %694 = stablehlo.subtract %692, %693 : tensor<1x257x768xf32>
-    %695 = stablehlo.broadcast_in_dim %694, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %696 = stablehlo.broadcast_in_dim %691, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %697 = stablehlo.multiply %695, %696 : tensor<1x257x768xf32>
-    %698 = stablehlo.convert %arg18 : (tensor<768xbf16>) -> tensor<768xf32>
-    %699 = stablehlo.broadcast_in_dim %697, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %700 = stablehlo.broadcast_in_dim %698, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %701 = stablehlo.multiply %699, %700 : tensor<1x257x768xf32>
-    %702 = stablehlo.convert %arg19 : (tensor<768xbf16>) -> tensor<768xf32>
-    %703 = stablehlo.broadcast_in_dim %701, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %704 = stablehlo.broadcast_in_dim %702, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %705 = stablehlo.add %703, %704 : tensor<1x257x768xf32>
-    %706 = stablehlo.convert %705 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %707 = stablehlo.reshape %706 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %708 = stablehlo.convert %707 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %709 = stablehlo.dot_general %708, %arg102, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %710 = stablehlo.broadcast_in_dim %709, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %711 = stablehlo.multiply %710, %160 : tensor<257x3072xf32>
-    %712 = stablehlo.broadcast_in_dim %711, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %713 = stablehlo.broadcast_in_dim %arg103, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %714 = stablehlo.add %712, %713 : tensor<257x3072xf32>
-    %715 = stablehlo.convert %714 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %716 = stablehlo.reshape %715 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %717 = stablehlo.multiply %716, %cst_4 : tensor<1x257x3072xbf16>
-    %718 = stablehlo.multiply %716, %168 : tensor<1x257x3072xbf16>
-    %719 = stablehlo.convert %718 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %720 = stablehlo.clamp %cst_5, %719, %cst_6 : tensor<1x257x3072xf32>
-    %721 = stablehlo.multiply %720, %720 : tensor<1x257x3072xf32>
-    %722 = stablehlo.multiply %cst_7, %721 : tensor<1x257x3072xf32>
-    %723 = stablehlo.add %722, %cst_8 : tensor<1x257x3072xf32>
-    %724 = stablehlo.multiply %723, %721 : tensor<1x257x3072xf32>
-    %725 = stablehlo.add %724, %cst_9 : tensor<1x257x3072xf32>
-    %726 = stablehlo.multiply %725, %721 : tensor<1x257x3072xf32>
-    %727 = stablehlo.add %726, %cst_10 : tensor<1x257x3072xf32>
-    %728 = stablehlo.multiply %727, %721 : tensor<1x257x3072xf32>
-    %729 = stablehlo.add %728, %cst_11 : tensor<1x257x3072xf32>
-    %730 = stablehlo.multiply %729, %721 : tensor<1x257x3072xf32>
-    %731 = stablehlo.add %730, %cst_12 : tensor<1x257x3072xf32>
-    %732 = stablehlo.multiply %731, %721 : tensor<1x257x3072xf32>
-    %733 = stablehlo.add %732, %cst_13 : tensor<1x257x3072xf32>
-    %734 = stablehlo.multiply %cst_14, %721 : tensor<1x257x3072xf32>
-    %735 = stablehlo.add %734, %cst_15 : tensor<1x257x3072xf32>
-    %736 = stablehlo.multiply %735, %721 : tensor<1x257x3072xf32>
-    %737 = stablehlo.add %736, %cst_16 : tensor<1x257x3072xf32>
-    %738 = stablehlo.multiply %737, %721 : tensor<1x257x3072xf32>
-    %739 = stablehlo.add %738, %cst_17 : tensor<1x257x3072xf32>
-    %740 = stablehlo.multiply %739, %721 : tensor<1x257x3072xf32>
-    %741 = stablehlo.add %740, %cst_18 : tensor<1x257x3072xf32>
-    %742 = stablehlo.multiply %720, %733 : tensor<1x257x3072xf32>
-    %743 = stablehlo.divide %742, %741 : tensor<1x257x3072xf32>
-    %744 = stablehlo.clamp %cst_19, %743, %cst_20 : tensor<1x257x3072xf32>
-    %745 = stablehlo.convert %744 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %746 = stablehlo.add %745, %cst_2 : tensor<1x257x3072xbf16>
-    %747 = stablehlo.multiply %746, %717 : tensor<1x257x3072xbf16>
-    %748 = stablehlo.reshape %747 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %749 = stablehlo.convert %748 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %750 = stablehlo.dot_general %749, %arg104, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %751 = stablehlo.broadcast_in_dim %750, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %752 = stablehlo.multiply %751, %111 : tensor<257x768xf32>
-    %753 = stablehlo.broadcast_in_dim %752, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %754 = stablehlo.broadcast_in_dim %arg105, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %755 = stablehlo.add %753, %754 : tensor<257x768xf32>
-    %756 = stablehlo.convert %755 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %757 = stablehlo.reshape %756 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %758 = stablehlo.add %669, %757 : tensor<1x257x768xbf16>
-    %759 = stablehlo.convert %758 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %760 = stablehlo.convert %759 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %761 = stablehlo.reduce(%760 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %762 = stablehlo.reshape %761 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %763 = stablehlo.broadcast_in_dim %762, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %764 = stablehlo.divide %763, %16 : tensor<1x257x1xf64>
-    %765 = stablehlo.broadcast_in_dim %760, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %766 = stablehlo.broadcast_in_dim %764, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %767 = stablehlo.subtract %765, %766 : tensor<1x257x768xf64>
-    %768 = stablehlo.multiply %767, %767 : tensor<1x257x768xf64>
-    %769 = stablehlo.reduce(%768 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %770 = stablehlo.reshape %769 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %771 = stablehlo.broadcast_in_dim %770, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %772 = stablehlo.divide %771, %16 : tensor<1x257x1xf64>
-    %773 = stablehlo.convert %772 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %774 = stablehlo.reduce(%759 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %775 = stablehlo.reshape %774 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %776 = stablehlo.broadcast_in_dim %775, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %777 = stablehlo.divide %776, %32 : tensor<1x257x1xf32>
-    %778 = stablehlo.broadcast_in_dim %773, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %779 = stablehlo.add %778, %37 : tensor<1x257x1xf32>
-    %780 = stablehlo.rsqrt %779 : tensor<1x257x1xf32>
-    %781 = stablehlo.broadcast_in_dim %759, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %782 = stablehlo.broadcast_in_dim %777, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %783 = stablehlo.subtract %781, %782 : tensor<1x257x768xf32>
-    %784 = stablehlo.broadcast_in_dim %783, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %785 = stablehlo.broadcast_in_dim %780, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %786 = stablehlo.multiply %784, %785 : tensor<1x257x768xf32>
-    %787 = stablehlo.convert %arg20 : (tensor<768xbf16>) -> tensor<768xf32>
-    %788 = stablehlo.broadcast_in_dim %786, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %789 = stablehlo.broadcast_in_dim %787, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %790 = stablehlo.multiply %788, %789 : tensor<1x257x768xf32>
-    %791 = stablehlo.convert %arg21 : (tensor<768xbf16>) -> tensor<768xf32>
-    %792 = stablehlo.broadcast_in_dim %790, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %793 = stablehlo.broadcast_in_dim %791, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %794 = stablehlo.add %792, %793 : tensor<1x257x768xf32>
-    %795 = stablehlo.convert %794 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %796 = stablehlo.reshape %795 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %797 = stablehlo.convert %796 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %798 = stablehlo.dot_general %797, %arg106, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %799 = stablehlo.broadcast_in_dim %798, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %800 = stablehlo.multiply %799, %61 : tensor<257x2304xf32>
-    %801 = stablehlo.broadcast_in_dim %800, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %802 = stablehlo.broadcast_in_dim %arg107, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %803 = stablehlo.add %801, %802 : tensor<257x2304xf32>
-    %804 = stablehlo.convert %803 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %805 = stablehlo.reshape %804 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %806 = stablehlo.reshape %805 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %807 = stablehlo.transpose %806, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %808 = stablehlo.slice %807 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %809 = stablehlo.reshape %808 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %810 = stablehlo.slice %807 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %811 = stablehlo.reshape %810 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %812 = stablehlo.slice %807 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %813 = stablehlo.reshape %812 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %814 = stablehlo.transpose %811, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %815 = stablehlo.reshape %809 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %816 = stablehlo.reshape %814 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %817 = stablehlo.broadcast_in_dim %816, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %818 = stablehlo.dot_general %815, %817, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %819 = stablehlo.reshape %818 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %820 = stablehlo.broadcast_in_dim %819, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %821 = stablehlo.multiply %820, %85 : tensor<1x12x257x257xbf16>
-    %822 = stablehlo.convert %821 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %823 = stablehlo.reduce(%822 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %824 = stablehlo.reshape %823 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %825 = stablehlo.broadcast_in_dim %822, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %826 = stablehlo.broadcast_in_dim %824, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %827 = stablehlo.subtract %825, %826 : tensor<1x12x257x257xf32>
-    %828 = stablehlo.exponential %827 : tensor<1x12x257x257xf32>
-    %829 = stablehlo.reduce(%828 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %830 = stablehlo.reshape %829 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %831 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %832 = stablehlo.broadcast_in_dim %830, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %833 = stablehlo.divide %831, %832 : tensor<1x12x257x257xf32>
-    %834 = stablehlo.convert %833 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %835 = stablehlo.reshape %834 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %836 = stablehlo.reshape %813 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %837 = stablehlo.broadcast_in_dim %836, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %838 = stablehlo.dot_general %835, %837, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %839 = stablehlo.reshape %838 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %840 = stablehlo.transpose %839, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %841 = stablehlo.reshape %840 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %842 = stablehlo.reshape %841 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %843 = stablehlo.convert %842 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %844 = stablehlo.dot_general %843, %arg108, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %845 = stablehlo.broadcast_in_dim %844, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %846 = stablehlo.multiply %845, %111 : tensor<257x768xf32>
-    %847 = stablehlo.broadcast_in_dim %846, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %848 = stablehlo.broadcast_in_dim %arg109, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %849 = stablehlo.add %847, %848 : tensor<257x768xf32>
-    %850 = stablehlo.convert %849 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %851 = stablehlo.reshape %850 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %852 = stablehlo.add %851, %758 : tensor<1x257x768xbf16>
-    %853 = stablehlo.convert %852 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %854 = stablehlo.convert %853 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %855 = stablehlo.reduce(%854 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %856 = stablehlo.reshape %855 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %857 = stablehlo.broadcast_in_dim %856, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %858 = stablehlo.divide %857, %16 : tensor<1x257x1xf64>
-    %859 = stablehlo.broadcast_in_dim %854, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %860 = stablehlo.broadcast_in_dim %858, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %861 = stablehlo.subtract %859, %860 : tensor<1x257x768xf64>
-    %862 = stablehlo.multiply %861, %861 : tensor<1x257x768xf64>
-    %863 = stablehlo.reduce(%862 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %864 = stablehlo.reshape %863 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %865 = stablehlo.broadcast_in_dim %864, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %866 = stablehlo.divide %865, %16 : tensor<1x257x1xf64>
-    %867 = stablehlo.convert %866 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %868 = stablehlo.reduce(%853 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %869 = stablehlo.reshape %868 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %870 = stablehlo.broadcast_in_dim %869, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %871 = stablehlo.divide %870, %32 : tensor<1x257x1xf32>
-    %872 = stablehlo.broadcast_in_dim %867, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %873 = stablehlo.add %872, %37 : tensor<1x257x1xf32>
-    %874 = stablehlo.rsqrt %873 : tensor<1x257x1xf32>
-    %875 = stablehlo.broadcast_in_dim %853, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %876 = stablehlo.broadcast_in_dim %871, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %877 = stablehlo.subtract %875, %876 : tensor<1x257x768xf32>
-    %878 = stablehlo.broadcast_in_dim %877, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %879 = stablehlo.broadcast_in_dim %874, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %880 = stablehlo.multiply %878, %879 : tensor<1x257x768xf32>
-    %881 = stablehlo.convert %arg22 : (tensor<768xbf16>) -> tensor<768xf32>
-    %882 = stablehlo.broadcast_in_dim %880, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %883 = stablehlo.broadcast_in_dim %881, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %884 = stablehlo.multiply %882, %883 : tensor<1x257x768xf32>
-    %885 = stablehlo.convert %arg23 : (tensor<768xbf16>) -> tensor<768xf32>
-    %886 = stablehlo.broadcast_in_dim %884, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %887 = stablehlo.broadcast_in_dim %885, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %888 = stablehlo.add %886, %887 : tensor<1x257x768xf32>
-    %889 = stablehlo.convert %888 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %890 = stablehlo.reshape %889 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %891 = stablehlo.convert %890 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %892 = stablehlo.dot_general %891, %arg110, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %893 = stablehlo.broadcast_in_dim %892, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %894 = stablehlo.multiply %893, %160 : tensor<257x3072xf32>
-    %895 = stablehlo.broadcast_in_dim %894, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %896 = stablehlo.broadcast_in_dim %arg111, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %897 = stablehlo.add %895, %896 : tensor<257x3072xf32>
-    %898 = stablehlo.convert %897 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %899 = stablehlo.reshape %898 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %900 = stablehlo.multiply %899, %cst_4 : tensor<1x257x3072xbf16>
-    %901 = stablehlo.multiply %899, %168 : tensor<1x257x3072xbf16>
-    %902 = stablehlo.convert %901 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %903 = stablehlo.clamp %cst_5, %902, %cst_6 : tensor<1x257x3072xf32>
-    %904 = stablehlo.multiply %903, %903 : tensor<1x257x3072xf32>
-    %905 = stablehlo.multiply %cst_7, %904 : tensor<1x257x3072xf32>
-    %906 = stablehlo.add %905, %cst_8 : tensor<1x257x3072xf32>
-    %907 = stablehlo.multiply %906, %904 : tensor<1x257x3072xf32>
-    %908 = stablehlo.add %907, %cst_9 : tensor<1x257x3072xf32>
-    %909 = stablehlo.multiply %908, %904 : tensor<1x257x3072xf32>
-    %910 = stablehlo.add %909, %cst_10 : tensor<1x257x3072xf32>
-    %911 = stablehlo.multiply %910, %904 : tensor<1x257x3072xf32>
-    %912 = stablehlo.add %911, %cst_11 : tensor<1x257x3072xf32>
-    %913 = stablehlo.multiply %912, %904 : tensor<1x257x3072xf32>
-    %914 = stablehlo.add %913, %cst_12 : tensor<1x257x3072xf32>
-    %915 = stablehlo.multiply %914, %904 : tensor<1x257x3072xf32>
-    %916 = stablehlo.add %915, %cst_13 : tensor<1x257x3072xf32>
-    %917 = stablehlo.multiply %cst_14, %904 : tensor<1x257x3072xf32>
-    %918 = stablehlo.add %917, %cst_15 : tensor<1x257x3072xf32>
-    %919 = stablehlo.multiply %918, %904 : tensor<1x257x3072xf32>
-    %920 = stablehlo.add %919, %cst_16 : tensor<1x257x3072xf32>
-    %921 = stablehlo.multiply %920, %904 : tensor<1x257x3072xf32>
-    %922 = stablehlo.add %921, %cst_17 : tensor<1x257x3072xf32>
-    %923 = stablehlo.multiply %922, %904 : tensor<1x257x3072xf32>
-    %924 = stablehlo.add %923, %cst_18 : tensor<1x257x3072xf32>
-    %925 = stablehlo.multiply %903, %916 : tensor<1x257x3072xf32>
-    %926 = stablehlo.divide %925, %924 : tensor<1x257x3072xf32>
-    %927 = stablehlo.clamp %cst_19, %926, %cst_20 : tensor<1x257x3072xf32>
-    %928 = stablehlo.convert %927 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %929 = stablehlo.add %928, %cst_2 : tensor<1x257x3072xbf16>
-    %930 = stablehlo.multiply %929, %900 : tensor<1x257x3072xbf16>
-    %931 = stablehlo.reshape %930 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %932 = stablehlo.convert %931 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %933 = stablehlo.dot_general %932, %arg112, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %934 = stablehlo.broadcast_in_dim %933, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %935 = stablehlo.multiply %934, %111 : tensor<257x768xf32>
-    %936 = stablehlo.broadcast_in_dim %935, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %937 = stablehlo.broadcast_in_dim %arg113, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %938 = stablehlo.add %936, %937 : tensor<257x768xf32>
-    %939 = stablehlo.convert %938 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %940 = stablehlo.reshape %939 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %941 = stablehlo.add %852, %940 : tensor<1x257x768xbf16>
-    %942 = stablehlo.convert %941 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %943 = stablehlo.convert %942 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %944 = stablehlo.reduce(%943 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %945 = stablehlo.reshape %944 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %946 = stablehlo.broadcast_in_dim %945, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %947 = stablehlo.divide %946, %16 : tensor<1x257x1xf64>
-    %948 = stablehlo.broadcast_in_dim %943, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %949 = stablehlo.broadcast_in_dim %947, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %950 = stablehlo.subtract %948, %949 : tensor<1x257x768xf64>
-    %951 = stablehlo.multiply %950, %950 : tensor<1x257x768xf64>
-    %952 = stablehlo.reduce(%951 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %953 = stablehlo.reshape %952 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %954 = stablehlo.broadcast_in_dim %953, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %955 = stablehlo.divide %954, %16 : tensor<1x257x1xf64>
-    %956 = stablehlo.convert %955 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %957 = stablehlo.reduce(%942 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %958 = stablehlo.reshape %957 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %959 = stablehlo.broadcast_in_dim %958, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %960 = stablehlo.divide %959, %32 : tensor<1x257x1xf32>
-    %961 = stablehlo.broadcast_in_dim %956, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %962 = stablehlo.add %961, %37 : tensor<1x257x1xf32>
-    %963 = stablehlo.rsqrt %962 : tensor<1x257x1xf32>
-    %964 = stablehlo.broadcast_in_dim %942, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %965 = stablehlo.broadcast_in_dim %960, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %966 = stablehlo.subtract %964, %965 : tensor<1x257x768xf32>
-    %967 = stablehlo.broadcast_in_dim %966, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %968 = stablehlo.broadcast_in_dim %963, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %969 = stablehlo.multiply %967, %968 : tensor<1x257x768xf32>
-    %970 = stablehlo.convert %arg24 : (tensor<768xbf16>) -> tensor<768xf32>
-    %971 = stablehlo.broadcast_in_dim %969, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %972 = stablehlo.broadcast_in_dim %970, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %973 = stablehlo.multiply %971, %972 : tensor<1x257x768xf32>
-    %974 = stablehlo.convert %arg25 : (tensor<768xbf16>) -> tensor<768xf32>
-    %975 = stablehlo.broadcast_in_dim %973, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %976 = stablehlo.broadcast_in_dim %974, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %977 = stablehlo.add %975, %976 : tensor<1x257x768xf32>
-    %978 = stablehlo.convert %977 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %979 = stablehlo.reshape %978 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %980 = stablehlo.convert %979 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %981 = stablehlo.dot_general %980, %arg114, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %982 = stablehlo.broadcast_in_dim %981, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %983 = stablehlo.multiply %982, %61 : tensor<257x2304xf32>
-    %984 = stablehlo.broadcast_in_dim %983, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %985 = stablehlo.broadcast_in_dim %arg115, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %986 = stablehlo.add %984, %985 : tensor<257x2304xf32>
-    %987 = stablehlo.convert %986 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %988 = stablehlo.reshape %987 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %989 = stablehlo.reshape %988 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %990 = stablehlo.transpose %989, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %991 = stablehlo.slice %990 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %992 = stablehlo.reshape %991 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %993 = stablehlo.slice %990 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %994 = stablehlo.reshape %993 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %995 = stablehlo.slice %990 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %996 = stablehlo.reshape %995 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %997 = stablehlo.transpose %994, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %998 = stablehlo.reshape %992 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %999 = stablehlo.reshape %997 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1000 = stablehlo.broadcast_in_dim %999, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1001 = stablehlo.dot_general %998, %1000, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %1002 = stablehlo.reshape %1001 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1003 = stablehlo.broadcast_in_dim %1002, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1004 = stablehlo.multiply %1003, %85 : tensor<1x12x257x257xbf16>
-    %1005 = stablehlo.convert %1004 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %1006 = stablehlo.reduce(%1005 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1007 = stablehlo.reshape %1006 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1008 = stablehlo.broadcast_in_dim %1005, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1009 = stablehlo.broadcast_in_dim %1007, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1010 = stablehlo.subtract %1008, %1009 : tensor<1x12x257x257xf32>
-    %1011 = stablehlo.exponential %1010 : tensor<1x12x257x257xf32>
-    %1012 = stablehlo.reduce(%1011 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1013 = stablehlo.reshape %1012 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1014 = stablehlo.broadcast_in_dim %1011, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1015 = stablehlo.broadcast_in_dim %1013, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1016 = stablehlo.divide %1014, %1015 : tensor<1x12x257x257xf32>
-    %1017 = stablehlo.convert %1016 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %1018 = stablehlo.reshape %1017 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %1019 = stablehlo.reshape %996 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1020 = stablehlo.broadcast_in_dim %1019, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1021 = stablehlo.dot_general %1018, %1020, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1022 = stablehlo.reshape %1021 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1023 = stablehlo.transpose %1022, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %1024 = stablehlo.reshape %1023 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %1025 = stablehlo.reshape %1024 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1026 = stablehlo.convert %1025 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1027 = stablehlo.dot_general %1026, %arg116, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %1028 = stablehlo.broadcast_in_dim %1027, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1029 = stablehlo.multiply %1028, %111 : tensor<257x768xf32>
-    %1030 = stablehlo.broadcast_in_dim %1029, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1031 = stablehlo.broadcast_in_dim %arg117, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1032 = stablehlo.add %1030, %1031 : tensor<257x768xf32>
-    %1033 = stablehlo.convert %1032 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1034 = stablehlo.reshape %1033 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1035 = stablehlo.add %1034, %941 : tensor<1x257x768xbf16>
-    %1036 = stablehlo.convert %1035 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1037 = stablehlo.convert %1036 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1038 = stablehlo.reduce(%1037 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1039 = stablehlo.reshape %1038 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1040 = stablehlo.broadcast_in_dim %1039, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1041 = stablehlo.divide %1040, %16 : tensor<1x257x1xf64>
-    %1042 = stablehlo.broadcast_in_dim %1037, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1043 = stablehlo.broadcast_in_dim %1041, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1044 = stablehlo.subtract %1042, %1043 : tensor<1x257x768xf64>
-    %1045 = stablehlo.multiply %1044, %1044 : tensor<1x257x768xf64>
-    %1046 = stablehlo.reduce(%1045 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1047 = stablehlo.reshape %1046 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1048 = stablehlo.broadcast_in_dim %1047, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1049 = stablehlo.divide %1048, %16 : tensor<1x257x1xf64>
-    %1050 = stablehlo.convert %1049 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1051 = stablehlo.reduce(%1036 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1052 = stablehlo.reshape %1051 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1053 = stablehlo.broadcast_in_dim %1052, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1054 = stablehlo.divide %1053, %32 : tensor<1x257x1xf32>
-    %1055 = stablehlo.broadcast_in_dim %1050, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1056 = stablehlo.add %1055, %37 : tensor<1x257x1xf32>
-    %1057 = stablehlo.rsqrt %1056 : tensor<1x257x1xf32>
-    %1058 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1059 = stablehlo.broadcast_in_dim %1054, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1060 = stablehlo.subtract %1058, %1059 : tensor<1x257x768xf32>
-    %1061 = stablehlo.broadcast_in_dim %1060, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1062 = stablehlo.broadcast_in_dim %1057, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1063 = stablehlo.multiply %1061, %1062 : tensor<1x257x768xf32>
-    %1064 = stablehlo.convert %arg26 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1065 = stablehlo.broadcast_in_dim %1063, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1066 = stablehlo.broadcast_in_dim %1064, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1067 = stablehlo.multiply %1065, %1066 : tensor<1x257x768xf32>
-    %1068 = stablehlo.convert %arg27 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1069 = stablehlo.broadcast_in_dim %1067, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1070 = stablehlo.broadcast_in_dim %1068, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1071 = stablehlo.add %1069, %1070 : tensor<1x257x768xf32>
-    %1072 = stablehlo.convert %1071 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1073 = stablehlo.reshape %1072 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1074 = stablehlo.convert %1073 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1075 = stablehlo.dot_general %1074, %arg118, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %1076 = stablehlo.broadcast_in_dim %1075, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1077 = stablehlo.multiply %1076, %160 : tensor<257x3072xf32>
-    %1078 = stablehlo.broadcast_in_dim %1077, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1079 = stablehlo.broadcast_in_dim %arg119, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %1080 = stablehlo.add %1078, %1079 : tensor<257x3072xf32>
-    %1081 = stablehlo.convert %1080 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %1082 = stablehlo.reshape %1081 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %1083 = stablehlo.multiply %1082, %cst_4 : tensor<1x257x3072xbf16>
-    %1084 = stablehlo.multiply %1082, %168 : tensor<1x257x3072xbf16>
-    %1085 = stablehlo.convert %1084 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %1086 = stablehlo.clamp %cst_5, %1085, %cst_6 : tensor<1x257x3072xf32>
-    %1087 = stablehlo.multiply %1086, %1086 : tensor<1x257x3072xf32>
-    %1088 = stablehlo.multiply %cst_7, %1087 : tensor<1x257x3072xf32>
-    %1089 = stablehlo.add %1088, %cst_8 : tensor<1x257x3072xf32>
-    %1090 = stablehlo.multiply %1089, %1087 : tensor<1x257x3072xf32>
-    %1091 = stablehlo.add %1090, %cst_9 : tensor<1x257x3072xf32>
-    %1092 = stablehlo.multiply %1091, %1087 : tensor<1x257x3072xf32>
-    %1093 = stablehlo.add %1092, %cst_10 : tensor<1x257x3072xf32>
-    %1094 = stablehlo.multiply %1093, %1087 : tensor<1x257x3072xf32>
-    %1095 = stablehlo.add %1094, %cst_11 : tensor<1x257x3072xf32>
-    %1096 = stablehlo.multiply %1095, %1087 : tensor<1x257x3072xf32>
-    %1097 = stablehlo.add %1096, %cst_12 : tensor<1x257x3072xf32>
-    %1098 = stablehlo.multiply %1097, %1087 : tensor<1x257x3072xf32>
-    %1099 = stablehlo.add %1098, %cst_13 : tensor<1x257x3072xf32>
-    %1100 = stablehlo.multiply %cst_14, %1087 : tensor<1x257x3072xf32>
-    %1101 = stablehlo.add %1100, %cst_15 : tensor<1x257x3072xf32>
-    %1102 = stablehlo.multiply %1101, %1087 : tensor<1x257x3072xf32>
-    %1103 = stablehlo.add %1102, %cst_16 : tensor<1x257x3072xf32>
-    %1104 = stablehlo.multiply %1103, %1087 : tensor<1x257x3072xf32>
-    %1105 = stablehlo.add %1104, %cst_17 : tensor<1x257x3072xf32>
-    %1106 = stablehlo.multiply %1105, %1087 : tensor<1x257x3072xf32>
-    %1107 = stablehlo.add %1106, %cst_18 : tensor<1x257x3072xf32>
-    %1108 = stablehlo.multiply %1086, %1099 : tensor<1x257x3072xf32>
-    %1109 = stablehlo.divide %1108, %1107 : tensor<1x257x3072xf32>
-    %1110 = stablehlo.clamp %cst_19, %1109, %cst_20 : tensor<1x257x3072xf32>
-    %1111 = stablehlo.convert %1110 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %1112 = stablehlo.add %1111, %cst_2 : tensor<1x257x3072xbf16>
-    %1113 = stablehlo.multiply %1112, %1083 : tensor<1x257x3072xbf16>
-    %1114 = stablehlo.reshape %1113 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %1115 = stablehlo.convert %1114 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %1116 = stablehlo.dot_general %1115, %arg120, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %1117 = stablehlo.broadcast_in_dim %1116, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1118 = stablehlo.multiply %1117, %111 : tensor<257x768xf32>
-    %1119 = stablehlo.broadcast_in_dim %1118, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1120 = stablehlo.broadcast_in_dim %arg121, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1121 = stablehlo.add %1119, %1120 : tensor<257x768xf32>
-    %1122 = stablehlo.convert %1121 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1123 = stablehlo.reshape %1122 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1124 = stablehlo.add %1035, %1123 : tensor<1x257x768xbf16>
-    %1125 = stablehlo.convert %1124 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1126 = stablehlo.convert %1125 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1127 = stablehlo.reduce(%1126 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1128 = stablehlo.reshape %1127 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1129 = stablehlo.broadcast_in_dim %1128, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1130 = stablehlo.divide %1129, %16 : tensor<1x257x1xf64>
-    %1131 = stablehlo.broadcast_in_dim %1126, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1132 = stablehlo.broadcast_in_dim %1130, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1133 = stablehlo.subtract %1131, %1132 : tensor<1x257x768xf64>
-    %1134 = stablehlo.multiply %1133, %1133 : tensor<1x257x768xf64>
-    %1135 = stablehlo.reduce(%1134 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1136 = stablehlo.reshape %1135 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1137 = stablehlo.broadcast_in_dim %1136, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1138 = stablehlo.divide %1137, %16 : tensor<1x257x1xf64>
-    %1139 = stablehlo.convert %1138 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1140 = stablehlo.reduce(%1125 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1141 = stablehlo.reshape %1140 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1142 = stablehlo.broadcast_in_dim %1141, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1143 = stablehlo.divide %1142, %32 : tensor<1x257x1xf32>
-    %1144 = stablehlo.broadcast_in_dim %1139, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1145 = stablehlo.add %1144, %37 : tensor<1x257x1xf32>
-    %1146 = stablehlo.rsqrt %1145 : tensor<1x257x1xf32>
-    %1147 = stablehlo.broadcast_in_dim %1125, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1148 = stablehlo.broadcast_in_dim %1143, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1149 = stablehlo.subtract %1147, %1148 : tensor<1x257x768xf32>
-    %1150 = stablehlo.broadcast_in_dim %1149, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1151 = stablehlo.broadcast_in_dim %1146, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1152 = stablehlo.multiply %1150, %1151 : tensor<1x257x768xf32>
-    %1153 = stablehlo.convert %arg28 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1154 = stablehlo.broadcast_in_dim %1152, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1155 = stablehlo.broadcast_in_dim %1153, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1156 = stablehlo.multiply %1154, %1155 : tensor<1x257x768xf32>
-    %1157 = stablehlo.convert %arg29 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1158 = stablehlo.broadcast_in_dim %1156, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1159 = stablehlo.broadcast_in_dim %1157, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1160 = stablehlo.add %1158, %1159 : tensor<1x257x768xf32>
-    %1161 = stablehlo.convert %1160 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1162 = stablehlo.reshape %1161 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1163 = stablehlo.convert %1162 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1164 = stablehlo.dot_general %1163, %arg122, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %1165 = stablehlo.broadcast_in_dim %1164, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1166 = stablehlo.multiply %1165, %61 : tensor<257x2304xf32>
-    %1167 = stablehlo.broadcast_in_dim %1166, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1168 = stablehlo.broadcast_in_dim %arg123, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %1169 = stablehlo.add %1167, %1168 : tensor<257x2304xf32>
-    %1170 = stablehlo.convert %1169 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %1171 = stablehlo.reshape %1170 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %1172 = stablehlo.reshape %1171 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %1173 = stablehlo.transpose %1172, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %1174 = stablehlo.slice %1173 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1175 = stablehlo.reshape %1174 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1176 = stablehlo.slice %1173 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1177 = stablehlo.reshape %1176 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1178 = stablehlo.slice %1173 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1179 = stablehlo.reshape %1178 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1180 = stablehlo.transpose %1177, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %1181 = stablehlo.reshape %1175 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1182 = stablehlo.reshape %1180 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1183 = stablehlo.broadcast_in_dim %1182, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1184 = stablehlo.dot_general %1181, %1183, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %1185 = stablehlo.reshape %1184 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1186 = stablehlo.broadcast_in_dim %1185, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1187 = stablehlo.multiply %1186, %85 : tensor<1x12x257x257xbf16>
-    %1188 = stablehlo.convert %1187 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %1189 = stablehlo.reduce(%1188 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1190 = stablehlo.reshape %1189 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1191 = stablehlo.broadcast_in_dim %1188, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1192 = stablehlo.broadcast_in_dim %1190, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1193 = stablehlo.subtract %1191, %1192 : tensor<1x12x257x257xf32>
-    %1194 = stablehlo.exponential %1193 : tensor<1x12x257x257xf32>
-    %1195 = stablehlo.reduce(%1194 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1196 = stablehlo.reshape %1195 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1197 = stablehlo.broadcast_in_dim %1194, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1198 = stablehlo.broadcast_in_dim %1196, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1199 = stablehlo.divide %1197, %1198 : tensor<1x12x257x257xf32>
-    %1200 = stablehlo.convert %1199 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %1201 = stablehlo.reshape %1200 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %1202 = stablehlo.reshape %1179 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1203 = stablehlo.broadcast_in_dim %1202, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1204 = stablehlo.dot_general %1201, %1203, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1205 = stablehlo.reshape %1204 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1206 = stablehlo.transpose %1205, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %1207 = stablehlo.reshape %1206 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %1208 = stablehlo.reshape %1207 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1209 = stablehlo.convert %1208 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1210 = stablehlo.dot_general %1209, %arg124, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %1211 = stablehlo.broadcast_in_dim %1210, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1212 = stablehlo.multiply %1211, %111 : tensor<257x768xf32>
-    %1213 = stablehlo.broadcast_in_dim %1212, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1214 = stablehlo.broadcast_in_dim %arg125, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1215 = stablehlo.add %1213, %1214 : tensor<257x768xf32>
-    %1216 = stablehlo.convert %1215 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1217 = stablehlo.reshape %1216 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1218 = stablehlo.add %1217, %1124 : tensor<1x257x768xbf16>
-    %1219 = stablehlo.convert %1218 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1220 = stablehlo.convert %1219 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1221 = stablehlo.reduce(%1220 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1222 = stablehlo.reshape %1221 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1223 = stablehlo.broadcast_in_dim %1222, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1224 = stablehlo.divide %1223, %16 : tensor<1x257x1xf64>
-    %1225 = stablehlo.broadcast_in_dim %1220, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1226 = stablehlo.broadcast_in_dim %1224, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1227 = stablehlo.subtract %1225, %1226 : tensor<1x257x768xf64>
-    %1228 = stablehlo.multiply %1227, %1227 : tensor<1x257x768xf64>
-    %1229 = stablehlo.reduce(%1228 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1230 = stablehlo.reshape %1229 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1231 = stablehlo.broadcast_in_dim %1230, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1232 = stablehlo.divide %1231, %16 : tensor<1x257x1xf64>
-    %1233 = stablehlo.convert %1232 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1234 = stablehlo.reduce(%1219 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1235 = stablehlo.reshape %1234 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1236 = stablehlo.broadcast_in_dim %1235, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1237 = stablehlo.divide %1236, %32 : tensor<1x257x1xf32>
-    %1238 = stablehlo.broadcast_in_dim %1233, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1239 = stablehlo.add %1238, %37 : tensor<1x257x1xf32>
-    %1240 = stablehlo.rsqrt %1239 : tensor<1x257x1xf32>
-    %1241 = stablehlo.broadcast_in_dim %1219, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1242 = stablehlo.broadcast_in_dim %1237, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1243 = stablehlo.subtract %1241, %1242 : tensor<1x257x768xf32>
-    %1244 = stablehlo.broadcast_in_dim %1243, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1245 = stablehlo.broadcast_in_dim %1240, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1246 = stablehlo.multiply %1244, %1245 : tensor<1x257x768xf32>
-    %1247 = stablehlo.convert %arg30 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1248 = stablehlo.broadcast_in_dim %1246, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1249 = stablehlo.broadcast_in_dim %1247, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1250 = stablehlo.multiply %1248, %1249 : tensor<1x257x768xf32>
-    %1251 = stablehlo.convert %arg31 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1252 = stablehlo.broadcast_in_dim %1250, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1253 = stablehlo.broadcast_in_dim %1251, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1254 = stablehlo.add %1252, %1253 : tensor<1x257x768xf32>
-    %1255 = stablehlo.convert %1254 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1256 = stablehlo.reshape %1255 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1257 = stablehlo.convert %1256 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1258 = stablehlo.dot_general %1257, %arg126, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %1259 = stablehlo.broadcast_in_dim %1258, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1260 = stablehlo.multiply %1259, %160 : tensor<257x3072xf32>
-    %1261 = stablehlo.broadcast_in_dim %1260, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1262 = stablehlo.broadcast_in_dim %arg127, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %1263 = stablehlo.add %1261, %1262 : tensor<257x3072xf32>
-    %1264 = stablehlo.convert %1263 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %1265 = stablehlo.reshape %1264 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %1266 = stablehlo.multiply %1265, %cst_4 : tensor<1x257x3072xbf16>
-    %1267 = stablehlo.multiply %1265, %168 : tensor<1x257x3072xbf16>
-    %1268 = stablehlo.convert %1267 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %1269 = stablehlo.clamp %cst_5, %1268, %cst_6 : tensor<1x257x3072xf32>
-    %1270 = stablehlo.multiply %1269, %1269 : tensor<1x257x3072xf32>
-    %1271 = stablehlo.multiply %cst_7, %1270 : tensor<1x257x3072xf32>
-    %1272 = stablehlo.add %1271, %cst_8 : tensor<1x257x3072xf32>
-    %1273 = stablehlo.multiply %1272, %1270 : tensor<1x257x3072xf32>
-    %1274 = stablehlo.add %1273, %cst_9 : tensor<1x257x3072xf32>
-    %1275 = stablehlo.multiply %1274, %1270 : tensor<1x257x3072xf32>
-    %1276 = stablehlo.add %1275, %cst_10 : tensor<1x257x3072xf32>
-    %1277 = stablehlo.multiply %1276, %1270 : tensor<1x257x3072xf32>
-    %1278 = stablehlo.add %1277, %cst_11 : tensor<1x257x3072xf32>
-    %1279 = stablehlo.multiply %1278, %1270 : tensor<1x257x3072xf32>
-    %1280 = stablehlo.add %1279, %cst_12 : tensor<1x257x3072xf32>
-    %1281 = stablehlo.multiply %1280, %1270 : tensor<1x257x3072xf32>
-    %1282 = stablehlo.add %1281, %cst_13 : tensor<1x257x3072xf32>
-    %1283 = stablehlo.multiply %cst_14, %1270 : tensor<1x257x3072xf32>
-    %1284 = stablehlo.add %1283, %cst_15 : tensor<1x257x3072xf32>
-    %1285 = stablehlo.multiply %1284, %1270 : tensor<1x257x3072xf32>
-    %1286 = stablehlo.add %1285, %cst_16 : tensor<1x257x3072xf32>
-    %1287 = stablehlo.multiply %1286, %1270 : tensor<1x257x3072xf32>
-    %1288 = stablehlo.add %1287, %cst_17 : tensor<1x257x3072xf32>
-    %1289 = stablehlo.multiply %1288, %1270 : tensor<1x257x3072xf32>
-    %1290 = stablehlo.add %1289, %cst_18 : tensor<1x257x3072xf32>
-    %1291 = stablehlo.multiply %1269, %1282 : tensor<1x257x3072xf32>
-    %1292 = stablehlo.divide %1291, %1290 : tensor<1x257x3072xf32>
-    %1293 = stablehlo.clamp %cst_19, %1292, %cst_20 : tensor<1x257x3072xf32>
-    %1294 = stablehlo.convert %1293 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %1295 = stablehlo.add %1294, %cst_2 : tensor<1x257x3072xbf16>
-    %1296 = stablehlo.multiply %1295, %1266 : tensor<1x257x3072xbf16>
-    %1297 = stablehlo.reshape %1296 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %1298 = stablehlo.convert %1297 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %1299 = stablehlo.dot_general %1298, %arg128, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %1300 = stablehlo.broadcast_in_dim %1299, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1301 = stablehlo.multiply %1300, %111 : tensor<257x768xf32>
-    %1302 = stablehlo.broadcast_in_dim %1301, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1303 = stablehlo.broadcast_in_dim %arg129, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1304 = stablehlo.add %1302, %1303 : tensor<257x768xf32>
-    %1305 = stablehlo.convert %1304 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1306 = stablehlo.reshape %1305 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1307 = stablehlo.add %1218, %1306 : tensor<1x257x768xbf16>
-    %1308 = stablehlo.convert %1307 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1309 = stablehlo.convert %1308 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1310 = stablehlo.reduce(%1309 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1311 = stablehlo.reshape %1310 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1312 = stablehlo.broadcast_in_dim %1311, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1313 = stablehlo.divide %1312, %16 : tensor<1x257x1xf64>
-    %1314 = stablehlo.broadcast_in_dim %1309, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1315 = stablehlo.broadcast_in_dim %1313, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1316 = stablehlo.subtract %1314, %1315 : tensor<1x257x768xf64>
-    %1317 = stablehlo.multiply %1316, %1316 : tensor<1x257x768xf64>
-    %1318 = stablehlo.reduce(%1317 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1319 = stablehlo.reshape %1318 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1320 = stablehlo.broadcast_in_dim %1319, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1321 = stablehlo.divide %1320, %16 : tensor<1x257x1xf64>
-    %1322 = stablehlo.convert %1321 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1323 = stablehlo.reduce(%1308 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1324 = stablehlo.reshape %1323 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1325 = stablehlo.broadcast_in_dim %1324, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1326 = stablehlo.divide %1325, %32 : tensor<1x257x1xf32>
-    %1327 = stablehlo.broadcast_in_dim %1322, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1328 = stablehlo.add %1327, %37 : tensor<1x257x1xf32>
-    %1329 = stablehlo.rsqrt %1328 : tensor<1x257x1xf32>
-    %1330 = stablehlo.broadcast_in_dim %1308, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1331 = stablehlo.broadcast_in_dim %1326, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1332 = stablehlo.subtract %1330, %1331 : tensor<1x257x768xf32>
-    %1333 = stablehlo.broadcast_in_dim %1332, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1334 = stablehlo.broadcast_in_dim %1329, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1335 = stablehlo.multiply %1333, %1334 : tensor<1x257x768xf32>
-    %1336 = stablehlo.convert %arg32 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1337 = stablehlo.broadcast_in_dim %1335, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1338 = stablehlo.broadcast_in_dim %1336, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1339 = stablehlo.multiply %1337, %1338 : tensor<1x257x768xf32>
-    %1340 = stablehlo.convert %arg33 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1341 = stablehlo.broadcast_in_dim %1339, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1342 = stablehlo.broadcast_in_dim %1340, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1343 = stablehlo.add %1341, %1342 : tensor<1x257x768xf32>
-    %1344 = stablehlo.convert %1343 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1345 = stablehlo.reshape %1344 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1346 = stablehlo.convert %1345 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1347 = stablehlo.dot_general %1346, %arg130, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %1348 = stablehlo.broadcast_in_dim %1347, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1349 = stablehlo.multiply %1348, %61 : tensor<257x2304xf32>
-    %1350 = stablehlo.broadcast_in_dim %1349, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1351 = stablehlo.broadcast_in_dim %arg131, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %1352 = stablehlo.add %1350, %1351 : tensor<257x2304xf32>
-    %1353 = stablehlo.convert %1352 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %1354 = stablehlo.reshape %1353 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %1355 = stablehlo.reshape %1354 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %1356 = stablehlo.transpose %1355, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %1357 = stablehlo.slice %1356 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1358 = stablehlo.reshape %1357 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1359 = stablehlo.slice %1356 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1360 = stablehlo.reshape %1359 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1361 = stablehlo.slice %1356 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1362 = stablehlo.reshape %1361 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1363 = stablehlo.transpose %1360, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %1364 = stablehlo.reshape %1358 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1365 = stablehlo.reshape %1363 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1366 = stablehlo.broadcast_in_dim %1365, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1367 = stablehlo.dot_general %1364, %1366, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %1368 = stablehlo.reshape %1367 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1369 = stablehlo.broadcast_in_dim %1368, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1370 = stablehlo.multiply %1369, %85 : tensor<1x12x257x257xbf16>
-    %1371 = stablehlo.convert %1370 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %1372 = stablehlo.reduce(%1371 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1373 = stablehlo.reshape %1372 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1374 = stablehlo.broadcast_in_dim %1371, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1375 = stablehlo.broadcast_in_dim %1373, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1376 = stablehlo.subtract %1374, %1375 : tensor<1x12x257x257xf32>
-    %1377 = stablehlo.exponential %1376 : tensor<1x12x257x257xf32>
-    %1378 = stablehlo.reduce(%1377 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1379 = stablehlo.reshape %1378 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1380 = stablehlo.broadcast_in_dim %1377, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1381 = stablehlo.broadcast_in_dim %1379, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1382 = stablehlo.divide %1380, %1381 : tensor<1x12x257x257xf32>
-    %1383 = stablehlo.convert %1382 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %1384 = stablehlo.reshape %1383 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %1385 = stablehlo.reshape %1362 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1386 = stablehlo.broadcast_in_dim %1385, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1387 = stablehlo.dot_general %1384, %1386, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1388 = stablehlo.reshape %1387 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1389 = stablehlo.transpose %1388, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %1390 = stablehlo.reshape %1389 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %1391 = stablehlo.reshape %1390 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1392 = stablehlo.convert %1391 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1393 = stablehlo.dot_general %1392, %arg132, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %1394 = stablehlo.broadcast_in_dim %1393, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1395 = stablehlo.multiply %1394, %111 : tensor<257x768xf32>
-    %1396 = stablehlo.broadcast_in_dim %1395, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1397 = stablehlo.broadcast_in_dim %arg133, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1398 = stablehlo.add %1396, %1397 : tensor<257x768xf32>
-    %1399 = stablehlo.convert %1398 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1400 = stablehlo.reshape %1399 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1401 = stablehlo.add %1400, %1307 : tensor<1x257x768xbf16>
-    %1402 = stablehlo.convert %1401 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1403 = stablehlo.convert %1402 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1404 = stablehlo.reduce(%1403 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1405 = stablehlo.reshape %1404 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1406 = stablehlo.broadcast_in_dim %1405, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1407 = stablehlo.divide %1406, %16 : tensor<1x257x1xf64>
-    %1408 = stablehlo.broadcast_in_dim %1403, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1409 = stablehlo.broadcast_in_dim %1407, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1410 = stablehlo.subtract %1408, %1409 : tensor<1x257x768xf64>
-    %1411 = stablehlo.multiply %1410, %1410 : tensor<1x257x768xf64>
-    %1412 = stablehlo.reduce(%1411 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1413 = stablehlo.reshape %1412 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1414 = stablehlo.broadcast_in_dim %1413, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1415 = stablehlo.divide %1414, %16 : tensor<1x257x1xf64>
-    %1416 = stablehlo.convert %1415 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1417 = stablehlo.reduce(%1402 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1418 = stablehlo.reshape %1417 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1419 = stablehlo.broadcast_in_dim %1418, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1420 = stablehlo.divide %1419, %32 : tensor<1x257x1xf32>
-    %1421 = stablehlo.broadcast_in_dim %1416, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1422 = stablehlo.add %1421, %37 : tensor<1x257x1xf32>
-    %1423 = stablehlo.rsqrt %1422 : tensor<1x257x1xf32>
-    %1424 = stablehlo.broadcast_in_dim %1402, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1425 = stablehlo.broadcast_in_dim %1420, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1426 = stablehlo.subtract %1424, %1425 : tensor<1x257x768xf32>
-    %1427 = stablehlo.broadcast_in_dim %1426, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1428 = stablehlo.broadcast_in_dim %1423, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1429 = stablehlo.multiply %1427, %1428 : tensor<1x257x768xf32>
-    %1430 = stablehlo.convert %arg34 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1431 = stablehlo.broadcast_in_dim %1429, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1432 = stablehlo.broadcast_in_dim %1430, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1433 = stablehlo.multiply %1431, %1432 : tensor<1x257x768xf32>
-    %1434 = stablehlo.convert %arg35 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1435 = stablehlo.broadcast_in_dim %1433, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1436 = stablehlo.broadcast_in_dim %1434, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1437 = stablehlo.add %1435, %1436 : tensor<1x257x768xf32>
-    %1438 = stablehlo.convert %1437 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1439 = stablehlo.reshape %1438 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1440 = stablehlo.convert %1439 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1441 = stablehlo.dot_general %1440, %arg134, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %1442 = stablehlo.broadcast_in_dim %1441, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1443 = stablehlo.multiply %1442, %160 : tensor<257x3072xf32>
-    %1444 = stablehlo.broadcast_in_dim %1443, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1445 = stablehlo.broadcast_in_dim %arg135, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %1446 = stablehlo.add %1444, %1445 : tensor<257x3072xf32>
-    %1447 = stablehlo.convert %1446 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %1448 = stablehlo.reshape %1447 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %1449 = stablehlo.multiply %1448, %cst_4 : tensor<1x257x3072xbf16>
-    %1450 = stablehlo.multiply %1448, %168 : tensor<1x257x3072xbf16>
-    %1451 = stablehlo.convert %1450 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %1452 = stablehlo.clamp %cst_5, %1451, %cst_6 : tensor<1x257x3072xf32>
-    %1453 = stablehlo.multiply %1452, %1452 : tensor<1x257x3072xf32>
-    %1454 = stablehlo.multiply %cst_7, %1453 : tensor<1x257x3072xf32>
-    %1455 = stablehlo.add %1454, %cst_8 : tensor<1x257x3072xf32>
-    %1456 = stablehlo.multiply %1455, %1453 : tensor<1x257x3072xf32>
-    %1457 = stablehlo.add %1456, %cst_9 : tensor<1x257x3072xf32>
-    %1458 = stablehlo.multiply %1457, %1453 : tensor<1x257x3072xf32>
-    %1459 = stablehlo.add %1458, %cst_10 : tensor<1x257x3072xf32>
-    %1460 = stablehlo.multiply %1459, %1453 : tensor<1x257x3072xf32>
-    %1461 = stablehlo.add %1460, %cst_11 : tensor<1x257x3072xf32>
-    %1462 = stablehlo.multiply %1461, %1453 : tensor<1x257x3072xf32>
-    %1463 = stablehlo.add %1462, %cst_12 : tensor<1x257x3072xf32>
-    %1464 = stablehlo.multiply %1463, %1453 : tensor<1x257x3072xf32>
-    %1465 = stablehlo.add %1464, %cst_13 : tensor<1x257x3072xf32>
-    %1466 = stablehlo.multiply %cst_14, %1453 : tensor<1x257x3072xf32>
-    %1467 = stablehlo.add %1466, %cst_15 : tensor<1x257x3072xf32>
-    %1468 = stablehlo.multiply %1467, %1453 : tensor<1x257x3072xf32>
-    %1469 = stablehlo.add %1468, %cst_16 : tensor<1x257x3072xf32>
-    %1470 = stablehlo.multiply %1469, %1453 : tensor<1x257x3072xf32>
-    %1471 = stablehlo.add %1470, %cst_17 : tensor<1x257x3072xf32>
-    %1472 = stablehlo.multiply %1471, %1453 : tensor<1x257x3072xf32>
-    %1473 = stablehlo.add %1472, %cst_18 : tensor<1x257x3072xf32>
-    %1474 = stablehlo.multiply %1452, %1465 : tensor<1x257x3072xf32>
-    %1475 = stablehlo.divide %1474, %1473 : tensor<1x257x3072xf32>
-    %1476 = stablehlo.clamp %cst_19, %1475, %cst_20 : tensor<1x257x3072xf32>
-    %1477 = stablehlo.convert %1476 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %1478 = stablehlo.add %1477, %cst_2 : tensor<1x257x3072xbf16>
-    %1479 = stablehlo.multiply %1478, %1449 : tensor<1x257x3072xbf16>
-    %1480 = stablehlo.reshape %1479 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %1481 = stablehlo.convert %1480 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %1482 = stablehlo.dot_general %1481, %arg136, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %1483 = stablehlo.broadcast_in_dim %1482, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1484 = stablehlo.multiply %1483, %111 : tensor<257x768xf32>
-    %1485 = stablehlo.broadcast_in_dim %1484, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1486 = stablehlo.broadcast_in_dim %arg137, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1487 = stablehlo.add %1485, %1486 : tensor<257x768xf32>
-    %1488 = stablehlo.convert %1487 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1489 = stablehlo.reshape %1488 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1490 = stablehlo.add %1401, %1489 : tensor<1x257x768xbf16>
-    %1491 = stablehlo.convert %1490 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1492 = stablehlo.convert %1491 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1493 = stablehlo.reduce(%1492 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1494 = stablehlo.reshape %1493 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1495 = stablehlo.broadcast_in_dim %1494, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1496 = stablehlo.divide %1495, %16 : tensor<1x257x1xf64>
-    %1497 = stablehlo.broadcast_in_dim %1492, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1498 = stablehlo.broadcast_in_dim %1496, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1499 = stablehlo.subtract %1497, %1498 : tensor<1x257x768xf64>
-    %1500 = stablehlo.multiply %1499, %1499 : tensor<1x257x768xf64>
-    %1501 = stablehlo.reduce(%1500 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1502 = stablehlo.reshape %1501 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1503 = stablehlo.broadcast_in_dim %1502, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1504 = stablehlo.divide %1503, %16 : tensor<1x257x1xf64>
-    %1505 = stablehlo.convert %1504 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1506 = stablehlo.reduce(%1491 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1507 = stablehlo.reshape %1506 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1508 = stablehlo.broadcast_in_dim %1507, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1509 = stablehlo.divide %1508, %32 : tensor<1x257x1xf32>
-    %1510 = stablehlo.broadcast_in_dim %1505, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1511 = stablehlo.add %1510, %37 : tensor<1x257x1xf32>
-    %1512 = stablehlo.rsqrt %1511 : tensor<1x257x1xf32>
-    %1513 = stablehlo.broadcast_in_dim %1491, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1514 = stablehlo.broadcast_in_dim %1509, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1515 = stablehlo.subtract %1513, %1514 : tensor<1x257x768xf32>
-    %1516 = stablehlo.broadcast_in_dim %1515, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1517 = stablehlo.broadcast_in_dim %1512, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1518 = stablehlo.multiply %1516, %1517 : tensor<1x257x768xf32>
-    %1519 = stablehlo.convert %arg36 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1520 = stablehlo.broadcast_in_dim %1518, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1521 = stablehlo.broadcast_in_dim %1519, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1522 = stablehlo.multiply %1520, %1521 : tensor<1x257x768xf32>
-    %1523 = stablehlo.convert %arg37 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1524 = stablehlo.broadcast_in_dim %1522, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1525 = stablehlo.broadcast_in_dim %1523, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1526 = stablehlo.add %1524, %1525 : tensor<1x257x768xf32>
-    %1527 = stablehlo.convert %1526 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1528 = stablehlo.reshape %1527 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1529 = stablehlo.convert %1528 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1530 = stablehlo.dot_general %1529, %arg138, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %1531 = stablehlo.broadcast_in_dim %1530, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1532 = stablehlo.multiply %1531, %61 : tensor<257x2304xf32>
-    %1533 = stablehlo.broadcast_in_dim %1532, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1534 = stablehlo.broadcast_in_dim %arg139, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %1535 = stablehlo.add %1533, %1534 : tensor<257x2304xf32>
-    %1536 = stablehlo.convert %1535 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %1537 = stablehlo.reshape %1536 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %1538 = stablehlo.reshape %1537 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %1539 = stablehlo.transpose %1538, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %1540 = stablehlo.slice %1539 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1541 = stablehlo.reshape %1540 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1542 = stablehlo.slice %1539 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1543 = stablehlo.reshape %1542 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1544 = stablehlo.slice %1539 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1545 = stablehlo.reshape %1544 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1546 = stablehlo.transpose %1543, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %1547 = stablehlo.reshape %1541 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1548 = stablehlo.reshape %1546 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1549 = stablehlo.broadcast_in_dim %1548, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1550 = stablehlo.dot_general %1547, %1549, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %1551 = stablehlo.reshape %1550 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1552 = stablehlo.broadcast_in_dim %1551, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1553 = stablehlo.multiply %1552, %85 : tensor<1x12x257x257xbf16>
-    %1554 = stablehlo.convert %1553 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %1555 = stablehlo.reduce(%1554 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1556 = stablehlo.reshape %1555 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1557 = stablehlo.broadcast_in_dim %1554, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1558 = stablehlo.broadcast_in_dim %1556, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1559 = stablehlo.subtract %1557, %1558 : tensor<1x12x257x257xf32>
-    %1560 = stablehlo.exponential %1559 : tensor<1x12x257x257xf32>
-    %1561 = stablehlo.reduce(%1560 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1562 = stablehlo.reshape %1561 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1563 = stablehlo.broadcast_in_dim %1560, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1564 = stablehlo.broadcast_in_dim %1562, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1565 = stablehlo.divide %1563, %1564 : tensor<1x12x257x257xf32>
-    %1566 = stablehlo.convert %1565 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %1567 = stablehlo.reshape %1566 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %1568 = stablehlo.reshape %1545 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1569 = stablehlo.broadcast_in_dim %1568, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1570 = stablehlo.dot_general %1567, %1569, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1571 = stablehlo.reshape %1570 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1572 = stablehlo.transpose %1571, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %1573 = stablehlo.reshape %1572 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %1574 = stablehlo.reshape %1573 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1575 = stablehlo.convert %1574 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1576 = stablehlo.dot_general %1575, %arg140, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %1577 = stablehlo.broadcast_in_dim %1576, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1578 = stablehlo.multiply %1577, %111 : tensor<257x768xf32>
-    %1579 = stablehlo.broadcast_in_dim %1578, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1580 = stablehlo.broadcast_in_dim %arg141, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1581 = stablehlo.add %1579, %1580 : tensor<257x768xf32>
-    %1582 = stablehlo.convert %1581 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1583 = stablehlo.reshape %1582 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1584 = stablehlo.add %1583, %1490 : tensor<1x257x768xbf16>
-    %1585 = stablehlo.convert %1584 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1586 = stablehlo.convert %1585 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1587 = stablehlo.reduce(%1586 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1588 = stablehlo.reshape %1587 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1589 = stablehlo.broadcast_in_dim %1588, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1590 = stablehlo.divide %1589, %16 : tensor<1x257x1xf64>
-    %1591 = stablehlo.broadcast_in_dim %1586, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1592 = stablehlo.broadcast_in_dim %1590, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1593 = stablehlo.subtract %1591, %1592 : tensor<1x257x768xf64>
-    %1594 = stablehlo.multiply %1593, %1593 : tensor<1x257x768xf64>
-    %1595 = stablehlo.reduce(%1594 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1596 = stablehlo.reshape %1595 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1597 = stablehlo.broadcast_in_dim %1596, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1598 = stablehlo.divide %1597, %16 : tensor<1x257x1xf64>
-    %1599 = stablehlo.convert %1598 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1600 = stablehlo.reduce(%1585 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1601 = stablehlo.reshape %1600 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1602 = stablehlo.broadcast_in_dim %1601, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1603 = stablehlo.divide %1602, %32 : tensor<1x257x1xf32>
-    %1604 = stablehlo.broadcast_in_dim %1599, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1605 = stablehlo.add %1604, %37 : tensor<1x257x1xf32>
-    %1606 = stablehlo.rsqrt %1605 : tensor<1x257x1xf32>
-    %1607 = stablehlo.broadcast_in_dim %1585, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1608 = stablehlo.broadcast_in_dim %1603, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1609 = stablehlo.subtract %1607, %1608 : tensor<1x257x768xf32>
-    %1610 = stablehlo.broadcast_in_dim %1609, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1611 = stablehlo.broadcast_in_dim %1606, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1612 = stablehlo.multiply %1610, %1611 : tensor<1x257x768xf32>
-    %1613 = stablehlo.convert %arg38 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1614 = stablehlo.broadcast_in_dim %1612, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1615 = stablehlo.broadcast_in_dim %1613, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1616 = stablehlo.multiply %1614, %1615 : tensor<1x257x768xf32>
-    %1617 = stablehlo.convert %arg39 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1618 = stablehlo.broadcast_in_dim %1616, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1619 = stablehlo.broadcast_in_dim %1617, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1620 = stablehlo.add %1618, %1619 : tensor<1x257x768xf32>
-    %1621 = stablehlo.convert %1620 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1622 = stablehlo.reshape %1621 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1623 = stablehlo.convert %1622 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1624 = stablehlo.dot_general %1623, %arg142, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %1625 = stablehlo.broadcast_in_dim %1624, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1626 = stablehlo.multiply %1625, %160 : tensor<257x3072xf32>
-    %1627 = stablehlo.broadcast_in_dim %1626, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1628 = stablehlo.broadcast_in_dim %arg143, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %1629 = stablehlo.add %1627, %1628 : tensor<257x3072xf32>
-    %1630 = stablehlo.convert %1629 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %1631 = stablehlo.reshape %1630 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %1632 = stablehlo.multiply %1631, %cst_4 : tensor<1x257x3072xbf16>
-    %1633 = stablehlo.multiply %1631, %168 : tensor<1x257x3072xbf16>
-    %1634 = stablehlo.convert %1633 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %1635 = stablehlo.clamp %cst_5, %1634, %cst_6 : tensor<1x257x3072xf32>
-    %1636 = stablehlo.multiply %1635, %1635 : tensor<1x257x3072xf32>
-    %1637 = stablehlo.multiply %cst_7, %1636 : tensor<1x257x3072xf32>
-    %1638 = stablehlo.add %1637, %cst_8 : tensor<1x257x3072xf32>
-    %1639 = stablehlo.multiply %1638, %1636 : tensor<1x257x3072xf32>
-    %1640 = stablehlo.add %1639, %cst_9 : tensor<1x257x3072xf32>
-    %1641 = stablehlo.multiply %1640, %1636 : tensor<1x257x3072xf32>
-    %1642 = stablehlo.add %1641, %cst_10 : tensor<1x257x3072xf32>
-    %1643 = stablehlo.multiply %1642, %1636 : tensor<1x257x3072xf32>
-    %1644 = stablehlo.add %1643, %cst_11 : tensor<1x257x3072xf32>
-    %1645 = stablehlo.multiply %1644, %1636 : tensor<1x257x3072xf32>
-    %1646 = stablehlo.add %1645, %cst_12 : tensor<1x257x3072xf32>
-    %1647 = stablehlo.multiply %1646, %1636 : tensor<1x257x3072xf32>
-    %1648 = stablehlo.add %1647, %cst_13 : tensor<1x257x3072xf32>
-    %1649 = stablehlo.multiply %cst_14, %1636 : tensor<1x257x3072xf32>
-    %1650 = stablehlo.add %1649, %cst_15 : tensor<1x257x3072xf32>
-    %1651 = stablehlo.multiply %1650, %1636 : tensor<1x257x3072xf32>
-    %1652 = stablehlo.add %1651, %cst_16 : tensor<1x257x3072xf32>
-    %1653 = stablehlo.multiply %1652, %1636 : tensor<1x257x3072xf32>
-    %1654 = stablehlo.add %1653, %cst_17 : tensor<1x257x3072xf32>
-    %1655 = stablehlo.multiply %1654, %1636 : tensor<1x257x3072xf32>
-    %1656 = stablehlo.add %1655, %cst_18 : tensor<1x257x3072xf32>
-    %1657 = stablehlo.multiply %1635, %1648 : tensor<1x257x3072xf32>
-    %1658 = stablehlo.divide %1657, %1656 : tensor<1x257x3072xf32>
-    %1659 = stablehlo.clamp %cst_19, %1658, %cst_20 : tensor<1x257x3072xf32>
-    %1660 = stablehlo.convert %1659 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %1661 = stablehlo.add %1660, %cst_2 : tensor<1x257x3072xbf16>
-    %1662 = stablehlo.multiply %1661, %1632 : tensor<1x257x3072xbf16>
-    %1663 = stablehlo.reshape %1662 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %1664 = stablehlo.convert %1663 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %1665 = stablehlo.dot_general %1664, %arg144, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %1666 = stablehlo.broadcast_in_dim %1665, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1667 = stablehlo.multiply %1666, %111 : tensor<257x768xf32>
-    %1668 = stablehlo.broadcast_in_dim %1667, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1669 = stablehlo.broadcast_in_dim %arg145, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1670 = stablehlo.add %1668, %1669 : tensor<257x768xf32>
-    %1671 = stablehlo.convert %1670 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1672 = stablehlo.reshape %1671 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1673 = stablehlo.add %1584, %1672 : tensor<1x257x768xbf16>
-    %1674 = stablehlo.convert %1673 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1675 = stablehlo.convert %1674 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1676 = stablehlo.reduce(%1675 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1677 = stablehlo.reshape %1676 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1678 = stablehlo.broadcast_in_dim %1677, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1679 = stablehlo.divide %1678, %16 : tensor<1x257x1xf64>
-    %1680 = stablehlo.broadcast_in_dim %1675, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1681 = stablehlo.broadcast_in_dim %1679, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1682 = stablehlo.subtract %1680, %1681 : tensor<1x257x768xf64>
-    %1683 = stablehlo.multiply %1682, %1682 : tensor<1x257x768xf64>
-    %1684 = stablehlo.reduce(%1683 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1685 = stablehlo.reshape %1684 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1686 = stablehlo.broadcast_in_dim %1685, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1687 = stablehlo.divide %1686, %16 : tensor<1x257x1xf64>
-    %1688 = stablehlo.convert %1687 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1689 = stablehlo.reduce(%1674 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1690 = stablehlo.reshape %1689 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1691 = stablehlo.broadcast_in_dim %1690, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1692 = stablehlo.divide %1691, %32 : tensor<1x257x1xf32>
-    %1693 = stablehlo.broadcast_in_dim %1688, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1694 = stablehlo.add %1693, %37 : tensor<1x257x1xf32>
-    %1695 = stablehlo.rsqrt %1694 : tensor<1x257x1xf32>
-    %1696 = stablehlo.broadcast_in_dim %1674, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1697 = stablehlo.broadcast_in_dim %1692, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1698 = stablehlo.subtract %1696, %1697 : tensor<1x257x768xf32>
-    %1699 = stablehlo.broadcast_in_dim %1698, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1700 = stablehlo.broadcast_in_dim %1695, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1701 = stablehlo.multiply %1699, %1700 : tensor<1x257x768xf32>
-    %1702 = stablehlo.convert %arg40 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1703 = stablehlo.broadcast_in_dim %1701, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1704 = stablehlo.broadcast_in_dim %1702, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1705 = stablehlo.multiply %1703, %1704 : tensor<1x257x768xf32>
-    %1706 = stablehlo.convert %arg41 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1707 = stablehlo.broadcast_in_dim %1705, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1708 = stablehlo.broadcast_in_dim %1706, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1709 = stablehlo.add %1707, %1708 : tensor<1x257x768xf32>
-    %1710 = stablehlo.convert %1709 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1711 = stablehlo.reshape %1710 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1712 = stablehlo.convert %1711 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1713 = stablehlo.dot_general %1712, %arg146, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %1714 = stablehlo.broadcast_in_dim %1713, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1715 = stablehlo.multiply %1714, %61 : tensor<257x2304xf32>
-    %1716 = stablehlo.broadcast_in_dim %1715, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1717 = stablehlo.broadcast_in_dim %arg147, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %1718 = stablehlo.add %1716, %1717 : tensor<257x2304xf32>
-    %1719 = stablehlo.convert %1718 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %1720 = stablehlo.reshape %1719 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %1721 = stablehlo.reshape %1720 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %1722 = stablehlo.transpose %1721, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %1723 = stablehlo.slice %1722 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1724 = stablehlo.reshape %1723 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1725 = stablehlo.slice %1722 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1726 = stablehlo.reshape %1725 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1727 = stablehlo.slice %1722 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1728 = stablehlo.reshape %1727 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1729 = stablehlo.transpose %1726, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %1730 = stablehlo.reshape %1724 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1731 = stablehlo.reshape %1729 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1732 = stablehlo.broadcast_in_dim %1731, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1733 = stablehlo.dot_general %1730, %1732, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %1734 = stablehlo.reshape %1733 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1735 = stablehlo.broadcast_in_dim %1734, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1736 = stablehlo.multiply %1735, %85 : tensor<1x12x257x257xbf16>
-    %1737 = stablehlo.convert %1736 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %1738 = stablehlo.reduce(%1737 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1739 = stablehlo.reshape %1738 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1740 = stablehlo.broadcast_in_dim %1737, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1741 = stablehlo.broadcast_in_dim %1739, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1742 = stablehlo.subtract %1740, %1741 : tensor<1x12x257x257xf32>
-    %1743 = stablehlo.exponential %1742 : tensor<1x12x257x257xf32>
-    %1744 = stablehlo.reduce(%1743 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1745 = stablehlo.reshape %1744 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1746 = stablehlo.broadcast_in_dim %1743, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1747 = stablehlo.broadcast_in_dim %1745, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1748 = stablehlo.divide %1746, %1747 : tensor<1x12x257x257xf32>
-    %1749 = stablehlo.convert %1748 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %1750 = stablehlo.reshape %1749 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %1751 = stablehlo.reshape %1728 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1752 = stablehlo.broadcast_in_dim %1751, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1753 = stablehlo.dot_general %1750, %1752, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1754 = stablehlo.reshape %1753 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1755 = stablehlo.transpose %1754, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %1756 = stablehlo.reshape %1755 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %1757 = stablehlo.reshape %1756 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1758 = stablehlo.convert %1757 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1759 = stablehlo.dot_general %1758, %arg148, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %1760 = stablehlo.broadcast_in_dim %1759, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1761 = stablehlo.multiply %1760, %111 : tensor<257x768xf32>
-    %1762 = stablehlo.broadcast_in_dim %1761, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1763 = stablehlo.broadcast_in_dim %arg149, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1764 = stablehlo.add %1762, %1763 : tensor<257x768xf32>
-    %1765 = stablehlo.convert %1764 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1766 = stablehlo.reshape %1765 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1767 = stablehlo.add %1766, %1673 : tensor<1x257x768xbf16>
-    %1768 = stablehlo.convert %1767 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1769 = stablehlo.convert %1768 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1770 = stablehlo.reduce(%1769 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1771 = stablehlo.reshape %1770 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1772 = stablehlo.broadcast_in_dim %1771, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1773 = stablehlo.divide %1772, %16 : tensor<1x257x1xf64>
-    %1774 = stablehlo.broadcast_in_dim %1769, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1775 = stablehlo.broadcast_in_dim %1773, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1776 = stablehlo.subtract %1774, %1775 : tensor<1x257x768xf64>
-    %1777 = stablehlo.multiply %1776, %1776 : tensor<1x257x768xf64>
-    %1778 = stablehlo.reduce(%1777 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1779 = stablehlo.reshape %1778 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1780 = stablehlo.broadcast_in_dim %1779, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1781 = stablehlo.divide %1780, %16 : tensor<1x257x1xf64>
-    %1782 = stablehlo.convert %1781 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1783 = stablehlo.reduce(%1768 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1784 = stablehlo.reshape %1783 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1785 = stablehlo.broadcast_in_dim %1784, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1786 = stablehlo.divide %1785, %32 : tensor<1x257x1xf32>
-    %1787 = stablehlo.broadcast_in_dim %1782, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1788 = stablehlo.add %1787, %37 : tensor<1x257x1xf32>
-    %1789 = stablehlo.rsqrt %1788 : tensor<1x257x1xf32>
-    %1790 = stablehlo.broadcast_in_dim %1768, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1791 = stablehlo.broadcast_in_dim %1786, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1792 = stablehlo.subtract %1790, %1791 : tensor<1x257x768xf32>
-    %1793 = stablehlo.broadcast_in_dim %1792, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1794 = stablehlo.broadcast_in_dim %1789, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1795 = stablehlo.multiply %1793, %1794 : tensor<1x257x768xf32>
-    %1796 = stablehlo.convert %arg42 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1797 = stablehlo.broadcast_in_dim %1795, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1798 = stablehlo.broadcast_in_dim %1796, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1799 = stablehlo.multiply %1797, %1798 : tensor<1x257x768xf32>
-    %1800 = stablehlo.convert %arg43 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1801 = stablehlo.broadcast_in_dim %1799, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1802 = stablehlo.broadcast_in_dim %1800, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1803 = stablehlo.add %1801, %1802 : tensor<1x257x768xf32>
-    %1804 = stablehlo.convert %1803 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1805 = stablehlo.reshape %1804 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1806 = stablehlo.convert %1805 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1807 = stablehlo.dot_general %1806, %arg150, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %1808 = stablehlo.broadcast_in_dim %1807, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1809 = stablehlo.multiply %1808, %160 : tensor<257x3072xf32>
-    %1810 = stablehlo.broadcast_in_dim %1809, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1811 = stablehlo.broadcast_in_dim %arg151, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %1812 = stablehlo.add %1810, %1811 : tensor<257x3072xf32>
-    %1813 = stablehlo.convert %1812 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %1814 = stablehlo.reshape %1813 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %1815 = stablehlo.multiply %1814, %cst_4 : tensor<1x257x3072xbf16>
-    %1816 = stablehlo.multiply %1814, %168 : tensor<1x257x3072xbf16>
-    %1817 = stablehlo.convert %1816 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %1818 = stablehlo.clamp %cst_5, %1817, %cst_6 : tensor<1x257x3072xf32>
-    %1819 = stablehlo.multiply %1818, %1818 : tensor<1x257x3072xf32>
-    %1820 = stablehlo.multiply %cst_7, %1819 : tensor<1x257x3072xf32>
-    %1821 = stablehlo.add %1820, %cst_8 : tensor<1x257x3072xf32>
-    %1822 = stablehlo.multiply %1821, %1819 : tensor<1x257x3072xf32>
-    %1823 = stablehlo.add %1822, %cst_9 : tensor<1x257x3072xf32>
-    %1824 = stablehlo.multiply %1823, %1819 : tensor<1x257x3072xf32>
-    %1825 = stablehlo.add %1824, %cst_10 : tensor<1x257x3072xf32>
-    %1826 = stablehlo.multiply %1825, %1819 : tensor<1x257x3072xf32>
-    %1827 = stablehlo.add %1826, %cst_11 : tensor<1x257x3072xf32>
-    %1828 = stablehlo.multiply %1827, %1819 : tensor<1x257x3072xf32>
-    %1829 = stablehlo.add %1828, %cst_12 : tensor<1x257x3072xf32>
-    %1830 = stablehlo.multiply %1829, %1819 : tensor<1x257x3072xf32>
-    %1831 = stablehlo.add %1830, %cst_13 : tensor<1x257x3072xf32>
-    %1832 = stablehlo.multiply %cst_14, %1819 : tensor<1x257x3072xf32>
-    %1833 = stablehlo.add %1832, %cst_15 : tensor<1x257x3072xf32>
-    %1834 = stablehlo.multiply %1833, %1819 : tensor<1x257x3072xf32>
-    %1835 = stablehlo.add %1834, %cst_16 : tensor<1x257x3072xf32>
-    %1836 = stablehlo.multiply %1835, %1819 : tensor<1x257x3072xf32>
-    %1837 = stablehlo.add %1836, %cst_17 : tensor<1x257x3072xf32>
-    %1838 = stablehlo.multiply %1837, %1819 : tensor<1x257x3072xf32>
-    %1839 = stablehlo.add %1838, %cst_18 : tensor<1x257x3072xf32>
-    %1840 = stablehlo.multiply %1818, %1831 : tensor<1x257x3072xf32>
-    %1841 = stablehlo.divide %1840, %1839 : tensor<1x257x3072xf32>
-    %1842 = stablehlo.clamp %cst_19, %1841, %cst_20 : tensor<1x257x3072xf32>
-    %1843 = stablehlo.convert %1842 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %1844 = stablehlo.add %1843, %cst_2 : tensor<1x257x3072xbf16>
-    %1845 = stablehlo.multiply %1844, %1815 : tensor<1x257x3072xbf16>
-    %1846 = stablehlo.reshape %1845 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %1847 = stablehlo.convert %1846 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %1848 = stablehlo.dot_general %1847, %arg152, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %1849 = stablehlo.broadcast_in_dim %1848, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1850 = stablehlo.multiply %1849, %111 : tensor<257x768xf32>
-    %1851 = stablehlo.broadcast_in_dim %1850, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1852 = stablehlo.broadcast_in_dim %arg153, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1853 = stablehlo.add %1851, %1852 : tensor<257x768xf32>
-    %1854 = stablehlo.convert %1853 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1855 = stablehlo.reshape %1854 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1856 = stablehlo.add %1767, %1855 : tensor<1x257x768xbf16>
-    %1857 = stablehlo.convert %1856 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1858 = stablehlo.convert %1857 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1859 = stablehlo.reduce(%1858 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1860 = stablehlo.reshape %1859 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1861 = stablehlo.broadcast_in_dim %1860, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1862 = stablehlo.divide %1861, %16 : tensor<1x257x1xf64>
-    %1863 = stablehlo.broadcast_in_dim %1858, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1864 = stablehlo.broadcast_in_dim %1862, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1865 = stablehlo.subtract %1863, %1864 : tensor<1x257x768xf64>
-    %1866 = stablehlo.multiply %1865, %1865 : tensor<1x257x768xf64>
-    %1867 = stablehlo.reduce(%1866 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1868 = stablehlo.reshape %1867 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1869 = stablehlo.broadcast_in_dim %1868, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1870 = stablehlo.divide %1869, %16 : tensor<1x257x1xf64>
-    %1871 = stablehlo.convert %1870 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1872 = stablehlo.reduce(%1857 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1873 = stablehlo.reshape %1872 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1874 = stablehlo.broadcast_in_dim %1873, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1875 = stablehlo.divide %1874, %32 : tensor<1x257x1xf32>
-    %1876 = stablehlo.broadcast_in_dim %1871, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1877 = stablehlo.add %1876, %37 : tensor<1x257x1xf32>
-    %1878 = stablehlo.rsqrt %1877 : tensor<1x257x1xf32>
-    %1879 = stablehlo.broadcast_in_dim %1857, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1880 = stablehlo.broadcast_in_dim %1875, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1881 = stablehlo.subtract %1879, %1880 : tensor<1x257x768xf32>
-    %1882 = stablehlo.broadcast_in_dim %1881, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1883 = stablehlo.broadcast_in_dim %1878, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1884 = stablehlo.multiply %1882, %1883 : tensor<1x257x768xf32>
-    %1885 = stablehlo.convert %arg44 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1886 = stablehlo.broadcast_in_dim %1884, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1887 = stablehlo.broadcast_in_dim %1885, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1888 = stablehlo.multiply %1886, %1887 : tensor<1x257x768xf32>
-    %1889 = stablehlo.convert %arg45 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1890 = stablehlo.broadcast_in_dim %1888, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1891 = stablehlo.broadcast_in_dim %1889, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1892 = stablehlo.add %1890, %1891 : tensor<1x257x768xf32>
-    %1893 = stablehlo.convert %1892 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1894 = stablehlo.reshape %1893 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1895 = stablehlo.convert %1894 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1896 = stablehlo.dot_general %1895, %arg154, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %1897 = stablehlo.broadcast_in_dim %1896, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1898 = stablehlo.multiply %1897, %61 : tensor<257x2304xf32>
-    %1899 = stablehlo.broadcast_in_dim %1898, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %1900 = stablehlo.broadcast_in_dim %arg155, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %1901 = stablehlo.add %1899, %1900 : tensor<257x2304xf32>
-    %1902 = stablehlo.convert %1901 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %1903 = stablehlo.reshape %1902 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %1904 = stablehlo.reshape %1903 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %1905 = stablehlo.transpose %1904, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %1906 = stablehlo.slice %1905 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1907 = stablehlo.reshape %1906 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1908 = stablehlo.slice %1905 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1909 = stablehlo.reshape %1908 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1910 = stablehlo.slice %1905 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %1911 = stablehlo.reshape %1910 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1912 = stablehlo.transpose %1909, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %1913 = stablehlo.reshape %1907 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1914 = stablehlo.reshape %1912 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1915 = stablehlo.broadcast_in_dim %1914, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %1916 = stablehlo.dot_general %1913, %1915, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %1917 = stablehlo.reshape %1916 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1918 = stablehlo.broadcast_in_dim %1917, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %1919 = stablehlo.multiply %1918, %85 : tensor<1x12x257x257xbf16>
-    %1920 = stablehlo.convert %1919 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %1921 = stablehlo.reduce(%1920 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1922 = stablehlo.reshape %1921 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1923 = stablehlo.broadcast_in_dim %1920, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1924 = stablehlo.broadcast_in_dim %1922, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1925 = stablehlo.subtract %1923, %1924 : tensor<1x12x257x257xf32>
-    %1926 = stablehlo.exponential %1925 : tensor<1x12x257x257xf32>
-    %1927 = stablehlo.reduce(%1926 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %1928 = stablehlo.reshape %1927 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %1929 = stablehlo.broadcast_in_dim %1926, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %1930 = stablehlo.broadcast_in_dim %1928, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %1931 = stablehlo.divide %1929, %1930 : tensor<1x12x257x257xf32>
-    %1932 = stablehlo.convert %1931 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %1933 = stablehlo.reshape %1932 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %1934 = stablehlo.reshape %1911 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1935 = stablehlo.broadcast_in_dim %1934, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1936 = stablehlo.dot_general %1933, %1935, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %1937 = stablehlo.reshape %1936 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %1938 = stablehlo.transpose %1937, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %1939 = stablehlo.reshape %1938 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %1940 = stablehlo.reshape %1939 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1941 = stablehlo.convert %1940 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1942 = stablehlo.dot_general %1941, %arg156, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %1943 = stablehlo.broadcast_in_dim %1942, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1944 = stablehlo.multiply %1943, %111 : tensor<257x768xf32>
-    %1945 = stablehlo.broadcast_in_dim %1944, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %1946 = stablehlo.broadcast_in_dim %arg157, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %1947 = stablehlo.add %1945, %1946 : tensor<257x768xf32>
-    %1948 = stablehlo.convert %1947 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %1949 = stablehlo.reshape %1948 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %1950 = stablehlo.add %1949, %1856 : tensor<1x257x768xbf16>
-    %1951 = stablehlo.convert %1950 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %1952 = stablehlo.convert %1951 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %1953 = stablehlo.reduce(%1952 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1954 = stablehlo.reshape %1953 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1955 = stablehlo.broadcast_in_dim %1954, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1956 = stablehlo.divide %1955, %16 : tensor<1x257x1xf64>
-    %1957 = stablehlo.broadcast_in_dim %1952, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %1958 = stablehlo.broadcast_in_dim %1956, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %1959 = stablehlo.subtract %1957, %1958 : tensor<1x257x768xf64>
-    %1960 = stablehlo.multiply %1959, %1959 : tensor<1x257x768xf64>
-    %1961 = stablehlo.reduce(%1960 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %1962 = stablehlo.reshape %1961 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %1963 = stablehlo.broadcast_in_dim %1962, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %1964 = stablehlo.divide %1963, %16 : tensor<1x257x1xf64>
-    %1965 = stablehlo.convert %1964 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %1966 = stablehlo.reduce(%1951 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %1967 = stablehlo.reshape %1966 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %1968 = stablehlo.broadcast_in_dim %1967, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1969 = stablehlo.divide %1968, %32 : tensor<1x257x1xf32>
-    %1970 = stablehlo.broadcast_in_dim %1965, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %1971 = stablehlo.add %1970, %37 : tensor<1x257x1xf32>
-    %1972 = stablehlo.rsqrt %1971 : tensor<1x257x1xf32>
-    %1973 = stablehlo.broadcast_in_dim %1951, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1974 = stablehlo.broadcast_in_dim %1969, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1975 = stablehlo.subtract %1973, %1974 : tensor<1x257x768xf32>
-    %1976 = stablehlo.broadcast_in_dim %1975, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1977 = stablehlo.broadcast_in_dim %1972, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %1978 = stablehlo.multiply %1976, %1977 : tensor<1x257x768xf32>
-    %1979 = stablehlo.convert %arg46 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1980 = stablehlo.broadcast_in_dim %1978, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1981 = stablehlo.broadcast_in_dim %1979, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1982 = stablehlo.multiply %1980, %1981 : tensor<1x257x768xf32>
-    %1983 = stablehlo.convert %arg47 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1984 = stablehlo.broadcast_in_dim %1982, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %1985 = stablehlo.broadcast_in_dim %1983, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %1986 = stablehlo.add %1984, %1985 : tensor<1x257x768xf32>
-    %1987 = stablehlo.convert %1986 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %1988 = stablehlo.reshape %1987 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %1989 = stablehlo.convert %1988 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %1990 = stablehlo.dot_general %1989, %arg158, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %1991 = stablehlo.broadcast_in_dim %1990, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1992 = stablehlo.multiply %1991, %160 : tensor<257x3072xf32>
-    %1993 = stablehlo.broadcast_in_dim %1992, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %1994 = stablehlo.broadcast_in_dim %arg159, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %1995 = stablehlo.add %1993, %1994 : tensor<257x3072xf32>
-    %1996 = stablehlo.convert %1995 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %1997 = stablehlo.reshape %1996 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %1998 = stablehlo.multiply %1997, %cst_4 : tensor<1x257x3072xbf16>
-    %1999 = stablehlo.multiply %1997, %168 : tensor<1x257x3072xbf16>
-    %2000 = stablehlo.convert %1999 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %2001 = stablehlo.clamp %cst_5, %2000, %cst_6 : tensor<1x257x3072xf32>
-    %2002 = stablehlo.multiply %2001, %2001 : tensor<1x257x3072xf32>
-    %2003 = stablehlo.multiply %cst_7, %2002 : tensor<1x257x3072xf32>
-    %2004 = stablehlo.add %2003, %cst_8 : tensor<1x257x3072xf32>
-    %2005 = stablehlo.multiply %2004, %2002 : tensor<1x257x3072xf32>
-    %2006 = stablehlo.add %2005, %cst_9 : tensor<1x257x3072xf32>
-    %2007 = stablehlo.multiply %2006, %2002 : tensor<1x257x3072xf32>
-    %2008 = stablehlo.add %2007, %cst_10 : tensor<1x257x3072xf32>
-    %2009 = stablehlo.multiply %2008, %2002 : tensor<1x257x3072xf32>
-    %2010 = stablehlo.add %2009, %cst_11 : tensor<1x257x3072xf32>
-    %2011 = stablehlo.multiply %2010, %2002 : tensor<1x257x3072xf32>
-    %2012 = stablehlo.add %2011, %cst_12 : tensor<1x257x3072xf32>
-    %2013 = stablehlo.multiply %2012, %2002 : tensor<1x257x3072xf32>
-    %2014 = stablehlo.add %2013, %cst_13 : tensor<1x257x3072xf32>
-    %2015 = stablehlo.multiply %cst_14, %2002 : tensor<1x257x3072xf32>
-    %2016 = stablehlo.add %2015, %cst_15 : tensor<1x257x3072xf32>
-    %2017 = stablehlo.multiply %2016, %2002 : tensor<1x257x3072xf32>
-    %2018 = stablehlo.add %2017, %cst_16 : tensor<1x257x3072xf32>
-    %2019 = stablehlo.multiply %2018, %2002 : tensor<1x257x3072xf32>
-    %2020 = stablehlo.add %2019, %cst_17 : tensor<1x257x3072xf32>
-    %2021 = stablehlo.multiply %2020, %2002 : tensor<1x257x3072xf32>
-    %2022 = stablehlo.add %2021, %cst_18 : tensor<1x257x3072xf32>
-    %2023 = stablehlo.multiply %2001, %2014 : tensor<1x257x3072xf32>
-    %2024 = stablehlo.divide %2023, %2022 : tensor<1x257x3072xf32>
-    %2025 = stablehlo.clamp %cst_19, %2024, %cst_20 : tensor<1x257x3072xf32>
-    %2026 = stablehlo.convert %2025 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %2027 = stablehlo.add %2026, %cst_2 : tensor<1x257x3072xbf16>
-    %2028 = stablehlo.multiply %2027, %1998 : tensor<1x257x3072xbf16>
-    %2029 = stablehlo.reshape %2028 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %2030 = stablehlo.convert %2029 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %2031 = stablehlo.dot_general %2030, %arg160, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %2032 = stablehlo.broadcast_in_dim %2031, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %2033 = stablehlo.multiply %2032, %111 : tensor<257x768xf32>
-    %2034 = stablehlo.broadcast_in_dim %2033, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %2035 = stablehlo.broadcast_in_dim %arg161, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %2036 = stablehlo.add %2034, %2035 : tensor<257x768xf32>
-    %2037 = stablehlo.convert %2036 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %2038 = stablehlo.reshape %2037 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %2039 = stablehlo.add %1950, %2038 : tensor<1x257x768xbf16>
-    %2040 = stablehlo.convert %2039 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %2041 = stablehlo.convert %2040 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %2042 = stablehlo.reduce(%2041 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %2043 = stablehlo.reshape %2042 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %2044 = stablehlo.broadcast_in_dim %2043, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %2045 = stablehlo.divide %2044, %16 : tensor<1x257x1xf64>
-    %2046 = stablehlo.broadcast_in_dim %2041, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %2047 = stablehlo.broadcast_in_dim %2045, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %2048 = stablehlo.subtract %2046, %2047 : tensor<1x257x768xf64>
-    %2049 = stablehlo.multiply %2048, %2048 : tensor<1x257x768xf64>
-    %2050 = stablehlo.reduce(%2049 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %2051 = stablehlo.reshape %2050 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %2052 = stablehlo.broadcast_in_dim %2051, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %2053 = stablehlo.divide %2052, %16 : tensor<1x257x1xf64>
-    %2054 = stablehlo.convert %2053 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %2055 = stablehlo.reduce(%2040 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %2056 = stablehlo.reshape %2055 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %2057 = stablehlo.broadcast_in_dim %2056, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %2058 = stablehlo.divide %2057, %32 : tensor<1x257x1xf32>
-    %2059 = stablehlo.broadcast_in_dim %2054, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %2060 = stablehlo.add %2059, %37 : tensor<1x257x1xf32>
-    %2061 = stablehlo.rsqrt %2060 : tensor<1x257x1xf32>
-    %2062 = stablehlo.broadcast_in_dim %2040, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2063 = stablehlo.broadcast_in_dim %2058, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %2064 = stablehlo.subtract %2062, %2063 : tensor<1x257x768xf32>
-    %2065 = stablehlo.broadcast_in_dim %2064, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2066 = stablehlo.broadcast_in_dim %2061, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %2067 = stablehlo.multiply %2065, %2066 : tensor<1x257x768xf32>
-    %2068 = stablehlo.convert %arg48 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2069 = stablehlo.broadcast_in_dim %2067, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2070 = stablehlo.broadcast_in_dim %2068, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2071 = stablehlo.multiply %2069, %2070 : tensor<1x257x768xf32>
-    %2072 = stablehlo.convert %arg49 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2073 = stablehlo.broadcast_in_dim %2071, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2074 = stablehlo.broadcast_in_dim %2072, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2075 = stablehlo.add %2073, %2074 : tensor<1x257x768xf32>
-    %2076 = stablehlo.convert %2075 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %2077 = stablehlo.reshape %2076 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %2078 = stablehlo.convert %2077 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %2079 = stablehlo.dot_general %2078, %arg162, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x2304xf32>) -> tensor<257x2304xf32>
-    %2080 = stablehlo.broadcast_in_dim %2079, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %2081 = stablehlo.multiply %2080, %61 : tensor<257x2304xf32>
-    %2082 = stablehlo.broadcast_in_dim %2081, dims = [0, 1] : (tensor<257x2304xf32>) -> tensor<257x2304xf32>
-    %2083 = stablehlo.broadcast_in_dim %arg163, dims = [1] : (tensor<2304xf32>) -> tensor<257x2304xf32>
-    %2084 = stablehlo.add %2082, %2083 : tensor<257x2304xf32>
-    %2085 = stablehlo.convert %2084 : (tensor<257x2304xf32>) -> tensor<257x2304xbf16>
-    %2086 = stablehlo.reshape %2085 : (tensor<257x2304xbf16>) -> tensor<1x257x2304xbf16>
-    %2087 = stablehlo.reshape %2086 : (tensor<1x257x2304xbf16>) -> tensor<1x257x3x12x64xbf16>
-    %2088 = stablehlo.transpose %2087, dims = [2, 0, 3, 1, 4] : (tensor<1x257x3x12x64xbf16>) -> tensor<3x1x12x257x64xbf16>
-    %2089 = stablehlo.slice %2088 [0:1, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %2090 = stablehlo.reshape %2089 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %2091 = stablehlo.slice %2088 [1:2, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %2092 = stablehlo.reshape %2091 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %2093 = stablehlo.slice %2088 [2:3, 0:1, 0:12, 0:257, 0:64] : (tensor<3x1x12x257x64xbf16>) -> tensor<1x1x12x257x64xbf16>
-    %2094 = stablehlo.reshape %2093 : (tensor<1x1x12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %2095 = stablehlo.transpose %2092, dims = [0, 1, 3, 2] : (tensor<1x12x257x64xbf16>) -> tensor<1x12x64x257xbf16>
-    %2096 = stablehlo.reshape %2090 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %2097 = stablehlo.reshape %2095 : (tensor<1x12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %2098 = stablehlo.broadcast_in_dim %2097, dims = [0, 1, 2] : (tensor<12x64x257xbf16>) -> tensor<12x64x257xbf16>
-    %2099 = stablehlo.dot_general %2096, %2098, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x64xbf16>, tensor<12x64x257xbf16>) -> tensor<12x257x257xbf16>
-    %2100 = stablehlo.reshape %2099 : (tensor<12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %2101 = stablehlo.broadcast_in_dim %2100, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xbf16>
-    %2102 = stablehlo.multiply %2101, %85 : tensor<1x12x257x257xbf16>
-    %2103 = stablehlo.convert %2102 : (tensor<1x12x257x257xbf16>) -> tensor<1x12x257x257xf32>
-    %2104 = stablehlo.reduce(%2103 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %2105 = stablehlo.reshape %2104 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %2106 = stablehlo.broadcast_in_dim %2103, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %2107 = stablehlo.broadcast_in_dim %2105, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %2108 = stablehlo.subtract %2106, %2107 : tensor<1x12x257x257xf32>
-    %2109 = stablehlo.exponential %2108 : tensor<1x12x257x257xf32>
-    %2110 = stablehlo.reduce(%2109 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x257x257xf32>, tensor<f32>) -> tensor<1x12x257xf32>
-    %2111 = stablehlo.reshape %2110 : (tensor<1x12x257xf32>) -> tensor<1x12x257x1xf32>
-    %2112 = stablehlo.broadcast_in_dim %2109, dims = [0, 1, 2, 3] : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xf32>
-    %2113 = stablehlo.broadcast_in_dim %2111, dims = [0, 1, 2, 3] : (tensor<1x12x257x1xf32>) -> tensor<1x12x257x257xf32>
-    %2114 = stablehlo.divide %2112, %2113 : tensor<1x12x257x257xf32>
-    %2115 = stablehlo.convert %2114 : (tensor<1x12x257x257xf32>) -> tensor<1x12x257x257xbf16>
-    %2116 = stablehlo.reshape %2115 : (tensor<1x12x257x257xbf16>) -> tensor<12x257x257xbf16>
-    %2117 = stablehlo.reshape %2094 : (tensor<1x12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %2118 = stablehlo.broadcast_in_dim %2117, dims = [0, 1, 2] : (tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %2119 = stablehlo.dot_general %2116, %2118, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x257x257xbf16>, tensor<12x257x64xbf16>) -> tensor<12x257x64xbf16>
-    %2120 = stablehlo.reshape %2119 : (tensor<12x257x64xbf16>) -> tensor<1x12x257x64xbf16>
-    %2121 = stablehlo.transpose %2120, dims = [0, 2, 1, 3] : (tensor<1x12x257x64xbf16>) -> tensor<1x257x12x64xbf16>
-    %2122 = stablehlo.reshape %2121 : (tensor<1x257x12x64xbf16>) -> tensor<1x257x768xbf16>
-    %2123 = stablehlo.reshape %2122 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %2124 = stablehlo.convert %2123 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %2125 = stablehlo.dot_general %2124, %arg164, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x768xf32>) -> tensor<257x768xf32>
-    %2126 = stablehlo.broadcast_in_dim %2125, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %2127 = stablehlo.multiply %2126, %111 : tensor<257x768xf32>
-    %2128 = stablehlo.broadcast_in_dim %2127, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %2129 = stablehlo.broadcast_in_dim %arg165, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %2130 = stablehlo.add %2128, %2129 : tensor<257x768xf32>
-    %2131 = stablehlo.convert %2130 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %2132 = stablehlo.reshape %2131 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %2133 = stablehlo.add %2132, %2039 : tensor<1x257x768xbf16>
-    %2134 = stablehlo.convert %2133 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %2135 = stablehlo.convert %2134 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %2136 = stablehlo.reduce(%2135 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %2137 = stablehlo.reshape %2136 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %2138 = stablehlo.broadcast_in_dim %2137, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %2139 = stablehlo.divide %2138, %16 : tensor<1x257x1xf64>
-    %2140 = stablehlo.broadcast_in_dim %2135, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %2141 = stablehlo.broadcast_in_dim %2139, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %2142 = stablehlo.subtract %2140, %2141 : tensor<1x257x768xf64>
-    %2143 = stablehlo.multiply %2142, %2142 : tensor<1x257x768xf64>
-    %2144 = stablehlo.reduce(%2143 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %2145 = stablehlo.reshape %2144 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %2146 = stablehlo.broadcast_in_dim %2145, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %2147 = stablehlo.divide %2146, %16 : tensor<1x257x1xf64>
-    %2148 = stablehlo.convert %2147 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %2149 = stablehlo.reduce(%2134 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %2150 = stablehlo.reshape %2149 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %2151 = stablehlo.broadcast_in_dim %2150, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %2152 = stablehlo.divide %2151, %32 : tensor<1x257x1xf32>
-    %2153 = stablehlo.broadcast_in_dim %2148, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %2154 = stablehlo.add %2153, %37 : tensor<1x257x1xf32>
-    %2155 = stablehlo.rsqrt %2154 : tensor<1x257x1xf32>
-    %2156 = stablehlo.broadcast_in_dim %2134, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2157 = stablehlo.broadcast_in_dim %2152, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %2158 = stablehlo.subtract %2156, %2157 : tensor<1x257x768xf32>
-    %2159 = stablehlo.broadcast_in_dim %2158, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2160 = stablehlo.broadcast_in_dim %2155, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %2161 = stablehlo.multiply %2159, %2160 : tensor<1x257x768xf32>
-    %2162 = stablehlo.convert %arg50 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2163 = stablehlo.broadcast_in_dim %2161, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2164 = stablehlo.broadcast_in_dim %2162, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2165 = stablehlo.multiply %2163, %2164 : tensor<1x257x768xf32>
-    %2166 = stablehlo.convert %arg51 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2167 = stablehlo.broadcast_in_dim %2165, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2168 = stablehlo.broadcast_in_dim %2166, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2169 = stablehlo.add %2167, %2168 : tensor<1x257x768xf32>
-    %2170 = stablehlo.convert %2169 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %2171 = stablehlo.reshape %2170 : (tensor<1x257x768xbf16>) -> tensor<257x768xbf16>
-    %2172 = stablehlo.convert %2171 : (tensor<257x768xbf16>) -> tensor<257x768xf32>
-    %2173 = stablehlo.dot_general %2172, %arg166, contracting_dims = [1] x [0] : (tensor<257x768xf32>, tensor<768x3072xf32>) -> tensor<257x3072xf32>
-    %2174 = stablehlo.broadcast_in_dim %2173, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %2175 = stablehlo.multiply %2174, %160 : tensor<257x3072xf32>
-    %2176 = stablehlo.broadcast_in_dim %2175, dims = [0, 1] : (tensor<257x3072xf32>) -> tensor<257x3072xf32>
-    %2177 = stablehlo.broadcast_in_dim %arg167, dims = [1] : (tensor<3072xf32>) -> tensor<257x3072xf32>
-    %2178 = stablehlo.add %2176, %2177 : tensor<257x3072xf32>
-    %2179 = stablehlo.convert %2178 : (tensor<257x3072xf32>) -> tensor<257x3072xbf16>
-    %2180 = stablehlo.reshape %2179 : (tensor<257x3072xbf16>) -> tensor<1x257x3072xbf16>
-    %2181 = stablehlo.multiply %2180, %cst_4 : tensor<1x257x3072xbf16>
-    %2182 = stablehlo.multiply %2180, %168 : tensor<1x257x3072xbf16>
-    %2183 = stablehlo.convert %2182 : (tensor<1x257x3072xbf16>) -> tensor<1x257x3072xf32>
-    %2184 = stablehlo.clamp %cst_5, %2183, %cst_6 : tensor<1x257x3072xf32>
-    %2185 = stablehlo.multiply %2184, %2184 : tensor<1x257x3072xf32>
-    %2186 = stablehlo.multiply %cst_7, %2185 : tensor<1x257x3072xf32>
-    %2187 = stablehlo.add %2186, %cst_8 : tensor<1x257x3072xf32>
-    %2188 = stablehlo.multiply %2187, %2185 : tensor<1x257x3072xf32>
-    %2189 = stablehlo.add %2188, %cst_9 : tensor<1x257x3072xf32>
-    %2190 = stablehlo.multiply %2189, %2185 : tensor<1x257x3072xf32>
-    %2191 = stablehlo.add %2190, %cst_10 : tensor<1x257x3072xf32>
-    %2192 = stablehlo.multiply %2191, %2185 : tensor<1x257x3072xf32>
-    %2193 = stablehlo.add %2192, %cst_11 : tensor<1x257x3072xf32>
-    %2194 = stablehlo.multiply %2193, %2185 : tensor<1x257x3072xf32>
-    %2195 = stablehlo.add %2194, %cst_12 : tensor<1x257x3072xf32>
-    %2196 = stablehlo.multiply %2195, %2185 : tensor<1x257x3072xf32>
-    %2197 = stablehlo.add %2196, %cst_13 : tensor<1x257x3072xf32>
-    %2198 = stablehlo.multiply %cst_14, %2185 : tensor<1x257x3072xf32>
-    %2199 = stablehlo.add %2198, %cst_15 : tensor<1x257x3072xf32>
-    %2200 = stablehlo.multiply %2199, %2185 : tensor<1x257x3072xf32>
-    %2201 = stablehlo.add %2200, %cst_16 : tensor<1x257x3072xf32>
-    %2202 = stablehlo.multiply %2201, %2185 : tensor<1x257x3072xf32>
-    %2203 = stablehlo.add %2202, %cst_17 : tensor<1x257x3072xf32>
-    %2204 = stablehlo.multiply %2203, %2185 : tensor<1x257x3072xf32>
-    %2205 = stablehlo.add %2204, %cst_18 : tensor<1x257x3072xf32>
-    %2206 = stablehlo.multiply %2184, %2197 : tensor<1x257x3072xf32>
-    %2207 = stablehlo.divide %2206, %2205 : tensor<1x257x3072xf32>
-    %2208 = stablehlo.clamp %cst_19, %2207, %cst_20 : tensor<1x257x3072xf32>
-    %2209 = stablehlo.convert %2208 : (tensor<1x257x3072xf32>) -> tensor<1x257x3072xbf16>
-    %2210 = stablehlo.add %2209, %cst_2 : tensor<1x257x3072xbf16>
-    %2211 = stablehlo.multiply %2210, %2181 : tensor<1x257x3072xbf16>
-    %2212 = stablehlo.reshape %2211 : (tensor<1x257x3072xbf16>) -> tensor<257x3072xbf16>
-    %2213 = stablehlo.convert %2212 : (tensor<257x3072xbf16>) -> tensor<257x3072xf32>
-    %2214 = stablehlo.dot_general %2213, %arg168, contracting_dims = [1] x [0] : (tensor<257x3072xf32>, tensor<3072x768xf32>) -> tensor<257x768xf32>
-    %2215 = stablehlo.broadcast_in_dim %2214, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %2216 = stablehlo.multiply %2215, %111 : tensor<257x768xf32>
-    %2217 = stablehlo.broadcast_in_dim %2216, dims = [0, 1] : (tensor<257x768xf32>) -> tensor<257x768xf32>
-    %2218 = stablehlo.broadcast_in_dim %arg169, dims = [1] : (tensor<768xf32>) -> tensor<257x768xf32>
-    %2219 = stablehlo.add %2217, %2218 : tensor<257x768xf32>
-    %2220 = stablehlo.convert %2219 : (tensor<257x768xf32>) -> tensor<257x768xbf16>
-    %2221 = stablehlo.reshape %2220 : (tensor<257x768xbf16>) -> tensor<1x257x768xbf16>
-    %2222 = stablehlo.add %2133, %2221 : tensor<1x257x768xbf16>
-    %2223 = stablehlo.convert %2222 : (tensor<1x257x768xbf16>) -> tensor<1x257x768xf32>
-    %2224 = stablehlo.convert %2223 : (tensor<1x257x768xf32>) -> tensor<1x257x768xf64>
-    %2225 = stablehlo.reduce(%2224 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %2226 = stablehlo.reshape %2225 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %2227 = stablehlo.broadcast_in_dim %2226, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %2228 = stablehlo.divide %2227, %16 : tensor<1x257x1xf64>
-    %2229 = stablehlo.broadcast_in_dim %2224, dims = [0, 1, 2] : (tensor<1x257x768xf64>) -> tensor<1x257x768xf64>
-    %2230 = stablehlo.broadcast_in_dim %2228, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x768xf64>
-    %2231 = stablehlo.subtract %2229, %2230 : tensor<1x257x768xf64>
-    %2232 = stablehlo.multiply %2231, %2231 : tensor<1x257x768xf64>
-    %2233 = stablehlo.reduce(%2232 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf64>, tensor<f64>) -> tensor<1x257xf64>
-    %2234 = stablehlo.reshape %2233 : (tensor<1x257xf64>) -> tensor<1x257x1xf64>
-    %2235 = stablehlo.broadcast_in_dim %2234, dims = [0, 1, 2] : (tensor<1x257x1xf64>) -> tensor<1x257x1xf64>
-    %2236 = stablehlo.divide %2235, %16 : tensor<1x257x1xf64>
-    %2237 = stablehlo.convert %2236 : (tensor<1x257x1xf64>) -> tensor<1x257x1xf32>
-    %2238 = stablehlo.reduce(%2223 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x257x768xf32>, tensor<f32>) -> tensor<1x257xf32>
-    %2239 = stablehlo.reshape %2238 : (tensor<1x257xf32>) -> tensor<1x257x1xf32>
-    %2240 = stablehlo.broadcast_in_dim %2239, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %2241 = stablehlo.divide %2240, %32 : tensor<1x257x1xf32>
-    %2242 = stablehlo.broadcast_in_dim %2237, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x1xf32>
-    %2243 = stablehlo.add %2242, %37 : tensor<1x257x1xf32>
-    %2244 = stablehlo.rsqrt %2243 : tensor<1x257x1xf32>
-    %2245 = stablehlo.broadcast_in_dim %2223, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2246 = stablehlo.broadcast_in_dim %2241, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %2247 = stablehlo.subtract %2245, %2246 : tensor<1x257x768xf32>
-    %2248 = stablehlo.broadcast_in_dim %2247, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2249 = stablehlo.broadcast_in_dim %2244, dims = [0, 1, 2] : (tensor<1x257x1xf32>) -> tensor<1x257x768xf32>
-    %2250 = stablehlo.multiply %2248, %2249 : tensor<1x257x768xf32>
-    %2251 = stablehlo.convert %arg52 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2252 = stablehlo.broadcast_in_dim %2250, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2253 = stablehlo.broadcast_in_dim %2251, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2254 = stablehlo.multiply %2252, %2253 : tensor<1x257x768xf32>
-    %2255 = stablehlo.convert %arg53 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2256 = stablehlo.broadcast_in_dim %2254, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2257 = stablehlo.broadcast_in_dim %2255, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2258 = stablehlo.add %2256, %2257 : tensor<1x257x768xf32>
-    %2259 = stablehlo.convert %2258 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %2260 = stablehlo.transpose %2259, dims = [0, 2, 1] : (tensor<1x257x768xbf16>) -> tensor<1x768x257xbf16>
-    %2261 = stablehlo.reshape %2260 : (tensor<1x768x257xbf16>) -> tensor<1x768x257x1xbf16>
-    %2262 = stablehlo.convolution(%2261, %arg54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 8 : i64} : (tensor<1x768x257x1xbf16>, tensor<768x96x1x1xbf16>) -> tensor<1x768x257x1xbf16>
-    %2263 = stablehlo.convolution(%2262, %arg55) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x768x257x1xbf16>, tensor<27x768x1x1xbf16>) -> tensor<1x27x257x1xbf16>
-    %2264 = stablehlo.reshape %2263 : (tensor<1x27x257x1xbf16>) -> tensor<1x27x257xbf16>
-    %2265 = stablehlo.convert %2264 : (tensor<1x27x257xbf16>) -> tensor<1x27x257xf32>
-    %2266 = stablehlo.reduce(%2265 init: %cst_1) applies stablehlo.maximum across dimensions = [2] : (tensor<1x27x257xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2267 = stablehlo.reshape %2266 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2268 = stablehlo.broadcast_in_dim %2265, dims = [0, 1, 2] : (tensor<1x27x257xf32>) -> tensor<1x27x257xf32>
-    %2269 = stablehlo.broadcast_in_dim %2267, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x257xf32>
-    %2270 = stablehlo.subtract %2268, %2269 : tensor<1x27x257xf32>
-    %2271 = stablehlo.exponential %2270 : tensor<1x27x257xf32>
-    %2272 = stablehlo.reduce(%2271 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x27x257xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2273 = stablehlo.reshape %2272 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2274 = stablehlo.broadcast_in_dim %2271, dims = [0, 1, 2] : (tensor<1x27x257xf32>) -> tensor<1x27x257xf32>
-    %2275 = stablehlo.broadcast_in_dim %2273, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x257xf32>
-    %2276 = stablehlo.divide %2274, %2275 : tensor<1x27x257xf32>
-    %2277 = stablehlo.convert %2276 : (tensor<1x27x257xf32>) -> tensor<1x27x257xbf16>
-    %2278 = stablehlo.convolution(%2261, %arg56) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 8 : i64} : (tensor<1x768x257x1xbf16>, tensor<768x96x1x1xbf16>) -> tensor<1x768x257x1xbf16>
-    %2279 = stablehlo.reshape %2278 : (tensor<1x768x257x1xbf16>) -> tensor<1x768x257xbf16>
-    %2280 = stablehlo.transpose %2279, dims = [0, 2, 1] : (tensor<1x768x257xbf16>) -> tensor<1x257x768xbf16>
-    %2281 = stablehlo.reshape %2277 : (tensor<1x27x257xbf16>) -> tensor<1x27x257x1xbf16>
-    %2282 = stablehlo.transpose %2281, dims = [0, 1, 3, 2] : (tensor<1x27x257x1xbf16>) -> tensor<1x27x1x257xbf16>
-    %2283 = stablehlo.reshape %2280 : (tensor<1x257x768xbf16>) -> tensor<1x257x768x1xbf16>
-    %2284 = stablehlo.transpose %2283, dims = [0, 3, 2, 1] : (tensor<1x257x768x1xbf16>) -> tensor<1x1x768x257xbf16>
-    %2285 = stablehlo.transpose %2282, dims = [1, 3, 0, 2] : (tensor<1x27x1x257xbf16>) -> tensor<27x257x1x1xbf16>
-    %2286 = stablehlo.reshape %2285 : (tensor<27x257x1x1xbf16>) -> tensor<1x27x257xbf16>
-    %2287 = stablehlo.transpose %2284, dims = [3, 0, 2, 1] : (tensor<1x1x768x257xbf16>) -> tensor<257x1x768x1xbf16>
-    %2288 = stablehlo.reshape %2287 : (tensor<257x1x768x1xbf16>) -> tensor<1x257x768xbf16>
-    %2289 = stablehlo.broadcast_in_dim %2288, dims = [0, 1, 2] : (tensor<1x257x768xbf16>) -> tensor<1x257x768xbf16>
-    %2290 = stablehlo.dot_general %2286, %2289, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x27x257xbf16>, tensor<1x257x768xbf16>) -> tensor<1x27x768xbf16>
-    %2291 = stablehlo.reshape %2290 : (tensor<1x27x768xbf16>) -> tensor<27x1x1x768xbf16>
-    %2292 = stablehlo.transpose %2291, dims = [2, 0, 3, 1] : (tensor<27x1x1x768xbf16>) -> tensor<1x27x768x1xbf16>
-    %2293 = stablehlo.reshape %2292 : (tensor<1x27x768x1xbf16>) -> tensor<1x27x768xbf16>
-    %2294 = stablehlo.convert %2293 : (tensor<1x27x768xbf16>) -> tensor<1x27x768xf32>
-    %2295 = stablehlo.convert %2294 : (tensor<1x27x768xf32>) -> tensor<1x27x768xf64>
-    %2296 = stablehlo.reduce(%2295 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf64>, tensor<f64>) -> tensor<1x27xf64>
-    %2297 = stablehlo.reshape %2296 : (tensor<1x27xf64>) -> tensor<1x27x1xf64>
-    %2298 = stablehlo.broadcast_in_dim %2297, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x1xf64>
-    %2299 = stablehlo.broadcast_in_dim %14, dims = [] : (tensor<f64>) -> tensor<1x27x1xf64>
-    %2300 = stablehlo.divide %2298, %2299 : tensor<1x27x1xf64>
-    %2301 = stablehlo.broadcast_in_dim %2295, dims = [0, 1, 2] : (tensor<1x27x768xf64>) -> tensor<1x27x768xf64>
-    %2302 = stablehlo.broadcast_in_dim %2300, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x768xf64>
-    %2303 = stablehlo.subtract %2301, %2302 : tensor<1x27x768xf64>
-    %2304 = stablehlo.multiply %2303, %2303 : tensor<1x27x768xf64>
-    %2305 = stablehlo.reduce(%2304 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf64>, tensor<f64>) -> tensor<1x27xf64>
-    %2306 = stablehlo.reshape %2305 : (tensor<1x27xf64>) -> tensor<1x27x1xf64>
-    %2307 = stablehlo.broadcast_in_dim %2306, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x1xf64>
-    %2308 = stablehlo.divide %2307, %2299 : tensor<1x27x1xf64>
-    %2309 = stablehlo.convert %2308 : (tensor<1x27x1xf64>) -> tensor<1x27x1xf32>
-    %2310 = stablehlo.reduce(%2294 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2311 = stablehlo.reshape %2310 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2312 = stablehlo.broadcast_in_dim %2311, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x1xf32>
-    %2313 = stablehlo.broadcast_in_dim %30, dims = [] : (tensor<f32>) -> tensor<1x27x1xf32>
-    %2314 = stablehlo.divide %2312, %2313 : tensor<1x27x1xf32>
-    %2315 = stablehlo.broadcast_in_dim %2309, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x1xf32>
-    %2316 = stablehlo.broadcast_in_dim %35, dims = [] : (tensor<f32>) -> tensor<1x27x1xf32>
-    %2317 = stablehlo.add %2315, %2316 : tensor<1x27x1xf32>
-    %2318 = stablehlo.rsqrt %2317 : tensor<1x27x1xf32>
-    %2319 = stablehlo.broadcast_in_dim %2294, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2320 = stablehlo.broadcast_in_dim %2314, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x768xf32>
-    %2321 = stablehlo.subtract %2319, %2320 : tensor<1x27x768xf32>
-    %2322 = stablehlo.broadcast_in_dim %2321, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2323 = stablehlo.broadcast_in_dim %2318, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x768xf32>
-    %2324 = stablehlo.multiply %2322, %2323 : tensor<1x27x768xf32>
-    %2325 = stablehlo.convert %arg57 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2326 = stablehlo.broadcast_in_dim %2324, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2327 = stablehlo.broadcast_in_dim %2325, dims = [2] : (tensor<768xf32>) -> tensor<1x27x768xf32>
-    %2328 = stablehlo.multiply %2326, %2327 : tensor<1x27x768xf32>
-    %2329 = stablehlo.convert %arg58 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2330 = stablehlo.broadcast_in_dim %2328, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2331 = stablehlo.broadcast_in_dim %2329, dims = [2] : (tensor<768xf32>) -> tensor<1x27x768xf32>
-    %2332 = stablehlo.add %2330, %2331 : tensor<1x27x768xf32>
-    %2333 = stablehlo.convert %2332 : (tensor<1x27x768xf32>) -> tensor<1x27x768xbf16>
-    %2334 = stablehlo.convert %arg59 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2335 = stablehlo.broadcast_in_dim %2334, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2336 = stablehlo.multiply %2252, %2335 : tensor<1x257x768xf32>
-    %2337 = stablehlo.convert %arg60 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2338 = stablehlo.broadcast_in_dim %2336, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2339 = stablehlo.broadcast_in_dim %2337, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2340 = stablehlo.add %2338, %2339 : tensor<1x257x768xf32>
-    %2341 = stablehlo.convert %2340 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %2342 = stablehlo.transpose %2341, dims = [0, 2, 1] : (tensor<1x257x768xbf16>) -> tensor<1x768x257xbf16>
-    %2343 = stablehlo.reshape %2342 : (tensor<1x768x257xbf16>) -> tensor<1x768x257x1xbf16>
-    %2344 = stablehlo.convolution(%2343, %arg61) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 8 : i64} : (tensor<1x768x257x1xbf16>, tensor<768x96x1x1xbf16>) -> tensor<1x768x257x1xbf16>
-    %2345 = stablehlo.convolution(%2344, %arg62) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x768x257x1xbf16>, tensor<27x768x1x1xbf16>) -> tensor<1x27x257x1xbf16>
-    %2346 = stablehlo.reshape %2345 : (tensor<1x27x257x1xbf16>) -> tensor<1x27x257xbf16>
-    %2347 = stablehlo.convert %2346 : (tensor<1x27x257xbf16>) -> tensor<1x27x257xf32>
-    %2348 = stablehlo.reduce(%2347 init: %cst_1) applies stablehlo.maximum across dimensions = [2] : (tensor<1x27x257xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2349 = stablehlo.reshape %2348 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2350 = stablehlo.broadcast_in_dim %2347, dims = [0, 1, 2] : (tensor<1x27x257xf32>) -> tensor<1x27x257xf32>
-    %2351 = stablehlo.broadcast_in_dim %2349, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x257xf32>
-    %2352 = stablehlo.subtract %2350, %2351 : tensor<1x27x257xf32>
-    %2353 = stablehlo.exponential %2352 : tensor<1x27x257xf32>
-    %2354 = stablehlo.reduce(%2353 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x27x257xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2355 = stablehlo.reshape %2354 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2356 = stablehlo.broadcast_in_dim %2353, dims = [0, 1, 2] : (tensor<1x27x257xf32>) -> tensor<1x27x257xf32>
-    %2357 = stablehlo.broadcast_in_dim %2355, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x257xf32>
-    %2358 = stablehlo.divide %2356, %2357 : tensor<1x27x257xf32>
-    %2359 = stablehlo.convert %2358 : (tensor<1x27x257xf32>) -> tensor<1x27x257xbf16>
-    %2360 = stablehlo.convolution(%2343, %arg63) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 8 : i64} : (tensor<1x768x257x1xbf16>, tensor<768x96x1x1xbf16>) -> tensor<1x768x257x1xbf16>
-    %2361 = stablehlo.reshape %2360 : (tensor<1x768x257x1xbf16>) -> tensor<1x768x257xbf16>
-    %2362 = stablehlo.transpose %2361, dims = [0, 2, 1] : (tensor<1x768x257xbf16>) -> tensor<1x257x768xbf16>
-    %2363 = stablehlo.reshape %2359 : (tensor<1x27x257xbf16>) -> tensor<1x27x257x1xbf16>
-    %2364 = stablehlo.transpose %2363, dims = [0, 1, 3, 2] : (tensor<1x27x257x1xbf16>) -> tensor<1x27x1x257xbf16>
-    %2365 = stablehlo.reshape %2362 : (tensor<1x257x768xbf16>) -> tensor<1x257x768x1xbf16>
-    %2366 = stablehlo.transpose %2365, dims = [0, 3, 2, 1] : (tensor<1x257x768x1xbf16>) -> tensor<1x1x768x257xbf16>
-    %2367 = stablehlo.transpose %2364, dims = [1, 3, 0, 2] : (tensor<1x27x1x257xbf16>) -> tensor<27x257x1x1xbf16>
-    %2368 = stablehlo.reshape %2367 : (tensor<27x257x1x1xbf16>) -> tensor<1x27x257xbf16>
-    %2369 = stablehlo.transpose %2366, dims = [3, 0, 2, 1] : (tensor<1x1x768x257xbf16>) -> tensor<257x1x768x1xbf16>
-    %2370 = stablehlo.reshape %2369 : (tensor<257x1x768x1xbf16>) -> tensor<1x257x768xbf16>
-    %2371 = stablehlo.broadcast_in_dim %2370, dims = [0, 1, 2] : (tensor<1x257x768xbf16>) -> tensor<1x257x768xbf16>
-    %2372 = stablehlo.dot_general %2368, %2371, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x27x257xbf16>, tensor<1x257x768xbf16>) -> tensor<1x27x768xbf16>
-    %2373 = stablehlo.reshape %2372 : (tensor<1x27x768xbf16>) -> tensor<27x1x1x768xbf16>
-    %2374 = stablehlo.transpose %2373, dims = [2, 0, 3, 1] : (tensor<27x1x1x768xbf16>) -> tensor<1x27x768x1xbf16>
-    %2375 = stablehlo.reshape %2374 : (tensor<1x27x768x1xbf16>) -> tensor<1x27x768xbf16>
-    %2376 = stablehlo.convert %2375 : (tensor<1x27x768xbf16>) -> tensor<1x27x768xf32>
-    %2377 = stablehlo.convert %2376 : (tensor<1x27x768xf32>) -> tensor<1x27x768xf64>
-    %2378 = stablehlo.reduce(%2377 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf64>, tensor<f64>) -> tensor<1x27xf64>
-    %2379 = stablehlo.reshape %2378 : (tensor<1x27xf64>) -> tensor<1x27x1xf64>
-    %2380 = stablehlo.broadcast_in_dim %2379, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x1xf64>
-    %2381 = stablehlo.divide %2380, %2299 : tensor<1x27x1xf64>
-    %2382 = stablehlo.broadcast_in_dim %2377, dims = [0, 1, 2] : (tensor<1x27x768xf64>) -> tensor<1x27x768xf64>
-    %2383 = stablehlo.broadcast_in_dim %2381, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x768xf64>
-    %2384 = stablehlo.subtract %2382, %2383 : tensor<1x27x768xf64>
-    %2385 = stablehlo.multiply %2384, %2384 : tensor<1x27x768xf64>
-    %2386 = stablehlo.reduce(%2385 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf64>, tensor<f64>) -> tensor<1x27xf64>
-    %2387 = stablehlo.reshape %2386 : (tensor<1x27xf64>) -> tensor<1x27x1xf64>
-    %2388 = stablehlo.broadcast_in_dim %2387, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x1xf64>
-    %2389 = stablehlo.divide %2388, %2299 : tensor<1x27x1xf64>
-    %2390 = stablehlo.convert %2389 : (tensor<1x27x1xf64>) -> tensor<1x27x1xf32>
-    %2391 = stablehlo.reduce(%2376 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2392 = stablehlo.reshape %2391 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2393 = stablehlo.broadcast_in_dim %2392, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x1xf32>
-    %2394 = stablehlo.divide %2393, %2313 : tensor<1x27x1xf32>
-    %2395 = stablehlo.broadcast_in_dim %2390, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x1xf32>
-    %2396 = stablehlo.add %2395, %2316 : tensor<1x27x1xf32>
-    %2397 = stablehlo.rsqrt %2396 : tensor<1x27x1xf32>
-    %2398 = stablehlo.broadcast_in_dim %2376, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2399 = stablehlo.broadcast_in_dim %2394, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x768xf32>
-    %2400 = stablehlo.subtract %2398, %2399 : tensor<1x27x768xf32>
-    %2401 = stablehlo.broadcast_in_dim %2400, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2402 = stablehlo.broadcast_in_dim %2397, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x768xf32>
-    %2403 = stablehlo.multiply %2401, %2402 : tensor<1x27x768xf32>
-    %2404 = stablehlo.convert %arg64 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2405 = stablehlo.broadcast_in_dim %2403, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2406 = stablehlo.broadcast_in_dim %2404, dims = [2] : (tensor<768xf32>) -> tensor<1x27x768xf32>
-    %2407 = stablehlo.multiply %2405, %2406 : tensor<1x27x768xf32>
-    %2408 = stablehlo.convert %arg65 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2409 = stablehlo.broadcast_in_dim %2407, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2410 = stablehlo.broadcast_in_dim %2408, dims = [2] : (tensor<768xf32>) -> tensor<1x27x768xf32>
-    %2411 = stablehlo.add %2409, %2410 : tensor<1x27x768xf32>
-    %2412 = stablehlo.convert %2411 : (tensor<1x27x768xf32>) -> tensor<1x27x768xbf16>
-    %2413 = stablehlo.convert %arg66 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2414 = stablehlo.broadcast_in_dim %2413, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2415 = stablehlo.multiply %2252, %2414 : tensor<1x257x768xf32>
-    %2416 = stablehlo.convert %arg67 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2417 = stablehlo.broadcast_in_dim %2415, dims = [0, 1, 2] : (tensor<1x257x768xf32>) -> tensor<1x257x768xf32>
-    %2418 = stablehlo.broadcast_in_dim %2416, dims = [2] : (tensor<768xf32>) -> tensor<1x257x768xf32>
-    %2419 = stablehlo.add %2417, %2418 : tensor<1x257x768xf32>
-    %2420 = stablehlo.convert %2419 : (tensor<1x257x768xf32>) -> tensor<1x257x768xbf16>
-    %2421 = stablehlo.transpose %2420, dims = [0, 2, 1] : (tensor<1x257x768xbf16>) -> tensor<1x768x257xbf16>
-    %2422 = stablehlo.reshape %2421 : (tensor<1x768x257xbf16>) -> tensor<1x768x257x1xbf16>
-    %2423 = stablehlo.convolution(%2422, %arg68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 8 : i64} : (tensor<1x768x257x1xbf16>, tensor<768x96x1x1xbf16>) -> tensor<1x768x257x1xbf16>
-    %2424 = stablehlo.convolution(%2423, %arg69) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x768x257x1xbf16>, tensor<27x768x1x1xbf16>) -> tensor<1x27x257x1xbf16>
-    %2425 = stablehlo.reshape %2424 : (tensor<1x27x257x1xbf16>) -> tensor<1x27x257xbf16>
-    %2426 = stablehlo.convert %2425 : (tensor<1x27x257xbf16>) -> tensor<1x27x257xf32>
-    %2427 = stablehlo.reduce(%2426 init: %cst_1) applies stablehlo.maximum across dimensions = [2] : (tensor<1x27x257xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2428 = stablehlo.reshape %2427 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2429 = stablehlo.broadcast_in_dim %2426, dims = [0, 1, 2] : (tensor<1x27x257xf32>) -> tensor<1x27x257xf32>
-    %2430 = stablehlo.broadcast_in_dim %2428, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x257xf32>
-    %2431 = stablehlo.subtract %2429, %2430 : tensor<1x27x257xf32>
-    %2432 = stablehlo.exponential %2431 : tensor<1x27x257xf32>
-    %2433 = stablehlo.reduce(%2432 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x27x257xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2434 = stablehlo.reshape %2433 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2435 = stablehlo.broadcast_in_dim %2432, dims = [0, 1, 2] : (tensor<1x27x257xf32>) -> tensor<1x27x257xf32>
-    %2436 = stablehlo.broadcast_in_dim %2434, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x257xf32>
-    %2437 = stablehlo.divide %2435, %2436 : tensor<1x27x257xf32>
-    %2438 = stablehlo.convert %2437 : (tensor<1x27x257xf32>) -> tensor<1x27x257xbf16>
-    %2439 = stablehlo.convolution(%2422, %arg70) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 8 : i64} : (tensor<1x768x257x1xbf16>, tensor<768x96x1x1xbf16>) -> tensor<1x768x257x1xbf16>
-    %2440 = stablehlo.reshape %2439 : (tensor<1x768x257x1xbf16>) -> tensor<1x768x257xbf16>
-    %2441 = stablehlo.transpose %2440, dims = [0, 2, 1] : (tensor<1x768x257xbf16>) -> tensor<1x257x768xbf16>
-    %2442 = stablehlo.reshape %2438 : (tensor<1x27x257xbf16>) -> tensor<1x27x257x1xbf16>
-    %2443 = stablehlo.transpose %2442, dims = [0, 1, 3, 2] : (tensor<1x27x257x1xbf16>) -> tensor<1x27x1x257xbf16>
-    %2444 = stablehlo.reshape %2441 : (tensor<1x257x768xbf16>) -> tensor<1x257x768x1xbf16>
-    %2445 = stablehlo.transpose %2444, dims = [0, 3, 2, 1] : (tensor<1x257x768x1xbf16>) -> tensor<1x1x768x257xbf16>
-    %2446 = stablehlo.transpose %2443, dims = [1, 3, 0, 2] : (tensor<1x27x1x257xbf16>) -> tensor<27x257x1x1xbf16>
-    %2447 = stablehlo.reshape %2446 : (tensor<27x257x1x1xbf16>) -> tensor<1x27x257xbf16>
-    %2448 = stablehlo.transpose %2445, dims = [3, 0, 2, 1] : (tensor<1x1x768x257xbf16>) -> tensor<257x1x768x1xbf16>
-    %2449 = stablehlo.reshape %2448 : (tensor<257x1x768x1xbf16>) -> tensor<1x257x768xbf16>
-    %2450 = stablehlo.broadcast_in_dim %2449, dims = [0, 1, 2] : (tensor<1x257x768xbf16>) -> tensor<1x257x768xbf16>
-    %2451 = stablehlo.dot_general %2447, %2450, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x27x257xbf16>, tensor<1x257x768xbf16>) -> tensor<1x27x768xbf16>
-    %2452 = stablehlo.reshape %2451 : (tensor<1x27x768xbf16>) -> tensor<27x1x1x768xbf16>
-    %2453 = stablehlo.transpose %2452, dims = [2, 0, 3, 1] : (tensor<27x1x1x768xbf16>) -> tensor<1x27x768x1xbf16>
-    %2454 = stablehlo.reshape %2453 : (tensor<1x27x768x1xbf16>) -> tensor<1x27x768xbf16>
-    %2455 = stablehlo.convert %2454 : (tensor<1x27x768xbf16>) -> tensor<1x27x768xf32>
-    %2456 = stablehlo.convert %2455 : (tensor<1x27x768xf32>) -> tensor<1x27x768xf64>
-    %2457 = stablehlo.reduce(%2456 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf64>, tensor<f64>) -> tensor<1x27xf64>
-    %2458 = stablehlo.reshape %2457 : (tensor<1x27xf64>) -> tensor<1x27x1xf64>
-    %2459 = stablehlo.broadcast_in_dim %2458, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x1xf64>
-    %2460 = stablehlo.divide %2459, %2299 : tensor<1x27x1xf64>
-    %2461 = stablehlo.broadcast_in_dim %2456, dims = [0, 1, 2] : (tensor<1x27x768xf64>) -> tensor<1x27x768xf64>
-    %2462 = stablehlo.broadcast_in_dim %2460, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x768xf64>
-    %2463 = stablehlo.subtract %2461, %2462 : tensor<1x27x768xf64>
-    %2464 = stablehlo.multiply %2463, %2463 : tensor<1x27x768xf64>
-    %2465 = stablehlo.reduce(%2464 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf64>, tensor<f64>) -> tensor<1x27xf64>
-    %2466 = stablehlo.reshape %2465 : (tensor<1x27xf64>) -> tensor<1x27x1xf64>
-    %2467 = stablehlo.broadcast_in_dim %2466, dims = [0, 1, 2] : (tensor<1x27x1xf64>) -> tensor<1x27x1xf64>
-    %2468 = stablehlo.divide %2467, %2299 : tensor<1x27x1xf64>
-    %2469 = stablehlo.convert %2468 : (tensor<1x27x1xf64>) -> tensor<1x27x1xf32>
-    %2470 = stablehlo.reduce(%2455 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x27x768xf32>, tensor<f32>) -> tensor<1x27xf32>
-    %2471 = stablehlo.reshape %2470 : (tensor<1x27xf32>) -> tensor<1x27x1xf32>
-    %2472 = stablehlo.broadcast_in_dim %2471, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x1xf32>
-    %2473 = stablehlo.divide %2472, %2313 : tensor<1x27x1xf32>
-    %2474 = stablehlo.broadcast_in_dim %2469, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x1xf32>
-    %2475 = stablehlo.add %2474, %2316 : tensor<1x27x1xf32>
-    %2476 = stablehlo.rsqrt %2475 : tensor<1x27x1xf32>
-    %2477 = stablehlo.broadcast_in_dim %2455, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2478 = stablehlo.broadcast_in_dim %2473, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x768xf32>
-    %2479 = stablehlo.subtract %2477, %2478 : tensor<1x27x768xf32>
-    %2480 = stablehlo.broadcast_in_dim %2479, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2481 = stablehlo.broadcast_in_dim %2476, dims = [0, 1, 2] : (tensor<1x27x1xf32>) -> tensor<1x27x768xf32>
-    %2482 = stablehlo.multiply %2480, %2481 : tensor<1x27x768xf32>
-    %2483 = stablehlo.convert %arg71 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2484 = stablehlo.broadcast_in_dim %2482, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2485 = stablehlo.broadcast_in_dim %2483, dims = [2] : (tensor<768xf32>) -> tensor<1x27x768xf32>
-    %2486 = stablehlo.multiply %2484, %2485 : tensor<1x27x768xf32>
-    %2487 = stablehlo.convert %arg72 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2488 = stablehlo.broadcast_in_dim %2486, dims = [0, 1, 2] : (tensor<1x27x768xf32>) -> tensor<1x27x768xf32>
-    %2489 = stablehlo.broadcast_in_dim %2487, dims = [2] : (tensor<768xf32>) -> tensor<1x27x768xf32>
-    %2490 = stablehlo.add %2488, %2489 : tensor<1x27x768xf32>
-    %2491 = stablehlo.convert %2490 : (tensor<1x27x768xf32>) -> tensor<1x27x768xbf16>
-    %2492 = stablehlo.reshape %2333 : (tensor<1x27x768xbf16>) -> tensor<27x768xbf16>
-    %2493 = stablehlo.convert %2492 : (tensor<27x768xbf16>) -> tensor<27x768xf32>
-    %2494 = stablehlo.dot_general %2493, %arg170, contracting_dims = [1] x [0] : (tensor<27x768xf32>, tensor<768x38xf32>) -> tensor<27x38xf32>
-    %2495 = stablehlo.broadcast_in_dim %2494, dims = [0, 1] : (tensor<27x38xf32>) -> tensor<27x38xf32>
-    %2496 = stablehlo.broadcast_in_dim %59, dims = [] : (tensor<f32>) -> tensor<27x38xf32>
-    %2497 = stablehlo.multiply %2495, %2496 : tensor<27x38xf32>
-    %2498 = stablehlo.broadcast_in_dim %2497, dims = [0, 1] : (tensor<27x38xf32>) -> tensor<27x38xf32>
-    %2499 = stablehlo.broadcast_in_dim %arg171, dims = [1] : (tensor<38xf32>) -> tensor<27x38xf32>
-    %2500 = stablehlo.add %2498, %2499 : tensor<27x38xf32>
-    %2501 = stablehlo.convert %2500 : (tensor<27x38xf32>) -> tensor<27x38xbf16>
-    %2502 = stablehlo.reshape %2501 : (tensor<27x38xbf16>) -> tensor<1x27x38xbf16>
-    %2503 = stablehlo.reshape %2412 : (tensor<1x27x768xbf16>) -> tensor<27x768xbf16>
-    %2504 = stablehlo.convert %2503 : (tensor<27x768xbf16>) -> tensor<27x768xf32>
-    %2505 = stablehlo.dot_general %2504, %arg172, contracting_dims = [1] x [0] : (tensor<27x768xf32>, tensor<768x50257xf32>) -> tensor<27x50257xf32>
-    %2506 = stablehlo.broadcast_in_dim %2505, dims = [0, 1] : (tensor<27x50257xf32>) -> tensor<27x50257xf32>
-    %2507 = stablehlo.broadcast_in_dim %59, dims = [] : (tensor<f32>) -> tensor<27x50257xf32>
-    %2508 = stablehlo.multiply %2506, %2507 : tensor<27x50257xf32>
-    %2509 = stablehlo.broadcast_in_dim %2508, dims = [0, 1] : (tensor<27x50257xf32>) -> tensor<27x50257xf32>
-    %2510 = stablehlo.broadcast_in_dim %arg173, dims = [1] : (tensor<50257xf32>) -> tensor<27x50257xf32>
-    %2511 = stablehlo.add %2509, %2510 : tensor<27x50257xf32>
-    %2512 = stablehlo.convert %2511 : (tensor<27x50257xf32>) -> tensor<27x50257xbf16>
-    %2513 = stablehlo.reshape %2512 : (tensor<27x50257xbf16>) -> tensor<1x27x50257xbf16>
-    %2514 = stablehlo.reshape %2491 : (tensor<1x27x768xbf16>) -> tensor<27x768xbf16>
-    %2515 = stablehlo.convert %2514 : (tensor<27x768xbf16>) -> tensor<27x768xf32>
-    %2516 = stablehlo.dot_general %2515, %arg174, contracting_dims = [1] x [0] : (tensor<27x768xf32>, tensor<768x30522xf32>) -> tensor<27x30522xf32>
-    %2517 = stablehlo.broadcast_in_dim %2516, dims = [0, 1] : (tensor<27x30522xf32>) -> tensor<27x30522xf32>
-    %2518 = stablehlo.broadcast_in_dim %59, dims = [] : (tensor<f32>) -> tensor<27x30522xf32>
-    %2519 = stablehlo.multiply %2517, %2518 : tensor<27x30522xf32>
-    %2520 = stablehlo.broadcast_in_dim %2519, dims = [0, 1] : (tensor<27x30522xf32>) -> tensor<27x30522xf32>
-    %2521 = stablehlo.broadcast_in_dim %arg175, dims = [1] : (tensor<30522xf32>) -> tensor<27x30522xf32>
-    %2522 = stablehlo.add %2520, %2521 : tensor<27x30522xf32>
-    %2523 = stablehlo.convert %2522 : (tensor<27x30522xf32>) -> tensor<27x30522xbf16>
-    %2524 = stablehlo.reshape %2523 : (tensor<27x30522xbf16>) -> tensor<1x27x30522xbf16>
-    return %2502, %2513, %2524 : tensor<1x27x38xbf16>, tensor<1x27x50257xbf16>, tensor<1x27x30522xbf16>
-  }
-}
diff --git a/mlir_tests/distilbert-base-uncased.mlir b/mlir_tests/distilbert-base-uncased.mlir
deleted file mode 100644
index 96989d98..00000000
--- a/mlir_tests/distilbert-base-uncased.mlir
+++ /dev/null
@@ -1,1380 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x16xi64>, %arg1: tensor<1x16xi64>, %arg2: tensor<30522x768xbf16>, %arg3: tensor<768xbf16>, %arg4: tensor<768xbf16>, %arg5: tensor<768xbf16>, %arg6: tensor<768xbf16>, %arg7: tensor<768xbf16>, %arg8: tensor<768xbf16>, %arg9: tensor<768xbf16>, %arg10: tensor<768xbf16>, %arg11: tensor<768xbf16>, %arg12: tensor<768xbf16>, %arg13: tensor<768xbf16>, %arg14: tensor<768xbf16>, %arg15: tensor<768xbf16>, %arg16: tensor<768xbf16>, %arg17: tensor<768xbf16>, %arg18: tensor<768xbf16>, %arg19: tensor<768xbf16>, %arg20: tensor<768xbf16>, %arg21: tensor<768xbf16>, %arg22: tensor<768xbf16>, %arg23: tensor<768xbf16>, %arg24: tensor<768xbf16>, %arg25: tensor<768xbf16>, %arg26: tensor<768xbf16>, %arg27: tensor<768xbf16>, %arg28: tensor<768xbf16>, %arg29: tensor<1x16x768xbf16>, %arg30: tensor<bf16>, %arg31: tensor<768x768xf32>, %arg32: tensor<768xf32>, %arg33: tensor<768x768xf32>, %arg34: tensor<768xf32>, %arg35: tensor<768x768xf32>, %arg36: tensor<768xf32>, %arg37: tensor<768x768xf32>, %arg38: tensor<768xf32>, %arg39: tensor<768x3072xf32>, %arg40: tensor<3072xf32>, %arg41: tensor<3072x768xf32>, %arg42: tensor<768xf32>, %arg43: tensor<768x768xf32>, %arg44: tensor<768xf32>, %arg45: tensor<768x768xf32>, %arg46: tensor<768xf32>, %arg47: tensor<768x768xf32>, %arg48: tensor<768xf32>, %arg49: tensor<768x768xf32>, %arg50: tensor<768xf32>, %arg51: tensor<768x3072xf32>, %arg52: tensor<3072xf32>, %arg53: tensor<3072x768xf32>, %arg54: tensor<768xf32>, %arg55: tensor<768x768xf32>, %arg56: tensor<768xf32>, %arg57: tensor<768x768xf32>, %arg58: tensor<768xf32>, %arg59: tensor<768x768xf32>, %arg60: tensor<768xf32>, %arg61: tensor<768x768xf32>, %arg62: tensor<768xf32>, %arg63: tensor<768x3072xf32>, %arg64: tensor<3072xf32>, %arg65: tensor<3072x768xf32>, %arg66: tensor<768xf32>, %arg67: tensor<768x768xf32>, %arg68: tensor<768xf32>, %arg69: tensor<768x768xf32>, %arg70: tensor<768xf32>, %arg71: tensor<768x768xf32>, %arg72: tensor<768xf32>, %arg73: tensor<768x768xf32>, %arg74: tensor<768xf32>, %arg75: tensor<768x3072xf32>, %arg76: tensor<3072xf32>, %arg77: tensor<3072x768xf32>, %arg78: tensor<768xf32>, %arg79: tensor<768x768xf32>, %arg80: tensor<768xf32>, %arg81: tensor<768x768xf32>, %arg82: tensor<768xf32>, %arg83: tensor<768x768xf32>, %arg84: tensor<768xf32>, %arg85: tensor<768x768xf32>, %arg86: tensor<768xf32>, %arg87: tensor<768x3072xf32>, %arg88: tensor<3072xf32>, %arg89: tensor<3072x768xf32>, %arg90: tensor<768xf32>, %arg91: tensor<768x768xf32>, %arg92: tensor<768xf32>, %arg93: tensor<768x768xf32>, %arg94: tensor<768xf32>, %arg95: tensor<768x768xf32>, %arg96: tensor<768xf32>, %arg97: tensor<768x768xf32>, %arg98: tensor<768xf32>, %arg99: tensor<768x3072xf32>, %arg100: tensor<3072xf32>, %arg101: tensor<3072x768xf32>, %arg102: tensor<768xf32>) -> tensor<1x16x768xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %c = stablehlo.constant dense<true> : tensor<i1>
-    %cst_2 = stablehlo.constant dense<1.000000e+00> : tensor<1x16x3072xbf16>
-    %cst_3 = stablehlo.constant dense<2.000000e+00> : tensor<1x16x3072xbf16>
-    %cst_4 = stablehlo.constant dense<5.000000e-01> : tensor<1x16x3072xbf16>
-    %cst_5 = stablehlo.constant dense<-4.000000e+00> : tensor<1x16x3072xf32>
-    %cst_6 = stablehlo.constant dense<4.000000e+00> : tensor<1x16x3072xf32>
-    %cst_7 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x16x3072xf32>
-    %cst_8 = stablehlo.constant dense<2.77068146E-8> : tensor<1x16x3072xf32>
-    %cst_9 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x16x3072xf32>
-    %cst_10 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x16x3072xf32>
-    %cst_11 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x16x3072xf32>
-    %cst_12 = stablehlo.constant dense<-2.954600e-03> : tensor<1x16x3072xf32>
-    %cst_13 = stablehlo.constant dense<-0.0160960332> : tensor<1x16x3072xf32>
-    %cst_14 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x16x3072xf32>
-    %cst_15 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x16x3072xf32>
-    %cst_16 = stablehlo.constant dense<-0.00168282702> : tensor<1x16x3072xf32>
-    %cst_17 = stablehlo.constant dense<-0.00737332925> : tensor<1x16x3072xf32>
-    %cst_18 = stablehlo.constant dense<-0.0142647391> : tensor<1x16x3072xf32>
-    %cst_19 = stablehlo.constant dense<-1.000000e+00> : tensor<1x16x3072xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x16x3072xf32>
-    %cst_21 = arith.constant dense<768> : tensor<1xi64>
-    %cst_22 = arith.constant dense<9.9999999999999998E-13> : tensor<1xf64>
-    %cst_23 = arith.constant dense<1.000000e+00> : tensor<1xf64>
-    %cst_24 = arith.constant dense<1> : tensor<1xi64>
-    %cst_25 = arith.constant dense<0.35355339059327379> : tensor<1xf64>
-    %cst_26 = arith.constant dense<0xFFF0000000000000> : tensor<1xf64>
-    %0 = "stablehlo.gather"(%arg2, %arg0) <{dimension_numbers = #stablehlo.gather<offset_dims = [2], collapsed_slice_dims = [0], start_index_map = [0], index_vector_dim = 2>, indices_are_sorted = false, slice_sizes = array<i64: 1, 768>}> : (tensor<30522x768xbf16>, tensor<1x16xi64>) -> tensor<1x16x768xbf16>
-    %1 = stablehlo.convert %0 : tensor<1x16x768xbf16>
-    %2 = stablehlo.add %1, %arg29 : tensor<1x16x768xbf16>
-    %3 = stablehlo.convert %2 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %4 = stablehlo.convert %3 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %5 = stablehlo.reduce(%4 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %6 = stablehlo.reshape %5 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %7 = stablehlo.convert %cst_21 : (tensor<1xi64>) -> tensor<1xf64>
-    %8 = stablehlo.reshape %7 : (tensor<1xf64>) -> tensor<f64>
-    %9 = stablehlo.broadcast_in_dim %6, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %10 = stablehlo.broadcast_in_dim %8, dims = [] : (tensor<f64>) -> tensor<1x16x1xf64>
-    %11 = stablehlo.divide %9, %10 : tensor<1x16x1xf64>
-    %12 = stablehlo.broadcast_in_dim %4, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %13 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %14 = stablehlo.subtract %12, %13 : tensor<1x16x768xf64>
-    %15 = stablehlo.multiply %14, %14 : tensor<1x16x768xf64>
-    %16 = stablehlo.reduce(%15 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %17 = stablehlo.reshape %16 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %18 = stablehlo.broadcast_in_dim %17, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %19 = stablehlo.divide %18, %10 : tensor<1x16x1xf64>
-    %20 = stablehlo.convert %19 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %21 = stablehlo.reduce(%3 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %22 = stablehlo.reshape %21 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %23 = stablehlo.convert %cst_21 : (tensor<1xi64>) -> tensor<1xf32>
-    %24 = stablehlo.reshape %23 : (tensor<1xf32>) -> tensor<f32>
-    %25 = stablehlo.broadcast_in_dim %22, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %26 = stablehlo.broadcast_in_dim %24, dims = [] : (tensor<f32>) -> tensor<1x16x1xf32>
-    %27 = stablehlo.divide %25, %26 : tensor<1x16x1xf32>
-    %28 = stablehlo.convert %cst_22 : (tensor<1xf64>) -> tensor<1xf32>
-    %29 = stablehlo.reshape %28 : (tensor<1xf32>) -> tensor<f32>
-    %30 = stablehlo.broadcast_in_dim %20, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %31 = stablehlo.broadcast_in_dim %29, dims = [] : (tensor<f32>) -> tensor<1x16x1xf32>
-    %32 = stablehlo.add %30, %31 : tensor<1x16x1xf32>
-    %33 = stablehlo.rsqrt %32 : tensor<1x16x1xf32>
-    %34 = stablehlo.broadcast_in_dim %3, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %35 = stablehlo.broadcast_in_dim %27, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %36 = stablehlo.subtract %34, %35 : tensor<1x16x768xf32>
-    %37 = stablehlo.broadcast_in_dim %36, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %38 = stablehlo.broadcast_in_dim %33, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %39 = stablehlo.multiply %37, %38 : tensor<1x16x768xf32>
-    %40 = stablehlo.convert %arg3 : (tensor<768xbf16>) -> tensor<768xf32>
-    %41 = stablehlo.broadcast_in_dim %39, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %42 = stablehlo.broadcast_in_dim %40, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %43 = stablehlo.multiply %41, %42 : tensor<1x16x768xf32>
-    %44 = stablehlo.convert %arg4 : (tensor<768xbf16>) -> tensor<768xf32>
-    %45 = stablehlo.broadcast_in_dim %43, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %47 = stablehlo.add %45, %46 : tensor<1x16x768xf32>
-    %48 = stablehlo.convert %47 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %49 = stablehlo.reshape %arg1 : (tensor<1x16xi64>) -> tensor<1x1x16xi64>
-    %50 = stablehlo.reshape %49 : (tensor<1x1x16xi64>) -> tensor<1x1x1x16xi64>
-    %51 = stablehlo.broadcast_in_dim %50, dims = [0, 1, 2, 3] : (tensor<1x1x1x16xi64>) -> tensor<1x1x16x16xi64>
-    %52 = stablehlo.convert %51 : (tensor<1x1x16x16xi64>) -> tensor<1x1x16x16xbf16>
-    %53 = stablehlo.convert %cst_23 : (tensor<1xf64>) -> tensor<1xbf16>
-    %54 = stablehlo.reshape %53 : (tensor<1xbf16>) -> tensor<bf16>
-    %55 = stablehlo.broadcast_in_dim %54, dims = [] : (tensor<bf16>) -> tensor<1x1x16x16xbf16>
-    %56 = stablehlo.broadcast_in_dim %52, dims = [0, 1, 2, 3] : (tensor<1x1x16x16xbf16>) -> tensor<1x1x16x16xbf16>
-    %57 = stablehlo.subtract %55, %56 : tensor<1x1x16x16xbf16>
-    %58 = stablehlo.convert %57 : (tensor<1x1x16x16xbf16>) -> tensor<1x1x16x16xi1>
-    %59 = stablehlo.broadcast_in_dim %58, dims = [0, 1, 2, 3] : (tensor<1x1x16x16xi1>) -> tensor<1x1x16x16xi1>
-    %60 = stablehlo.broadcast_in_dim %arg30, dims = [] : (tensor<bf16>) -> tensor<1x1x16x16xbf16>
-    %61 = stablehlo.broadcast_in_dim %57, dims = [0, 1, 2, 3] : (tensor<1x1x16x16xbf16>) -> tensor<1x1x16x16xbf16>
-    %62 = stablehlo.select %59, %60, %61 : tensor<1x1x16x16xi1>, tensor<1x1x16x16xbf16>
-    %63 = stablehlo.reshape %48 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %64 = stablehlo.convert %63 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %65 = stablehlo.dot_general %64, %arg31, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %66 = stablehlo.convert %cst_24 : (tensor<1xi64>) -> tensor<1xf32>
-    %67 = stablehlo.reshape %66 : (tensor<1xf32>) -> tensor<f32>
-    %68 = stablehlo.broadcast_in_dim %65, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %69 = stablehlo.broadcast_in_dim %67, dims = [] : (tensor<f32>) -> tensor<16x768xf32>
-    %70 = stablehlo.multiply %68, %69 : tensor<16x768xf32>
-    %71 = stablehlo.broadcast_in_dim %70, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %72 = stablehlo.broadcast_in_dim %arg32, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %73 = stablehlo.add %71, %72 : tensor<16x768xf32>
-    %74 = stablehlo.convert %73 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %75 = stablehlo.reshape %74 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %76 = stablehlo.reshape %75 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %77 = stablehlo.transpose %76, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %78 = stablehlo.dot_general %64, %arg33, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %79 = stablehlo.broadcast_in_dim %78, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %80 = stablehlo.multiply %79, %69 : tensor<16x768xf32>
-    %81 = stablehlo.broadcast_in_dim %80, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %82 = stablehlo.broadcast_in_dim %arg34, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %83 = stablehlo.add %81, %82 : tensor<16x768xf32>
-    %84 = stablehlo.convert %83 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %85 = stablehlo.reshape %84 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %86 = stablehlo.reshape %85 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %87 = stablehlo.transpose %86, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %88 = stablehlo.dot_general %64, %arg35, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %89 = stablehlo.broadcast_in_dim %88, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %90 = stablehlo.multiply %89, %69 : tensor<16x768xf32>
-    %91 = stablehlo.broadcast_in_dim %90, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %92 = stablehlo.broadcast_in_dim %arg36, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %93 = stablehlo.add %91, %92 : tensor<16x768xf32>
-    %94 = stablehlo.convert %93 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %95 = stablehlo.reshape %94 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %96 = stablehlo.reshape %95 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %97 = stablehlo.transpose %96, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %98 = stablehlo.convert %77 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %99 = stablehlo.convert %87 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %100 = stablehlo.convert %97 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %101 = stablehlo.convert %cst_25 : (tensor<1xf64>) -> tensor<1xf32>
-    %102 = stablehlo.reshape %101 : (tensor<1xf32>) -> tensor<f32>
-    %103 = stablehlo.broadcast_in_dim %98, dims = [0, 1, 2, 3] : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %104 = stablehlo.broadcast_in_dim %102, dims = [] : (tensor<f32>) -> tensor<1x12x16x64xf32>
-    %105 = stablehlo.multiply %103, %104 : tensor<1x12x16x64xf32>
-    %106 = stablehlo.transpose %99, dims = [0, 1, 3, 2] : (tensor<1x12x16x64xf32>) -> tensor<1x12x64x16xf32>
-    %107 = stablehlo.broadcast_in_dim %106, dims = [0, 1, 2, 3] : (tensor<1x12x64x16xf32>) -> tensor<1x12x64x16xf32>
-    %108 = stablehlo.broadcast_in_dim %102, dims = [] : (tensor<f32>) -> tensor<1x12x64x16xf32>
-    %109 = stablehlo.multiply %107, %108 : tensor<1x12x64x16xf32>
-    %110 = stablehlo.reshape %105 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %111 = stablehlo.reshape %109 : (tensor<1x12x64x16xf32>) -> tensor<12x64x16xf32>
-    %112 = stablehlo.broadcast_in_dim %111, dims = [0, 1, 2] : (tensor<12x64x16xf32>) -> tensor<12x64x16xf32>
-    %113 = stablehlo.dot_general %110, %112, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x64xf32>, tensor<12x64x16xf32>) -> tensor<12x16x16xf32>
-    %114 = stablehlo.reshape %113 : (tensor<12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %115 = stablehlo.convert %62 : (tensor<1x1x16x16xbf16>) -> tensor<1x1x16x16xf32>
-    %116 = stablehlo.broadcast_in_dim %114, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %117 = stablehlo.broadcast_in_dim %115, dims = [0, 1, 2, 3] : (tensor<1x1x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %118 = stablehlo.add %116, %117 : tensor<1x12x16x16xf32>
-    %119 = stablehlo.reduce(%118 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %120 = stablehlo.reshape %119 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %121 = stablehlo.broadcast_in_dim %118, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %122 = stablehlo.broadcast_in_dim %120, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %123 = stablehlo.subtract %121, %122 : tensor<1x12x16x16xf32>
-    %124 = stablehlo.exponential %123 : tensor<1x12x16x16xf32>
-    %125 = stablehlo.reduce(%124 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %126 = stablehlo.reshape %125 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %127 = stablehlo.broadcast_in_dim %124, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %128 = stablehlo.broadcast_in_dim %126, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %129 = stablehlo.divide %127, %128 : tensor<1x12x16x16xf32>
-    %130 = stablehlo.convert %cst_26 : tensor<1xf64>
-    %131 = stablehlo.reshape %130 : (tensor<1xf64>) -> tensor<f64>
-    %132 = stablehlo.convert %131 : (tensor<f64>) -> tensor<f32>
-    %133 = stablehlo.broadcast_in_dim %132, dims = [] : (tensor<f32>) -> tensor<1x12x16x16xf32>
-    %134 = stablehlo.compare  EQ, %121, %133,  FLOAT : (tensor<1x12x16x16xf32>, tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xi1>
-    %135 = stablehlo.reduce(%134 init: %c) applies stablehlo.and across dimensions = [3] : (tensor<1x12x16x16xi1>, tensor<i1>) -> tensor<1x12x16xi1>
-    %136 = stablehlo.reshape %135 : (tensor<1x12x16xi1>) -> tensor<1x12x16x1xi1>
-    %137 = stablehlo.convert %cst : (tensor<f64>) -> tensor<f32>
-    %138 = stablehlo.broadcast_in_dim %136, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xi1>) -> tensor<1x12x16x16xi1>
-    %139 = stablehlo.broadcast_in_dim %137, dims = [] : (tensor<f32>) -> tensor<1x12x16x16xf32>
-    %140 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %141 = stablehlo.select %138, %139, %140 : tensor<1x12x16x16xi1>, tensor<1x12x16x16xf32>
-    %142 = stablehlo.reshape %141 : (tensor<1x12x16x16xf32>) -> tensor<12x16x16xf32>
-    %143 = stablehlo.reshape %100 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %144 = stablehlo.broadcast_in_dim %143, dims = [0, 1, 2] : (tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %145 = stablehlo.dot_general %142, %144, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x16xf32>, tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %146 = stablehlo.reshape %145 : (tensor<12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %147 = stablehlo.convert %146 : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xbf16>
-    %148 = stablehlo.transpose %147, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %149 = stablehlo.transpose %148, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %150 = stablehlo.transpose %149, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %151 = stablehlo.reshape %150 : (tensor<1x16x12x64xbf16>) -> tensor<1x16x768xbf16>
-    %152 = stablehlo.reshape %151 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %153 = stablehlo.convert %152 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %154 = stablehlo.dot_general %153, %arg37, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %155 = stablehlo.broadcast_in_dim %154, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %156 = stablehlo.multiply %155, %69 : tensor<16x768xf32>
-    %157 = stablehlo.broadcast_in_dim %156, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %158 = stablehlo.broadcast_in_dim %arg38, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %159 = stablehlo.add %157, %158 : tensor<16x768xf32>
-    %160 = stablehlo.convert %159 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %161 = stablehlo.reshape %160 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %162 = stablehlo.add %161, %48 : tensor<1x16x768xbf16>
-    %163 = stablehlo.convert %162 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %164 = stablehlo.convert %163 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %165 = stablehlo.reduce(%164 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %166 = stablehlo.reshape %165 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %167 = stablehlo.broadcast_in_dim %166, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %168 = stablehlo.divide %167, %10 : tensor<1x16x1xf64>
-    %169 = stablehlo.broadcast_in_dim %164, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %170 = stablehlo.broadcast_in_dim %168, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %171 = stablehlo.subtract %169, %170 : tensor<1x16x768xf64>
-    %172 = stablehlo.multiply %171, %171 : tensor<1x16x768xf64>
-    %173 = stablehlo.reduce(%172 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %174 = stablehlo.reshape %173 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %175 = stablehlo.broadcast_in_dim %174, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %176 = stablehlo.divide %175, %10 : tensor<1x16x1xf64>
-    %177 = stablehlo.convert %176 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %178 = stablehlo.reduce(%163 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %179 = stablehlo.reshape %178 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %180 = stablehlo.broadcast_in_dim %179, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %181 = stablehlo.divide %180, %26 : tensor<1x16x1xf32>
-    %182 = stablehlo.broadcast_in_dim %177, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %183 = stablehlo.add %182, %31 : tensor<1x16x1xf32>
-    %184 = stablehlo.rsqrt %183 : tensor<1x16x1xf32>
-    %185 = stablehlo.broadcast_in_dim %163, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %186 = stablehlo.broadcast_in_dim %181, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %187 = stablehlo.subtract %185, %186 : tensor<1x16x768xf32>
-    %188 = stablehlo.broadcast_in_dim %187, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %189 = stablehlo.broadcast_in_dim %184, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %190 = stablehlo.multiply %188, %189 : tensor<1x16x768xf32>
-    %191 = stablehlo.convert %arg5 : (tensor<768xbf16>) -> tensor<768xf32>
-    %192 = stablehlo.broadcast_in_dim %190, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %193 = stablehlo.broadcast_in_dim %191, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %194 = stablehlo.multiply %192, %193 : tensor<1x16x768xf32>
-    %195 = stablehlo.convert %arg6 : (tensor<768xbf16>) -> tensor<768xf32>
-    %196 = stablehlo.broadcast_in_dim %194, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %197 = stablehlo.broadcast_in_dim %195, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %198 = stablehlo.add %196, %197 : tensor<1x16x768xf32>
-    %199 = stablehlo.convert %198 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %200 = stablehlo.reshape %199 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %201 = stablehlo.convert %200 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %202 = stablehlo.dot_general %201, %arg39, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x3072xf32>) -> tensor<16x3072xf32>
-    %203 = stablehlo.broadcast_in_dim %202, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %204 = stablehlo.broadcast_in_dim %67, dims = [] : (tensor<f32>) -> tensor<16x3072xf32>
-    %205 = stablehlo.multiply %203, %204 : tensor<16x3072xf32>
-    %206 = stablehlo.broadcast_in_dim %205, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %207 = stablehlo.broadcast_in_dim %arg40, dims = [1] : (tensor<3072xf32>) -> tensor<16x3072xf32>
-    %208 = stablehlo.add %206, %207 : tensor<16x3072xf32>
-    %209 = stablehlo.convert %208 : (tensor<16x3072xf32>) -> tensor<16x3072xbf16>
-    %210 = stablehlo.reshape %209 : (tensor<16x3072xbf16>) -> tensor<1x16x3072xbf16>
-    %211 = stablehlo.multiply %210, %cst_4 : tensor<1x16x3072xbf16>
-    %212 = stablehlo.rsqrt %cst_3 : tensor<1x16x3072xbf16>
-    %213 = stablehlo.multiply %210, %212 : tensor<1x16x3072xbf16>
-    %214 = stablehlo.convert %213 : (tensor<1x16x3072xbf16>) -> tensor<1x16x3072xf32>
-    %215 = stablehlo.clamp %cst_5, %214, %cst_6 : tensor<1x16x3072xf32>
-    %216 = stablehlo.multiply %215, %215 : tensor<1x16x3072xf32>
-    %217 = stablehlo.multiply %cst_7, %216 : tensor<1x16x3072xf32>
-    %218 = stablehlo.add %217, %cst_8 : tensor<1x16x3072xf32>
-    %219 = stablehlo.multiply %218, %216 : tensor<1x16x3072xf32>
-    %220 = stablehlo.add %219, %cst_9 : tensor<1x16x3072xf32>
-    %221 = stablehlo.multiply %220, %216 : tensor<1x16x3072xf32>
-    %222 = stablehlo.add %221, %cst_10 : tensor<1x16x3072xf32>
-    %223 = stablehlo.multiply %222, %216 : tensor<1x16x3072xf32>
-    %224 = stablehlo.add %223, %cst_11 : tensor<1x16x3072xf32>
-    %225 = stablehlo.multiply %224, %216 : tensor<1x16x3072xf32>
-    %226 = stablehlo.add %225, %cst_12 : tensor<1x16x3072xf32>
-    %227 = stablehlo.multiply %226, %216 : tensor<1x16x3072xf32>
-    %228 = stablehlo.add %227, %cst_13 : tensor<1x16x3072xf32>
-    %229 = stablehlo.multiply %cst_14, %216 : tensor<1x16x3072xf32>
-    %230 = stablehlo.add %229, %cst_15 : tensor<1x16x3072xf32>
-    %231 = stablehlo.multiply %230, %216 : tensor<1x16x3072xf32>
-    %232 = stablehlo.add %231, %cst_16 : tensor<1x16x3072xf32>
-    %233 = stablehlo.multiply %232, %216 : tensor<1x16x3072xf32>
-    %234 = stablehlo.add %233, %cst_17 : tensor<1x16x3072xf32>
-    %235 = stablehlo.multiply %234, %216 : tensor<1x16x3072xf32>
-    %236 = stablehlo.add %235, %cst_18 : tensor<1x16x3072xf32>
-    %237 = stablehlo.multiply %215, %228 : tensor<1x16x3072xf32>
-    %238 = stablehlo.divide %237, %236 : tensor<1x16x3072xf32>
-    %239 = stablehlo.clamp %cst_19, %238, %cst_20 : tensor<1x16x3072xf32>
-    %240 = stablehlo.convert %239 : (tensor<1x16x3072xf32>) -> tensor<1x16x3072xbf16>
-    %241 = stablehlo.add %240, %cst_2 : tensor<1x16x3072xbf16>
-    %242 = stablehlo.multiply %241, %211 : tensor<1x16x3072xbf16>
-    %243 = stablehlo.reshape %242 : (tensor<1x16x3072xbf16>) -> tensor<16x3072xbf16>
-    %244 = stablehlo.convert %243 : (tensor<16x3072xbf16>) -> tensor<16x3072xf32>
-    %245 = stablehlo.dot_general %244, %arg41, contracting_dims = [1] x [0] : (tensor<16x3072xf32>, tensor<3072x768xf32>) -> tensor<16x768xf32>
-    %246 = stablehlo.broadcast_in_dim %245, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %247 = stablehlo.multiply %246, %69 : tensor<16x768xf32>
-    %248 = stablehlo.broadcast_in_dim %247, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %249 = stablehlo.broadcast_in_dim %arg42, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %250 = stablehlo.add %248, %249 : tensor<16x768xf32>
-    %251 = stablehlo.convert %250 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %252 = stablehlo.reshape %251 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %253 = stablehlo.add %252, %199 : tensor<1x16x768xbf16>
-    %254 = stablehlo.convert %253 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %255 = stablehlo.convert %254 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %256 = stablehlo.reduce(%255 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %257 = stablehlo.reshape %256 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %258 = stablehlo.broadcast_in_dim %257, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %259 = stablehlo.divide %258, %10 : tensor<1x16x1xf64>
-    %260 = stablehlo.broadcast_in_dim %255, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %261 = stablehlo.broadcast_in_dim %259, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %262 = stablehlo.subtract %260, %261 : tensor<1x16x768xf64>
-    %263 = stablehlo.multiply %262, %262 : tensor<1x16x768xf64>
-    %264 = stablehlo.reduce(%263 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %265 = stablehlo.reshape %264 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %266 = stablehlo.broadcast_in_dim %265, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %267 = stablehlo.divide %266, %10 : tensor<1x16x1xf64>
-    %268 = stablehlo.convert %267 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %269 = stablehlo.reduce(%254 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %270 = stablehlo.reshape %269 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %271 = stablehlo.broadcast_in_dim %270, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %272 = stablehlo.divide %271, %26 : tensor<1x16x1xf32>
-    %273 = stablehlo.broadcast_in_dim %268, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %274 = stablehlo.add %273, %31 : tensor<1x16x1xf32>
-    %275 = stablehlo.rsqrt %274 : tensor<1x16x1xf32>
-    %276 = stablehlo.broadcast_in_dim %254, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %277 = stablehlo.broadcast_in_dim %272, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %278 = stablehlo.subtract %276, %277 : tensor<1x16x768xf32>
-    %279 = stablehlo.broadcast_in_dim %278, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %280 = stablehlo.broadcast_in_dim %275, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %281 = stablehlo.multiply %279, %280 : tensor<1x16x768xf32>
-    %282 = stablehlo.convert %arg7 : (tensor<768xbf16>) -> tensor<768xf32>
-    %283 = stablehlo.broadcast_in_dim %281, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %284 = stablehlo.broadcast_in_dim %282, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %285 = stablehlo.multiply %283, %284 : tensor<1x16x768xf32>
-    %286 = stablehlo.convert %arg8 : (tensor<768xbf16>) -> tensor<768xf32>
-    %287 = stablehlo.broadcast_in_dim %285, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %288 = stablehlo.broadcast_in_dim %286, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %289 = stablehlo.add %287, %288 : tensor<1x16x768xf32>
-    %290 = stablehlo.convert %289 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %291 = stablehlo.reshape %290 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %292 = stablehlo.convert %291 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %293 = stablehlo.dot_general %292, %arg43, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %294 = stablehlo.broadcast_in_dim %293, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %295 = stablehlo.multiply %294, %69 : tensor<16x768xf32>
-    %296 = stablehlo.broadcast_in_dim %295, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %297 = stablehlo.broadcast_in_dim %arg44, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %298 = stablehlo.add %296, %297 : tensor<16x768xf32>
-    %299 = stablehlo.convert %298 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %300 = stablehlo.reshape %299 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %301 = stablehlo.reshape %300 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %302 = stablehlo.transpose %301, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %303 = stablehlo.dot_general %292, %arg45, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %304 = stablehlo.broadcast_in_dim %303, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %305 = stablehlo.multiply %304, %69 : tensor<16x768xf32>
-    %306 = stablehlo.broadcast_in_dim %305, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %307 = stablehlo.broadcast_in_dim %arg46, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %308 = stablehlo.add %306, %307 : tensor<16x768xf32>
-    %309 = stablehlo.convert %308 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %310 = stablehlo.reshape %309 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %311 = stablehlo.reshape %310 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %312 = stablehlo.transpose %311, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %313 = stablehlo.dot_general %292, %arg47, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %314 = stablehlo.broadcast_in_dim %313, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %315 = stablehlo.multiply %314, %69 : tensor<16x768xf32>
-    %316 = stablehlo.broadcast_in_dim %315, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %317 = stablehlo.broadcast_in_dim %arg48, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %318 = stablehlo.add %316, %317 : tensor<16x768xf32>
-    %319 = stablehlo.convert %318 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %320 = stablehlo.reshape %319 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %321 = stablehlo.reshape %320 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %322 = stablehlo.transpose %321, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %323 = stablehlo.convert %302 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %324 = stablehlo.convert %312 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %325 = stablehlo.convert %322 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %326 = stablehlo.broadcast_in_dim %323, dims = [0, 1, 2, 3] : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %327 = stablehlo.multiply %326, %104 : tensor<1x12x16x64xf32>
-    %328 = stablehlo.transpose %324, dims = [0, 1, 3, 2] : (tensor<1x12x16x64xf32>) -> tensor<1x12x64x16xf32>
-    %329 = stablehlo.broadcast_in_dim %328, dims = [0, 1, 2, 3] : (tensor<1x12x64x16xf32>) -> tensor<1x12x64x16xf32>
-    %330 = stablehlo.multiply %329, %108 : tensor<1x12x64x16xf32>
-    %331 = stablehlo.reshape %327 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %332 = stablehlo.reshape %330 : (tensor<1x12x64x16xf32>) -> tensor<12x64x16xf32>
-    %333 = stablehlo.broadcast_in_dim %332, dims = [0, 1, 2] : (tensor<12x64x16xf32>) -> tensor<12x64x16xf32>
-    %334 = stablehlo.dot_general %331, %333, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x64xf32>, tensor<12x64x16xf32>) -> tensor<12x16x16xf32>
-    %335 = stablehlo.reshape %334 : (tensor<12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %336 = stablehlo.broadcast_in_dim %335, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %337 = stablehlo.add %336, %117 : tensor<1x12x16x16xf32>
-    %338 = stablehlo.reduce(%337 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %339 = stablehlo.reshape %338 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %340 = stablehlo.broadcast_in_dim %337, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %341 = stablehlo.broadcast_in_dim %339, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %342 = stablehlo.subtract %340, %341 : tensor<1x12x16x16xf32>
-    %343 = stablehlo.exponential %342 : tensor<1x12x16x16xf32>
-    %344 = stablehlo.reduce(%343 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %345 = stablehlo.reshape %344 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %346 = stablehlo.broadcast_in_dim %343, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %347 = stablehlo.broadcast_in_dim %345, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %348 = stablehlo.divide %346, %347 : tensor<1x12x16x16xf32>
-    %349 = stablehlo.compare  EQ, %340, %133,  FLOAT : (tensor<1x12x16x16xf32>, tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xi1>
-    %350 = stablehlo.reduce(%349 init: %c) applies stablehlo.and across dimensions = [3] : (tensor<1x12x16x16xi1>, tensor<i1>) -> tensor<1x12x16xi1>
-    %351 = stablehlo.reshape %350 : (tensor<1x12x16xi1>) -> tensor<1x12x16x1xi1>
-    %352 = stablehlo.broadcast_in_dim %351, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xi1>) -> tensor<1x12x16x16xi1>
-    %353 = stablehlo.broadcast_in_dim %348, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %354 = stablehlo.select %352, %139, %353 : tensor<1x12x16x16xi1>, tensor<1x12x16x16xf32>
-    %355 = stablehlo.reshape %354 : (tensor<1x12x16x16xf32>) -> tensor<12x16x16xf32>
-    %356 = stablehlo.reshape %325 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %357 = stablehlo.broadcast_in_dim %356, dims = [0, 1, 2] : (tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %358 = stablehlo.dot_general %355, %357, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x16xf32>, tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %359 = stablehlo.reshape %358 : (tensor<12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %360 = stablehlo.convert %359 : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xbf16>
-    %361 = stablehlo.transpose %360, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %362 = stablehlo.transpose %361, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %363 = stablehlo.transpose %362, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %364 = stablehlo.reshape %363 : (tensor<1x16x12x64xbf16>) -> tensor<1x16x768xbf16>
-    %365 = stablehlo.reshape %364 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %366 = stablehlo.convert %365 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %367 = stablehlo.dot_general %366, %arg49, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %368 = stablehlo.broadcast_in_dim %367, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %369 = stablehlo.multiply %368, %69 : tensor<16x768xf32>
-    %370 = stablehlo.broadcast_in_dim %369, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %371 = stablehlo.broadcast_in_dim %arg50, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %372 = stablehlo.add %370, %371 : tensor<16x768xf32>
-    %373 = stablehlo.convert %372 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %374 = stablehlo.reshape %373 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %375 = stablehlo.add %374, %290 : tensor<1x16x768xbf16>
-    %376 = stablehlo.convert %375 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %377 = stablehlo.convert %376 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %378 = stablehlo.reduce(%377 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %379 = stablehlo.reshape %378 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %380 = stablehlo.broadcast_in_dim %379, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %381 = stablehlo.divide %380, %10 : tensor<1x16x1xf64>
-    %382 = stablehlo.broadcast_in_dim %377, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %383 = stablehlo.broadcast_in_dim %381, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %384 = stablehlo.subtract %382, %383 : tensor<1x16x768xf64>
-    %385 = stablehlo.multiply %384, %384 : tensor<1x16x768xf64>
-    %386 = stablehlo.reduce(%385 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %387 = stablehlo.reshape %386 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %388 = stablehlo.broadcast_in_dim %387, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %389 = stablehlo.divide %388, %10 : tensor<1x16x1xf64>
-    %390 = stablehlo.convert %389 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %391 = stablehlo.reduce(%376 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %392 = stablehlo.reshape %391 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %393 = stablehlo.broadcast_in_dim %392, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %394 = stablehlo.divide %393, %26 : tensor<1x16x1xf32>
-    %395 = stablehlo.broadcast_in_dim %390, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %396 = stablehlo.add %395, %31 : tensor<1x16x1xf32>
-    %397 = stablehlo.rsqrt %396 : tensor<1x16x1xf32>
-    %398 = stablehlo.broadcast_in_dim %376, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %399 = stablehlo.broadcast_in_dim %394, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %400 = stablehlo.subtract %398, %399 : tensor<1x16x768xf32>
-    %401 = stablehlo.broadcast_in_dim %400, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %402 = stablehlo.broadcast_in_dim %397, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %403 = stablehlo.multiply %401, %402 : tensor<1x16x768xf32>
-    %404 = stablehlo.convert %arg9 : (tensor<768xbf16>) -> tensor<768xf32>
-    %405 = stablehlo.broadcast_in_dim %403, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %406 = stablehlo.broadcast_in_dim %404, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %407 = stablehlo.multiply %405, %406 : tensor<1x16x768xf32>
-    %408 = stablehlo.convert %arg10 : (tensor<768xbf16>) -> tensor<768xf32>
-    %409 = stablehlo.broadcast_in_dim %407, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %410 = stablehlo.broadcast_in_dim %408, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %411 = stablehlo.add %409, %410 : tensor<1x16x768xf32>
-    %412 = stablehlo.convert %411 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %413 = stablehlo.reshape %412 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %414 = stablehlo.convert %413 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %415 = stablehlo.dot_general %414, %arg51, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x3072xf32>) -> tensor<16x3072xf32>
-    %416 = stablehlo.broadcast_in_dim %415, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %417 = stablehlo.multiply %416, %204 : tensor<16x3072xf32>
-    %418 = stablehlo.broadcast_in_dim %417, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %419 = stablehlo.broadcast_in_dim %arg52, dims = [1] : (tensor<3072xf32>) -> tensor<16x3072xf32>
-    %420 = stablehlo.add %418, %419 : tensor<16x3072xf32>
-    %421 = stablehlo.convert %420 : (tensor<16x3072xf32>) -> tensor<16x3072xbf16>
-    %422 = stablehlo.reshape %421 : (tensor<16x3072xbf16>) -> tensor<1x16x3072xbf16>
-    %423 = stablehlo.multiply %422, %cst_4 : tensor<1x16x3072xbf16>
-    %424 = stablehlo.multiply %422, %212 : tensor<1x16x3072xbf16>
-    %425 = stablehlo.convert %424 : (tensor<1x16x3072xbf16>) -> tensor<1x16x3072xf32>
-    %426 = stablehlo.clamp %cst_5, %425, %cst_6 : tensor<1x16x3072xf32>
-    %427 = stablehlo.multiply %426, %426 : tensor<1x16x3072xf32>
-    %428 = stablehlo.multiply %cst_7, %427 : tensor<1x16x3072xf32>
-    %429 = stablehlo.add %428, %cst_8 : tensor<1x16x3072xf32>
-    %430 = stablehlo.multiply %429, %427 : tensor<1x16x3072xf32>
-    %431 = stablehlo.add %430, %cst_9 : tensor<1x16x3072xf32>
-    %432 = stablehlo.multiply %431, %427 : tensor<1x16x3072xf32>
-    %433 = stablehlo.add %432, %cst_10 : tensor<1x16x3072xf32>
-    %434 = stablehlo.multiply %433, %427 : tensor<1x16x3072xf32>
-    %435 = stablehlo.add %434, %cst_11 : tensor<1x16x3072xf32>
-    %436 = stablehlo.multiply %435, %427 : tensor<1x16x3072xf32>
-    %437 = stablehlo.add %436, %cst_12 : tensor<1x16x3072xf32>
-    %438 = stablehlo.multiply %437, %427 : tensor<1x16x3072xf32>
-    %439 = stablehlo.add %438, %cst_13 : tensor<1x16x3072xf32>
-    %440 = stablehlo.multiply %cst_14, %427 : tensor<1x16x3072xf32>
-    %441 = stablehlo.add %440, %cst_15 : tensor<1x16x3072xf32>
-    %442 = stablehlo.multiply %441, %427 : tensor<1x16x3072xf32>
-    %443 = stablehlo.add %442, %cst_16 : tensor<1x16x3072xf32>
-    %444 = stablehlo.multiply %443, %427 : tensor<1x16x3072xf32>
-    %445 = stablehlo.add %444, %cst_17 : tensor<1x16x3072xf32>
-    %446 = stablehlo.multiply %445, %427 : tensor<1x16x3072xf32>
-    %447 = stablehlo.add %446, %cst_18 : tensor<1x16x3072xf32>
-    %448 = stablehlo.multiply %426, %439 : tensor<1x16x3072xf32>
-    %449 = stablehlo.divide %448, %447 : tensor<1x16x3072xf32>
-    %450 = stablehlo.clamp %cst_19, %449, %cst_20 : tensor<1x16x3072xf32>
-    %451 = stablehlo.convert %450 : (tensor<1x16x3072xf32>) -> tensor<1x16x3072xbf16>
-    %452 = stablehlo.add %451, %cst_2 : tensor<1x16x3072xbf16>
-    %453 = stablehlo.multiply %452, %423 : tensor<1x16x3072xbf16>
-    %454 = stablehlo.reshape %453 : (tensor<1x16x3072xbf16>) -> tensor<16x3072xbf16>
-    %455 = stablehlo.convert %454 : (tensor<16x3072xbf16>) -> tensor<16x3072xf32>
-    %456 = stablehlo.dot_general %455, %arg53, contracting_dims = [1] x [0] : (tensor<16x3072xf32>, tensor<3072x768xf32>) -> tensor<16x768xf32>
-    %457 = stablehlo.broadcast_in_dim %456, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %458 = stablehlo.multiply %457, %69 : tensor<16x768xf32>
-    %459 = stablehlo.broadcast_in_dim %458, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %460 = stablehlo.broadcast_in_dim %arg54, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %461 = stablehlo.add %459, %460 : tensor<16x768xf32>
-    %462 = stablehlo.convert %461 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %463 = stablehlo.reshape %462 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %464 = stablehlo.add %463, %412 : tensor<1x16x768xbf16>
-    %465 = stablehlo.convert %464 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %466 = stablehlo.convert %465 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %467 = stablehlo.reduce(%466 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %468 = stablehlo.reshape %467 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %469 = stablehlo.broadcast_in_dim %468, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %470 = stablehlo.divide %469, %10 : tensor<1x16x1xf64>
-    %471 = stablehlo.broadcast_in_dim %466, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %472 = stablehlo.broadcast_in_dim %470, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %473 = stablehlo.subtract %471, %472 : tensor<1x16x768xf64>
-    %474 = stablehlo.multiply %473, %473 : tensor<1x16x768xf64>
-    %475 = stablehlo.reduce(%474 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %476 = stablehlo.reshape %475 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %477 = stablehlo.broadcast_in_dim %476, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %478 = stablehlo.divide %477, %10 : tensor<1x16x1xf64>
-    %479 = stablehlo.convert %478 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %480 = stablehlo.reduce(%465 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %481 = stablehlo.reshape %480 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %482 = stablehlo.broadcast_in_dim %481, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %483 = stablehlo.divide %482, %26 : tensor<1x16x1xf32>
-    %484 = stablehlo.broadcast_in_dim %479, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %485 = stablehlo.add %484, %31 : tensor<1x16x1xf32>
-    %486 = stablehlo.rsqrt %485 : tensor<1x16x1xf32>
-    %487 = stablehlo.broadcast_in_dim %465, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %488 = stablehlo.broadcast_in_dim %483, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %489 = stablehlo.subtract %487, %488 : tensor<1x16x768xf32>
-    %490 = stablehlo.broadcast_in_dim %489, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %491 = stablehlo.broadcast_in_dim %486, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %492 = stablehlo.multiply %490, %491 : tensor<1x16x768xf32>
-    %493 = stablehlo.convert %arg11 : (tensor<768xbf16>) -> tensor<768xf32>
-    %494 = stablehlo.broadcast_in_dim %492, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %495 = stablehlo.broadcast_in_dim %493, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %496 = stablehlo.multiply %494, %495 : tensor<1x16x768xf32>
-    %497 = stablehlo.convert %arg12 : (tensor<768xbf16>) -> tensor<768xf32>
-    %498 = stablehlo.broadcast_in_dim %496, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %499 = stablehlo.broadcast_in_dim %497, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %500 = stablehlo.add %498, %499 : tensor<1x16x768xf32>
-    %501 = stablehlo.convert %500 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %502 = stablehlo.reshape %501 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %503 = stablehlo.convert %502 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %504 = stablehlo.dot_general %503, %arg55, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %505 = stablehlo.broadcast_in_dim %504, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %506 = stablehlo.multiply %505, %69 : tensor<16x768xf32>
-    %507 = stablehlo.broadcast_in_dim %506, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %508 = stablehlo.broadcast_in_dim %arg56, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %509 = stablehlo.add %507, %508 : tensor<16x768xf32>
-    %510 = stablehlo.convert %509 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %511 = stablehlo.reshape %510 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %512 = stablehlo.reshape %511 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %513 = stablehlo.transpose %512, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %514 = stablehlo.dot_general %503, %arg57, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %515 = stablehlo.broadcast_in_dim %514, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %516 = stablehlo.multiply %515, %69 : tensor<16x768xf32>
-    %517 = stablehlo.broadcast_in_dim %516, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %518 = stablehlo.broadcast_in_dim %arg58, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %519 = stablehlo.add %517, %518 : tensor<16x768xf32>
-    %520 = stablehlo.convert %519 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %521 = stablehlo.reshape %520 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %522 = stablehlo.reshape %521 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %523 = stablehlo.transpose %522, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %524 = stablehlo.dot_general %503, %arg59, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %525 = stablehlo.broadcast_in_dim %524, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %526 = stablehlo.multiply %525, %69 : tensor<16x768xf32>
-    %527 = stablehlo.broadcast_in_dim %526, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %528 = stablehlo.broadcast_in_dim %arg60, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %529 = stablehlo.add %527, %528 : tensor<16x768xf32>
-    %530 = stablehlo.convert %529 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %531 = stablehlo.reshape %530 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %532 = stablehlo.reshape %531 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %533 = stablehlo.transpose %532, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %534 = stablehlo.convert %513 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %535 = stablehlo.convert %523 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %536 = stablehlo.convert %533 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %537 = stablehlo.broadcast_in_dim %534, dims = [0, 1, 2, 3] : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %538 = stablehlo.multiply %537, %104 : tensor<1x12x16x64xf32>
-    %539 = stablehlo.transpose %535, dims = [0, 1, 3, 2] : (tensor<1x12x16x64xf32>) -> tensor<1x12x64x16xf32>
-    %540 = stablehlo.broadcast_in_dim %539, dims = [0, 1, 2, 3] : (tensor<1x12x64x16xf32>) -> tensor<1x12x64x16xf32>
-    %541 = stablehlo.multiply %540, %108 : tensor<1x12x64x16xf32>
-    %542 = stablehlo.reshape %538 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %543 = stablehlo.reshape %541 : (tensor<1x12x64x16xf32>) -> tensor<12x64x16xf32>
-    %544 = stablehlo.broadcast_in_dim %543, dims = [0, 1, 2] : (tensor<12x64x16xf32>) -> tensor<12x64x16xf32>
-    %545 = stablehlo.dot_general %542, %544, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x64xf32>, tensor<12x64x16xf32>) -> tensor<12x16x16xf32>
-    %546 = stablehlo.reshape %545 : (tensor<12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %547 = stablehlo.broadcast_in_dim %546, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %548 = stablehlo.add %547, %117 : tensor<1x12x16x16xf32>
-    %549 = stablehlo.reduce(%548 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %550 = stablehlo.reshape %549 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %551 = stablehlo.broadcast_in_dim %548, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %552 = stablehlo.broadcast_in_dim %550, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %553 = stablehlo.subtract %551, %552 : tensor<1x12x16x16xf32>
-    %554 = stablehlo.exponential %553 : tensor<1x12x16x16xf32>
-    %555 = stablehlo.reduce(%554 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %556 = stablehlo.reshape %555 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %557 = stablehlo.broadcast_in_dim %554, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %558 = stablehlo.broadcast_in_dim %556, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %559 = stablehlo.divide %557, %558 : tensor<1x12x16x16xf32>
-    %560 = stablehlo.compare  EQ, %551, %133,  FLOAT : (tensor<1x12x16x16xf32>, tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xi1>
-    %561 = stablehlo.reduce(%560 init: %c) applies stablehlo.and across dimensions = [3] : (tensor<1x12x16x16xi1>, tensor<i1>) -> tensor<1x12x16xi1>
-    %562 = stablehlo.reshape %561 : (tensor<1x12x16xi1>) -> tensor<1x12x16x1xi1>
-    %563 = stablehlo.broadcast_in_dim %562, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xi1>) -> tensor<1x12x16x16xi1>
-    %564 = stablehlo.broadcast_in_dim %559, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %565 = stablehlo.select %563, %139, %564 : tensor<1x12x16x16xi1>, tensor<1x12x16x16xf32>
-    %566 = stablehlo.reshape %565 : (tensor<1x12x16x16xf32>) -> tensor<12x16x16xf32>
-    %567 = stablehlo.reshape %536 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %568 = stablehlo.broadcast_in_dim %567, dims = [0, 1, 2] : (tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %569 = stablehlo.dot_general %566, %568, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x16xf32>, tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %570 = stablehlo.reshape %569 : (tensor<12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %571 = stablehlo.convert %570 : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xbf16>
-    %572 = stablehlo.transpose %571, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %573 = stablehlo.transpose %572, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %574 = stablehlo.transpose %573, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %575 = stablehlo.reshape %574 : (tensor<1x16x12x64xbf16>) -> tensor<1x16x768xbf16>
-    %576 = stablehlo.reshape %575 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %577 = stablehlo.convert %576 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %578 = stablehlo.dot_general %577, %arg61, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %579 = stablehlo.broadcast_in_dim %578, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %580 = stablehlo.multiply %579, %69 : tensor<16x768xf32>
-    %581 = stablehlo.broadcast_in_dim %580, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %582 = stablehlo.broadcast_in_dim %arg62, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %583 = stablehlo.add %581, %582 : tensor<16x768xf32>
-    %584 = stablehlo.convert %583 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %585 = stablehlo.reshape %584 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %586 = stablehlo.add %585, %501 : tensor<1x16x768xbf16>
-    %587 = stablehlo.convert %586 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %588 = stablehlo.convert %587 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %589 = stablehlo.reduce(%588 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %590 = stablehlo.reshape %589 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %591 = stablehlo.broadcast_in_dim %590, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %592 = stablehlo.divide %591, %10 : tensor<1x16x1xf64>
-    %593 = stablehlo.broadcast_in_dim %588, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %594 = stablehlo.broadcast_in_dim %592, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %595 = stablehlo.subtract %593, %594 : tensor<1x16x768xf64>
-    %596 = stablehlo.multiply %595, %595 : tensor<1x16x768xf64>
-    %597 = stablehlo.reduce(%596 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %598 = stablehlo.reshape %597 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %599 = stablehlo.broadcast_in_dim %598, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %600 = stablehlo.divide %599, %10 : tensor<1x16x1xf64>
-    %601 = stablehlo.convert %600 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %602 = stablehlo.reduce(%587 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %603 = stablehlo.reshape %602 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %604 = stablehlo.broadcast_in_dim %603, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %605 = stablehlo.divide %604, %26 : tensor<1x16x1xf32>
-    %606 = stablehlo.broadcast_in_dim %601, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %607 = stablehlo.add %606, %31 : tensor<1x16x1xf32>
-    %608 = stablehlo.rsqrt %607 : tensor<1x16x1xf32>
-    %609 = stablehlo.broadcast_in_dim %587, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %610 = stablehlo.broadcast_in_dim %605, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %611 = stablehlo.subtract %609, %610 : tensor<1x16x768xf32>
-    %612 = stablehlo.broadcast_in_dim %611, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %613 = stablehlo.broadcast_in_dim %608, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %614 = stablehlo.multiply %612, %613 : tensor<1x16x768xf32>
-    %615 = stablehlo.convert %arg13 : (tensor<768xbf16>) -> tensor<768xf32>
-    %616 = stablehlo.broadcast_in_dim %614, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %617 = stablehlo.broadcast_in_dim %615, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %618 = stablehlo.multiply %616, %617 : tensor<1x16x768xf32>
-    %619 = stablehlo.convert %arg14 : (tensor<768xbf16>) -> tensor<768xf32>
-    %620 = stablehlo.broadcast_in_dim %618, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %621 = stablehlo.broadcast_in_dim %619, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %622 = stablehlo.add %620, %621 : tensor<1x16x768xf32>
-    %623 = stablehlo.convert %622 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %624 = stablehlo.reshape %623 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %625 = stablehlo.convert %624 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %626 = stablehlo.dot_general %625, %arg63, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x3072xf32>) -> tensor<16x3072xf32>
-    %627 = stablehlo.broadcast_in_dim %626, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %628 = stablehlo.multiply %627, %204 : tensor<16x3072xf32>
-    %629 = stablehlo.broadcast_in_dim %628, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %630 = stablehlo.broadcast_in_dim %arg64, dims = [1] : (tensor<3072xf32>) -> tensor<16x3072xf32>
-    %631 = stablehlo.add %629, %630 : tensor<16x3072xf32>
-    %632 = stablehlo.convert %631 : (tensor<16x3072xf32>) -> tensor<16x3072xbf16>
-    %633 = stablehlo.reshape %632 : (tensor<16x3072xbf16>) -> tensor<1x16x3072xbf16>
-    %634 = stablehlo.multiply %633, %cst_4 : tensor<1x16x3072xbf16>
-    %635 = stablehlo.multiply %633, %212 : tensor<1x16x3072xbf16>
-    %636 = stablehlo.convert %635 : (tensor<1x16x3072xbf16>) -> tensor<1x16x3072xf32>
-    %637 = stablehlo.clamp %cst_5, %636, %cst_6 : tensor<1x16x3072xf32>
-    %638 = stablehlo.multiply %637, %637 : tensor<1x16x3072xf32>
-    %639 = stablehlo.multiply %cst_7, %638 : tensor<1x16x3072xf32>
-    %640 = stablehlo.add %639, %cst_8 : tensor<1x16x3072xf32>
-    %641 = stablehlo.multiply %640, %638 : tensor<1x16x3072xf32>
-    %642 = stablehlo.add %641, %cst_9 : tensor<1x16x3072xf32>
-    %643 = stablehlo.multiply %642, %638 : tensor<1x16x3072xf32>
-    %644 = stablehlo.add %643, %cst_10 : tensor<1x16x3072xf32>
-    %645 = stablehlo.multiply %644, %638 : tensor<1x16x3072xf32>
-    %646 = stablehlo.add %645, %cst_11 : tensor<1x16x3072xf32>
-    %647 = stablehlo.multiply %646, %638 : tensor<1x16x3072xf32>
-    %648 = stablehlo.add %647, %cst_12 : tensor<1x16x3072xf32>
-    %649 = stablehlo.multiply %648, %638 : tensor<1x16x3072xf32>
-    %650 = stablehlo.add %649, %cst_13 : tensor<1x16x3072xf32>
-    %651 = stablehlo.multiply %cst_14, %638 : tensor<1x16x3072xf32>
-    %652 = stablehlo.add %651, %cst_15 : tensor<1x16x3072xf32>
-    %653 = stablehlo.multiply %652, %638 : tensor<1x16x3072xf32>
-    %654 = stablehlo.add %653, %cst_16 : tensor<1x16x3072xf32>
-    %655 = stablehlo.multiply %654, %638 : tensor<1x16x3072xf32>
-    %656 = stablehlo.add %655, %cst_17 : tensor<1x16x3072xf32>
-    %657 = stablehlo.multiply %656, %638 : tensor<1x16x3072xf32>
-    %658 = stablehlo.add %657, %cst_18 : tensor<1x16x3072xf32>
-    %659 = stablehlo.multiply %637, %650 : tensor<1x16x3072xf32>
-    %660 = stablehlo.divide %659, %658 : tensor<1x16x3072xf32>
-    %661 = stablehlo.clamp %cst_19, %660, %cst_20 : tensor<1x16x3072xf32>
-    %662 = stablehlo.convert %661 : (tensor<1x16x3072xf32>) -> tensor<1x16x3072xbf16>
-    %663 = stablehlo.add %662, %cst_2 : tensor<1x16x3072xbf16>
-    %664 = stablehlo.multiply %663, %634 : tensor<1x16x3072xbf16>
-    %665 = stablehlo.reshape %664 : (tensor<1x16x3072xbf16>) -> tensor<16x3072xbf16>
-    %666 = stablehlo.convert %665 : (tensor<16x3072xbf16>) -> tensor<16x3072xf32>
-    %667 = stablehlo.dot_general %666, %arg65, contracting_dims = [1] x [0] : (tensor<16x3072xf32>, tensor<3072x768xf32>) -> tensor<16x768xf32>
-    %668 = stablehlo.broadcast_in_dim %667, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %669 = stablehlo.multiply %668, %69 : tensor<16x768xf32>
-    %670 = stablehlo.broadcast_in_dim %669, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %671 = stablehlo.broadcast_in_dim %arg66, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %672 = stablehlo.add %670, %671 : tensor<16x768xf32>
-    %673 = stablehlo.convert %672 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %674 = stablehlo.reshape %673 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %675 = stablehlo.add %674, %623 : tensor<1x16x768xbf16>
-    %676 = stablehlo.convert %675 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %677 = stablehlo.convert %676 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %678 = stablehlo.reduce(%677 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %679 = stablehlo.reshape %678 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %680 = stablehlo.broadcast_in_dim %679, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %681 = stablehlo.divide %680, %10 : tensor<1x16x1xf64>
-    %682 = stablehlo.broadcast_in_dim %677, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %683 = stablehlo.broadcast_in_dim %681, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %684 = stablehlo.subtract %682, %683 : tensor<1x16x768xf64>
-    %685 = stablehlo.multiply %684, %684 : tensor<1x16x768xf64>
-    %686 = stablehlo.reduce(%685 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %687 = stablehlo.reshape %686 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %688 = stablehlo.broadcast_in_dim %687, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %689 = stablehlo.divide %688, %10 : tensor<1x16x1xf64>
-    %690 = stablehlo.convert %689 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %691 = stablehlo.reduce(%676 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %692 = stablehlo.reshape %691 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %693 = stablehlo.broadcast_in_dim %692, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %694 = stablehlo.divide %693, %26 : tensor<1x16x1xf32>
-    %695 = stablehlo.broadcast_in_dim %690, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %696 = stablehlo.add %695, %31 : tensor<1x16x1xf32>
-    %697 = stablehlo.rsqrt %696 : tensor<1x16x1xf32>
-    %698 = stablehlo.broadcast_in_dim %676, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %699 = stablehlo.broadcast_in_dim %694, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %700 = stablehlo.subtract %698, %699 : tensor<1x16x768xf32>
-    %701 = stablehlo.broadcast_in_dim %700, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %702 = stablehlo.broadcast_in_dim %697, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %703 = stablehlo.multiply %701, %702 : tensor<1x16x768xf32>
-    %704 = stablehlo.convert %arg15 : (tensor<768xbf16>) -> tensor<768xf32>
-    %705 = stablehlo.broadcast_in_dim %703, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %706 = stablehlo.broadcast_in_dim %704, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %707 = stablehlo.multiply %705, %706 : tensor<1x16x768xf32>
-    %708 = stablehlo.convert %arg16 : (tensor<768xbf16>) -> tensor<768xf32>
-    %709 = stablehlo.broadcast_in_dim %707, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %710 = stablehlo.broadcast_in_dim %708, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %711 = stablehlo.add %709, %710 : tensor<1x16x768xf32>
-    %712 = stablehlo.convert %711 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %713 = stablehlo.reshape %712 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %714 = stablehlo.convert %713 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %715 = stablehlo.dot_general %714, %arg67, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %716 = stablehlo.broadcast_in_dim %715, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %717 = stablehlo.multiply %716, %69 : tensor<16x768xf32>
-    %718 = stablehlo.broadcast_in_dim %717, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %719 = stablehlo.broadcast_in_dim %arg68, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %720 = stablehlo.add %718, %719 : tensor<16x768xf32>
-    %721 = stablehlo.convert %720 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %722 = stablehlo.reshape %721 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %723 = stablehlo.reshape %722 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %724 = stablehlo.transpose %723, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %725 = stablehlo.dot_general %714, %arg69, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %726 = stablehlo.broadcast_in_dim %725, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %727 = stablehlo.multiply %726, %69 : tensor<16x768xf32>
-    %728 = stablehlo.broadcast_in_dim %727, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %729 = stablehlo.broadcast_in_dim %arg70, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %730 = stablehlo.add %728, %729 : tensor<16x768xf32>
-    %731 = stablehlo.convert %730 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %732 = stablehlo.reshape %731 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %733 = stablehlo.reshape %732 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %734 = stablehlo.transpose %733, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %735 = stablehlo.dot_general %714, %arg71, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %736 = stablehlo.broadcast_in_dim %735, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %737 = stablehlo.multiply %736, %69 : tensor<16x768xf32>
-    %738 = stablehlo.broadcast_in_dim %737, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %739 = stablehlo.broadcast_in_dim %arg72, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %740 = stablehlo.add %738, %739 : tensor<16x768xf32>
-    %741 = stablehlo.convert %740 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %742 = stablehlo.reshape %741 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %743 = stablehlo.reshape %742 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %744 = stablehlo.transpose %743, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %745 = stablehlo.convert %724 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %746 = stablehlo.convert %734 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %747 = stablehlo.convert %744 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %748 = stablehlo.broadcast_in_dim %745, dims = [0, 1, 2, 3] : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %749 = stablehlo.multiply %748, %104 : tensor<1x12x16x64xf32>
-    %750 = stablehlo.transpose %746, dims = [0, 1, 3, 2] : (tensor<1x12x16x64xf32>) -> tensor<1x12x64x16xf32>
-    %751 = stablehlo.broadcast_in_dim %750, dims = [0, 1, 2, 3] : (tensor<1x12x64x16xf32>) -> tensor<1x12x64x16xf32>
-    %752 = stablehlo.multiply %751, %108 : tensor<1x12x64x16xf32>
-    %753 = stablehlo.reshape %749 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %754 = stablehlo.reshape %752 : (tensor<1x12x64x16xf32>) -> tensor<12x64x16xf32>
-    %755 = stablehlo.broadcast_in_dim %754, dims = [0, 1, 2] : (tensor<12x64x16xf32>) -> tensor<12x64x16xf32>
-    %756 = stablehlo.dot_general %753, %755, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x64xf32>, tensor<12x64x16xf32>) -> tensor<12x16x16xf32>
-    %757 = stablehlo.reshape %756 : (tensor<12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %758 = stablehlo.broadcast_in_dim %757, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %759 = stablehlo.add %758, %117 : tensor<1x12x16x16xf32>
-    %760 = stablehlo.reduce(%759 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %761 = stablehlo.reshape %760 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %762 = stablehlo.broadcast_in_dim %759, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %763 = stablehlo.broadcast_in_dim %761, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %764 = stablehlo.subtract %762, %763 : tensor<1x12x16x16xf32>
-    %765 = stablehlo.exponential %764 : tensor<1x12x16x16xf32>
-    %766 = stablehlo.reduce(%765 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %767 = stablehlo.reshape %766 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %768 = stablehlo.broadcast_in_dim %765, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %769 = stablehlo.broadcast_in_dim %767, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %770 = stablehlo.divide %768, %769 : tensor<1x12x16x16xf32>
-    %771 = stablehlo.compare  EQ, %762, %133,  FLOAT : (tensor<1x12x16x16xf32>, tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xi1>
-    %772 = stablehlo.reduce(%771 init: %c) applies stablehlo.and across dimensions = [3] : (tensor<1x12x16x16xi1>, tensor<i1>) -> tensor<1x12x16xi1>
-    %773 = stablehlo.reshape %772 : (tensor<1x12x16xi1>) -> tensor<1x12x16x1xi1>
-    %774 = stablehlo.broadcast_in_dim %773, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xi1>) -> tensor<1x12x16x16xi1>
-    %775 = stablehlo.broadcast_in_dim %770, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %776 = stablehlo.select %774, %139, %775 : tensor<1x12x16x16xi1>, tensor<1x12x16x16xf32>
-    %777 = stablehlo.reshape %776 : (tensor<1x12x16x16xf32>) -> tensor<12x16x16xf32>
-    %778 = stablehlo.reshape %747 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %779 = stablehlo.broadcast_in_dim %778, dims = [0, 1, 2] : (tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %780 = stablehlo.dot_general %777, %779, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x16xf32>, tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %781 = stablehlo.reshape %780 : (tensor<12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %782 = stablehlo.convert %781 : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xbf16>
-    %783 = stablehlo.transpose %782, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %784 = stablehlo.transpose %783, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %785 = stablehlo.transpose %784, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %786 = stablehlo.reshape %785 : (tensor<1x16x12x64xbf16>) -> tensor<1x16x768xbf16>
-    %787 = stablehlo.reshape %786 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %788 = stablehlo.convert %787 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %789 = stablehlo.dot_general %788, %arg73, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %790 = stablehlo.broadcast_in_dim %789, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %791 = stablehlo.multiply %790, %69 : tensor<16x768xf32>
-    %792 = stablehlo.broadcast_in_dim %791, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %793 = stablehlo.broadcast_in_dim %arg74, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %794 = stablehlo.add %792, %793 : tensor<16x768xf32>
-    %795 = stablehlo.convert %794 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %796 = stablehlo.reshape %795 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %797 = stablehlo.add %796, %712 : tensor<1x16x768xbf16>
-    %798 = stablehlo.convert %797 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %799 = stablehlo.convert %798 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %800 = stablehlo.reduce(%799 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %801 = stablehlo.reshape %800 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %802 = stablehlo.broadcast_in_dim %801, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %803 = stablehlo.divide %802, %10 : tensor<1x16x1xf64>
-    %804 = stablehlo.broadcast_in_dim %799, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %805 = stablehlo.broadcast_in_dim %803, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %806 = stablehlo.subtract %804, %805 : tensor<1x16x768xf64>
-    %807 = stablehlo.multiply %806, %806 : tensor<1x16x768xf64>
-    %808 = stablehlo.reduce(%807 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %809 = stablehlo.reshape %808 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %810 = stablehlo.broadcast_in_dim %809, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %811 = stablehlo.divide %810, %10 : tensor<1x16x1xf64>
-    %812 = stablehlo.convert %811 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %813 = stablehlo.reduce(%798 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %814 = stablehlo.reshape %813 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %815 = stablehlo.broadcast_in_dim %814, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %816 = stablehlo.divide %815, %26 : tensor<1x16x1xf32>
-    %817 = stablehlo.broadcast_in_dim %812, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %818 = stablehlo.add %817, %31 : tensor<1x16x1xf32>
-    %819 = stablehlo.rsqrt %818 : tensor<1x16x1xf32>
-    %820 = stablehlo.broadcast_in_dim %798, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %821 = stablehlo.broadcast_in_dim %816, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %822 = stablehlo.subtract %820, %821 : tensor<1x16x768xf32>
-    %823 = stablehlo.broadcast_in_dim %822, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %824 = stablehlo.broadcast_in_dim %819, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %825 = stablehlo.multiply %823, %824 : tensor<1x16x768xf32>
-    %826 = stablehlo.convert %arg17 : (tensor<768xbf16>) -> tensor<768xf32>
-    %827 = stablehlo.broadcast_in_dim %825, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %828 = stablehlo.broadcast_in_dim %826, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %829 = stablehlo.multiply %827, %828 : tensor<1x16x768xf32>
-    %830 = stablehlo.convert %arg18 : (tensor<768xbf16>) -> tensor<768xf32>
-    %831 = stablehlo.broadcast_in_dim %829, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %832 = stablehlo.broadcast_in_dim %830, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %833 = stablehlo.add %831, %832 : tensor<1x16x768xf32>
-    %834 = stablehlo.convert %833 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %835 = stablehlo.reshape %834 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %836 = stablehlo.convert %835 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %837 = stablehlo.dot_general %836, %arg75, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x3072xf32>) -> tensor<16x3072xf32>
-    %838 = stablehlo.broadcast_in_dim %837, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %839 = stablehlo.multiply %838, %204 : tensor<16x3072xf32>
-    %840 = stablehlo.broadcast_in_dim %839, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %841 = stablehlo.broadcast_in_dim %arg76, dims = [1] : (tensor<3072xf32>) -> tensor<16x3072xf32>
-    %842 = stablehlo.add %840, %841 : tensor<16x3072xf32>
-    %843 = stablehlo.convert %842 : (tensor<16x3072xf32>) -> tensor<16x3072xbf16>
-    %844 = stablehlo.reshape %843 : (tensor<16x3072xbf16>) -> tensor<1x16x3072xbf16>
-    %845 = stablehlo.multiply %844, %cst_4 : tensor<1x16x3072xbf16>
-    %846 = stablehlo.multiply %844, %212 : tensor<1x16x3072xbf16>
-    %847 = stablehlo.convert %846 : (tensor<1x16x3072xbf16>) -> tensor<1x16x3072xf32>
-    %848 = stablehlo.clamp %cst_5, %847, %cst_6 : tensor<1x16x3072xf32>
-    %849 = stablehlo.multiply %848, %848 : tensor<1x16x3072xf32>
-    %850 = stablehlo.multiply %cst_7, %849 : tensor<1x16x3072xf32>
-    %851 = stablehlo.add %850, %cst_8 : tensor<1x16x3072xf32>
-    %852 = stablehlo.multiply %851, %849 : tensor<1x16x3072xf32>
-    %853 = stablehlo.add %852, %cst_9 : tensor<1x16x3072xf32>
-    %854 = stablehlo.multiply %853, %849 : tensor<1x16x3072xf32>
-    %855 = stablehlo.add %854, %cst_10 : tensor<1x16x3072xf32>
-    %856 = stablehlo.multiply %855, %849 : tensor<1x16x3072xf32>
-    %857 = stablehlo.add %856, %cst_11 : tensor<1x16x3072xf32>
-    %858 = stablehlo.multiply %857, %849 : tensor<1x16x3072xf32>
-    %859 = stablehlo.add %858, %cst_12 : tensor<1x16x3072xf32>
-    %860 = stablehlo.multiply %859, %849 : tensor<1x16x3072xf32>
-    %861 = stablehlo.add %860, %cst_13 : tensor<1x16x3072xf32>
-    %862 = stablehlo.multiply %cst_14, %849 : tensor<1x16x3072xf32>
-    %863 = stablehlo.add %862, %cst_15 : tensor<1x16x3072xf32>
-    %864 = stablehlo.multiply %863, %849 : tensor<1x16x3072xf32>
-    %865 = stablehlo.add %864, %cst_16 : tensor<1x16x3072xf32>
-    %866 = stablehlo.multiply %865, %849 : tensor<1x16x3072xf32>
-    %867 = stablehlo.add %866, %cst_17 : tensor<1x16x3072xf32>
-    %868 = stablehlo.multiply %867, %849 : tensor<1x16x3072xf32>
-    %869 = stablehlo.add %868, %cst_18 : tensor<1x16x3072xf32>
-    %870 = stablehlo.multiply %848, %861 : tensor<1x16x3072xf32>
-    %871 = stablehlo.divide %870, %869 : tensor<1x16x3072xf32>
-    %872 = stablehlo.clamp %cst_19, %871, %cst_20 : tensor<1x16x3072xf32>
-    %873 = stablehlo.convert %872 : (tensor<1x16x3072xf32>) -> tensor<1x16x3072xbf16>
-    %874 = stablehlo.add %873, %cst_2 : tensor<1x16x3072xbf16>
-    %875 = stablehlo.multiply %874, %845 : tensor<1x16x3072xbf16>
-    %876 = stablehlo.reshape %875 : (tensor<1x16x3072xbf16>) -> tensor<16x3072xbf16>
-    %877 = stablehlo.convert %876 : (tensor<16x3072xbf16>) -> tensor<16x3072xf32>
-    %878 = stablehlo.dot_general %877, %arg77, contracting_dims = [1] x [0] : (tensor<16x3072xf32>, tensor<3072x768xf32>) -> tensor<16x768xf32>
-    %879 = stablehlo.broadcast_in_dim %878, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %880 = stablehlo.multiply %879, %69 : tensor<16x768xf32>
-    %881 = stablehlo.broadcast_in_dim %880, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %882 = stablehlo.broadcast_in_dim %arg78, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %883 = stablehlo.add %881, %882 : tensor<16x768xf32>
-    %884 = stablehlo.convert %883 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %885 = stablehlo.reshape %884 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %886 = stablehlo.add %885, %834 : tensor<1x16x768xbf16>
-    %887 = stablehlo.convert %886 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %888 = stablehlo.convert %887 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %889 = stablehlo.reduce(%888 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %890 = stablehlo.reshape %889 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %891 = stablehlo.broadcast_in_dim %890, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %892 = stablehlo.divide %891, %10 : tensor<1x16x1xf64>
-    %893 = stablehlo.broadcast_in_dim %888, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %894 = stablehlo.broadcast_in_dim %892, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %895 = stablehlo.subtract %893, %894 : tensor<1x16x768xf64>
-    %896 = stablehlo.multiply %895, %895 : tensor<1x16x768xf64>
-    %897 = stablehlo.reduce(%896 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %898 = stablehlo.reshape %897 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %899 = stablehlo.broadcast_in_dim %898, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %900 = stablehlo.divide %899, %10 : tensor<1x16x1xf64>
-    %901 = stablehlo.convert %900 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %902 = stablehlo.reduce(%887 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %903 = stablehlo.reshape %902 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %904 = stablehlo.broadcast_in_dim %903, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %905 = stablehlo.divide %904, %26 : tensor<1x16x1xf32>
-    %906 = stablehlo.broadcast_in_dim %901, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %907 = stablehlo.add %906, %31 : tensor<1x16x1xf32>
-    %908 = stablehlo.rsqrt %907 : tensor<1x16x1xf32>
-    %909 = stablehlo.broadcast_in_dim %887, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %910 = stablehlo.broadcast_in_dim %905, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %911 = stablehlo.subtract %909, %910 : tensor<1x16x768xf32>
-    %912 = stablehlo.broadcast_in_dim %911, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %913 = stablehlo.broadcast_in_dim %908, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %914 = stablehlo.multiply %912, %913 : tensor<1x16x768xf32>
-    %915 = stablehlo.convert %arg19 : (tensor<768xbf16>) -> tensor<768xf32>
-    %916 = stablehlo.broadcast_in_dim %914, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %917 = stablehlo.broadcast_in_dim %915, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %918 = stablehlo.multiply %916, %917 : tensor<1x16x768xf32>
-    %919 = stablehlo.convert %arg20 : (tensor<768xbf16>) -> tensor<768xf32>
-    %920 = stablehlo.broadcast_in_dim %918, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %921 = stablehlo.broadcast_in_dim %919, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %922 = stablehlo.add %920, %921 : tensor<1x16x768xf32>
-    %923 = stablehlo.convert %922 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %924 = stablehlo.reshape %923 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %925 = stablehlo.convert %924 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %926 = stablehlo.dot_general %925, %arg79, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %927 = stablehlo.broadcast_in_dim %926, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %928 = stablehlo.multiply %927, %69 : tensor<16x768xf32>
-    %929 = stablehlo.broadcast_in_dim %928, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %930 = stablehlo.broadcast_in_dim %arg80, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %931 = stablehlo.add %929, %930 : tensor<16x768xf32>
-    %932 = stablehlo.convert %931 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %933 = stablehlo.reshape %932 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %934 = stablehlo.reshape %933 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %935 = stablehlo.transpose %934, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %936 = stablehlo.dot_general %925, %arg81, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %937 = stablehlo.broadcast_in_dim %936, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %938 = stablehlo.multiply %937, %69 : tensor<16x768xf32>
-    %939 = stablehlo.broadcast_in_dim %938, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %940 = stablehlo.broadcast_in_dim %arg82, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %941 = stablehlo.add %939, %940 : tensor<16x768xf32>
-    %942 = stablehlo.convert %941 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %943 = stablehlo.reshape %942 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %944 = stablehlo.reshape %943 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %945 = stablehlo.transpose %944, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %946 = stablehlo.dot_general %925, %arg83, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %947 = stablehlo.broadcast_in_dim %946, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %948 = stablehlo.multiply %947, %69 : tensor<16x768xf32>
-    %949 = stablehlo.broadcast_in_dim %948, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %950 = stablehlo.broadcast_in_dim %arg84, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %951 = stablehlo.add %949, %950 : tensor<16x768xf32>
-    %952 = stablehlo.convert %951 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %953 = stablehlo.reshape %952 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %954 = stablehlo.reshape %953 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %955 = stablehlo.transpose %954, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %956 = stablehlo.convert %935 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %957 = stablehlo.convert %945 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %958 = stablehlo.convert %955 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %959 = stablehlo.broadcast_in_dim %956, dims = [0, 1, 2, 3] : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %960 = stablehlo.multiply %959, %104 : tensor<1x12x16x64xf32>
-    %961 = stablehlo.transpose %957, dims = [0, 1, 3, 2] : (tensor<1x12x16x64xf32>) -> tensor<1x12x64x16xf32>
-    %962 = stablehlo.broadcast_in_dim %961, dims = [0, 1, 2, 3] : (tensor<1x12x64x16xf32>) -> tensor<1x12x64x16xf32>
-    %963 = stablehlo.multiply %962, %108 : tensor<1x12x64x16xf32>
-    %964 = stablehlo.reshape %960 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %965 = stablehlo.reshape %963 : (tensor<1x12x64x16xf32>) -> tensor<12x64x16xf32>
-    %966 = stablehlo.broadcast_in_dim %965, dims = [0, 1, 2] : (tensor<12x64x16xf32>) -> tensor<12x64x16xf32>
-    %967 = stablehlo.dot_general %964, %966, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x64xf32>, tensor<12x64x16xf32>) -> tensor<12x16x16xf32>
-    %968 = stablehlo.reshape %967 : (tensor<12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %969 = stablehlo.broadcast_in_dim %968, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %970 = stablehlo.add %969, %117 : tensor<1x12x16x16xf32>
-    %971 = stablehlo.reduce(%970 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %972 = stablehlo.reshape %971 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %973 = stablehlo.broadcast_in_dim %970, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %974 = stablehlo.broadcast_in_dim %972, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %975 = stablehlo.subtract %973, %974 : tensor<1x12x16x16xf32>
-    %976 = stablehlo.exponential %975 : tensor<1x12x16x16xf32>
-    %977 = stablehlo.reduce(%976 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %978 = stablehlo.reshape %977 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %979 = stablehlo.broadcast_in_dim %976, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %980 = stablehlo.broadcast_in_dim %978, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %981 = stablehlo.divide %979, %980 : tensor<1x12x16x16xf32>
-    %982 = stablehlo.compare  EQ, %973, %133,  FLOAT : (tensor<1x12x16x16xf32>, tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xi1>
-    %983 = stablehlo.reduce(%982 init: %c) applies stablehlo.and across dimensions = [3] : (tensor<1x12x16x16xi1>, tensor<i1>) -> tensor<1x12x16xi1>
-    %984 = stablehlo.reshape %983 : (tensor<1x12x16xi1>) -> tensor<1x12x16x1xi1>
-    %985 = stablehlo.broadcast_in_dim %984, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xi1>) -> tensor<1x12x16x16xi1>
-    %986 = stablehlo.broadcast_in_dim %981, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %987 = stablehlo.select %985, %139, %986 : tensor<1x12x16x16xi1>, tensor<1x12x16x16xf32>
-    %988 = stablehlo.reshape %987 : (tensor<1x12x16x16xf32>) -> tensor<12x16x16xf32>
-    %989 = stablehlo.reshape %958 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %990 = stablehlo.broadcast_in_dim %989, dims = [0, 1, 2] : (tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %991 = stablehlo.dot_general %988, %990, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x16xf32>, tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %992 = stablehlo.reshape %991 : (tensor<12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %993 = stablehlo.convert %992 : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xbf16>
-    %994 = stablehlo.transpose %993, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %995 = stablehlo.transpose %994, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %996 = stablehlo.transpose %995, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %997 = stablehlo.reshape %996 : (tensor<1x16x12x64xbf16>) -> tensor<1x16x768xbf16>
-    %998 = stablehlo.reshape %997 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %999 = stablehlo.convert %998 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %1000 = stablehlo.dot_general %999, %arg85, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %1001 = stablehlo.broadcast_in_dim %1000, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1002 = stablehlo.multiply %1001, %69 : tensor<16x768xf32>
-    %1003 = stablehlo.broadcast_in_dim %1002, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1004 = stablehlo.broadcast_in_dim %arg86, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %1005 = stablehlo.add %1003, %1004 : tensor<16x768xf32>
-    %1006 = stablehlo.convert %1005 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %1007 = stablehlo.reshape %1006 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %1008 = stablehlo.add %1007, %923 : tensor<1x16x768xbf16>
-    %1009 = stablehlo.convert %1008 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %1010 = stablehlo.convert %1009 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %1011 = stablehlo.reduce(%1010 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1012 = stablehlo.reshape %1011 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1013 = stablehlo.broadcast_in_dim %1012, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1014 = stablehlo.divide %1013, %10 : tensor<1x16x1xf64>
-    %1015 = stablehlo.broadcast_in_dim %1010, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %1016 = stablehlo.broadcast_in_dim %1014, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %1017 = stablehlo.subtract %1015, %1016 : tensor<1x16x768xf64>
-    %1018 = stablehlo.multiply %1017, %1017 : tensor<1x16x768xf64>
-    %1019 = stablehlo.reduce(%1018 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1020 = stablehlo.reshape %1019 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1021 = stablehlo.broadcast_in_dim %1020, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1022 = stablehlo.divide %1021, %10 : tensor<1x16x1xf64>
-    %1023 = stablehlo.convert %1022 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %1024 = stablehlo.reduce(%1009 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %1025 = stablehlo.reshape %1024 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %1026 = stablehlo.broadcast_in_dim %1025, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1027 = stablehlo.divide %1026, %26 : tensor<1x16x1xf32>
-    %1028 = stablehlo.broadcast_in_dim %1023, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1029 = stablehlo.add %1028, %31 : tensor<1x16x1xf32>
-    %1030 = stablehlo.rsqrt %1029 : tensor<1x16x1xf32>
-    %1031 = stablehlo.broadcast_in_dim %1009, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1032 = stablehlo.broadcast_in_dim %1027, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1033 = stablehlo.subtract %1031, %1032 : tensor<1x16x768xf32>
-    %1034 = stablehlo.broadcast_in_dim %1033, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1035 = stablehlo.broadcast_in_dim %1030, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1036 = stablehlo.multiply %1034, %1035 : tensor<1x16x768xf32>
-    %1037 = stablehlo.convert %arg21 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1038 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1039 = stablehlo.broadcast_in_dim %1037, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1040 = stablehlo.multiply %1038, %1039 : tensor<1x16x768xf32>
-    %1041 = stablehlo.convert %arg22 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1042 = stablehlo.broadcast_in_dim %1040, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1043 = stablehlo.broadcast_in_dim %1041, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1044 = stablehlo.add %1042, %1043 : tensor<1x16x768xf32>
-    %1045 = stablehlo.convert %1044 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %1046 = stablehlo.reshape %1045 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %1047 = stablehlo.convert %1046 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %1048 = stablehlo.dot_general %1047, %arg87, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x3072xf32>) -> tensor<16x3072xf32>
-    %1049 = stablehlo.broadcast_in_dim %1048, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %1050 = stablehlo.multiply %1049, %204 : tensor<16x3072xf32>
-    %1051 = stablehlo.broadcast_in_dim %1050, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %1052 = stablehlo.broadcast_in_dim %arg88, dims = [1] : (tensor<3072xf32>) -> tensor<16x3072xf32>
-    %1053 = stablehlo.add %1051, %1052 : tensor<16x3072xf32>
-    %1054 = stablehlo.convert %1053 : (tensor<16x3072xf32>) -> tensor<16x3072xbf16>
-    %1055 = stablehlo.reshape %1054 : (tensor<16x3072xbf16>) -> tensor<1x16x3072xbf16>
-    %1056 = stablehlo.multiply %1055, %cst_4 : tensor<1x16x3072xbf16>
-    %1057 = stablehlo.multiply %1055, %212 : tensor<1x16x3072xbf16>
-    %1058 = stablehlo.convert %1057 : (tensor<1x16x3072xbf16>) -> tensor<1x16x3072xf32>
-    %1059 = stablehlo.clamp %cst_5, %1058, %cst_6 : tensor<1x16x3072xf32>
-    %1060 = stablehlo.multiply %1059, %1059 : tensor<1x16x3072xf32>
-    %1061 = stablehlo.multiply %cst_7, %1060 : tensor<1x16x3072xf32>
-    %1062 = stablehlo.add %1061, %cst_8 : tensor<1x16x3072xf32>
-    %1063 = stablehlo.multiply %1062, %1060 : tensor<1x16x3072xf32>
-    %1064 = stablehlo.add %1063, %cst_9 : tensor<1x16x3072xf32>
-    %1065 = stablehlo.multiply %1064, %1060 : tensor<1x16x3072xf32>
-    %1066 = stablehlo.add %1065, %cst_10 : tensor<1x16x3072xf32>
-    %1067 = stablehlo.multiply %1066, %1060 : tensor<1x16x3072xf32>
-    %1068 = stablehlo.add %1067, %cst_11 : tensor<1x16x3072xf32>
-    %1069 = stablehlo.multiply %1068, %1060 : tensor<1x16x3072xf32>
-    %1070 = stablehlo.add %1069, %cst_12 : tensor<1x16x3072xf32>
-    %1071 = stablehlo.multiply %1070, %1060 : tensor<1x16x3072xf32>
-    %1072 = stablehlo.add %1071, %cst_13 : tensor<1x16x3072xf32>
-    %1073 = stablehlo.multiply %cst_14, %1060 : tensor<1x16x3072xf32>
-    %1074 = stablehlo.add %1073, %cst_15 : tensor<1x16x3072xf32>
-    %1075 = stablehlo.multiply %1074, %1060 : tensor<1x16x3072xf32>
-    %1076 = stablehlo.add %1075, %cst_16 : tensor<1x16x3072xf32>
-    %1077 = stablehlo.multiply %1076, %1060 : tensor<1x16x3072xf32>
-    %1078 = stablehlo.add %1077, %cst_17 : tensor<1x16x3072xf32>
-    %1079 = stablehlo.multiply %1078, %1060 : tensor<1x16x3072xf32>
-    %1080 = stablehlo.add %1079, %cst_18 : tensor<1x16x3072xf32>
-    %1081 = stablehlo.multiply %1059, %1072 : tensor<1x16x3072xf32>
-    %1082 = stablehlo.divide %1081, %1080 : tensor<1x16x3072xf32>
-    %1083 = stablehlo.clamp %cst_19, %1082, %cst_20 : tensor<1x16x3072xf32>
-    %1084 = stablehlo.convert %1083 : (tensor<1x16x3072xf32>) -> tensor<1x16x3072xbf16>
-    %1085 = stablehlo.add %1084, %cst_2 : tensor<1x16x3072xbf16>
-    %1086 = stablehlo.multiply %1085, %1056 : tensor<1x16x3072xbf16>
-    %1087 = stablehlo.reshape %1086 : (tensor<1x16x3072xbf16>) -> tensor<16x3072xbf16>
-    %1088 = stablehlo.convert %1087 : (tensor<16x3072xbf16>) -> tensor<16x3072xf32>
-    %1089 = stablehlo.dot_general %1088, %arg89, contracting_dims = [1] x [0] : (tensor<16x3072xf32>, tensor<3072x768xf32>) -> tensor<16x768xf32>
-    %1090 = stablehlo.broadcast_in_dim %1089, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1091 = stablehlo.multiply %1090, %69 : tensor<16x768xf32>
-    %1092 = stablehlo.broadcast_in_dim %1091, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1093 = stablehlo.broadcast_in_dim %arg90, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %1094 = stablehlo.add %1092, %1093 : tensor<16x768xf32>
-    %1095 = stablehlo.convert %1094 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %1096 = stablehlo.reshape %1095 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %1097 = stablehlo.add %1096, %1045 : tensor<1x16x768xbf16>
-    %1098 = stablehlo.convert %1097 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %1099 = stablehlo.convert %1098 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %1100 = stablehlo.reduce(%1099 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1101 = stablehlo.reshape %1100 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1102 = stablehlo.broadcast_in_dim %1101, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1103 = stablehlo.divide %1102, %10 : tensor<1x16x1xf64>
-    %1104 = stablehlo.broadcast_in_dim %1099, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %1105 = stablehlo.broadcast_in_dim %1103, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %1106 = stablehlo.subtract %1104, %1105 : tensor<1x16x768xf64>
-    %1107 = stablehlo.multiply %1106, %1106 : tensor<1x16x768xf64>
-    %1108 = stablehlo.reduce(%1107 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1109 = stablehlo.reshape %1108 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1110 = stablehlo.broadcast_in_dim %1109, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1111 = stablehlo.divide %1110, %10 : tensor<1x16x1xf64>
-    %1112 = stablehlo.convert %1111 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %1113 = stablehlo.reduce(%1098 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %1114 = stablehlo.reshape %1113 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %1115 = stablehlo.broadcast_in_dim %1114, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1116 = stablehlo.divide %1115, %26 : tensor<1x16x1xf32>
-    %1117 = stablehlo.broadcast_in_dim %1112, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1118 = stablehlo.add %1117, %31 : tensor<1x16x1xf32>
-    %1119 = stablehlo.rsqrt %1118 : tensor<1x16x1xf32>
-    %1120 = stablehlo.broadcast_in_dim %1098, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1121 = stablehlo.broadcast_in_dim %1116, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1122 = stablehlo.subtract %1120, %1121 : tensor<1x16x768xf32>
-    %1123 = stablehlo.broadcast_in_dim %1122, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1124 = stablehlo.broadcast_in_dim %1119, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1125 = stablehlo.multiply %1123, %1124 : tensor<1x16x768xf32>
-    %1126 = stablehlo.convert %arg23 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1127 = stablehlo.broadcast_in_dim %1125, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1128 = stablehlo.broadcast_in_dim %1126, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1129 = stablehlo.multiply %1127, %1128 : tensor<1x16x768xf32>
-    %1130 = stablehlo.convert %arg24 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1131 = stablehlo.broadcast_in_dim %1129, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1132 = stablehlo.broadcast_in_dim %1130, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1133 = stablehlo.add %1131, %1132 : tensor<1x16x768xf32>
-    %1134 = stablehlo.convert %1133 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %1135 = stablehlo.reshape %1134 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %1136 = stablehlo.convert %1135 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %1137 = stablehlo.dot_general %1136, %arg91, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %1138 = stablehlo.broadcast_in_dim %1137, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1139 = stablehlo.multiply %1138, %69 : tensor<16x768xf32>
-    %1140 = stablehlo.broadcast_in_dim %1139, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1141 = stablehlo.broadcast_in_dim %arg92, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %1142 = stablehlo.add %1140, %1141 : tensor<16x768xf32>
-    %1143 = stablehlo.convert %1142 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %1144 = stablehlo.reshape %1143 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %1145 = stablehlo.reshape %1144 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %1146 = stablehlo.transpose %1145, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %1147 = stablehlo.dot_general %1136, %arg93, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %1148 = stablehlo.broadcast_in_dim %1147, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1149 = stablehlo.multiply %1148, %69 : tensor<16x768xf32>
-    %1150 = stablehlo.broadcast_in_dim %1149, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1151 = stablehlo.broadcast_in_dim %arg94, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %1152 = stablehlo.add %1150, %1151 : tensor<16x768xf32>
-    %1153 = stablehlo.convert %1152 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %1154 = stablehlo.reshape %1153 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %1155 = stablehlo.reshape %1154 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %1156 = stablehlo.transpose %1155, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %1157 = stablehlo.dot_general %1136, %arg95, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %1158 = stablehlo.broadcast_in_dim %1157, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1159 = stablehlo.multiply %1158, %69 : tensor<16x768xf32>
-    %1160 = stablehlo.broadcast_in_dim %1159, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1161 = stablehlo.broadcast_in_dim %arg96, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %1162 = stablehlo.add %1160, %1161 : tensor<16x768xf32>
-    %1163 = stablehlo.convert %1162 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %1164 = stablehlo.reshape %1163 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %1165 = stablehlo.reshape %1164 : (tensor<1x16x768xbf16>) -> tensor<1x16x12x64xbf16>
-    %1166 = stablehlo.transpose %1165, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %1167 = stablehlo.convert %1146 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %1168 = stablehlo.convert %1156 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %1169 = stablehlo.convert %1166 : (tensor<1x12x16x64xbf16>) -> tensor<1x12x16x64xf32>
-    %1170 = stablehlo.broadcast_in_dim %1167, dims = [0, 1, 2, 3] : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %1171 = stablehlo.multiply %1170, %104 : tensor<1x12x16x64xf32>
-    %1172 = stablehlo.transpose %1168, dims = [0, 1, 3, 2] : (tensor<1x12x16x64xf32>) -> tensor<1x12x64x16xf32>
-    %1173 = stablehlo.broadcast_in_dim %1172, dims = [0, 1, 2, 3] : (tensor<1x12x64x16xf32>) -> tensor<1x12x64x16xf32>
-    %1174 = stablehlo.multiply %1173, %108 : tensor<1x12x64x16xf32>
-    %1175 = stablehlo.reshape %1171 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %1176 = stablehlo.reshape %1174 : (tensor<1x12x64x16xf32>) -> tensor<12x64x16xf32>
-    %1177 = stablehlo.broadcast_in_dim %1176, dims = [0, 1, 2] : (tensor<12x64x16xf32>) -> tensor<12x64x16xf32>
-    %1178 = stablehlo.dot_general %1175, %1177, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x64xf32>, tensor<12x64x16xf32>) -> tensor<12x16x16xf32>
-    %1179 = stablehlo.reshape %1178 : (tensor<12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %1180 = stablehlo.broadcast_in_dim %1179, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %1181 = stablehlo.add %1180, %117 : tensor<1x12x16x16xf32>
-    %1182 = stablehlo.reduce(%1181 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %1183 = stablehlo.reshape %1182 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %1184 = stablehlo.broadcast_in_dim %1181, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %1185 = stablehlo.broadcast_in_dim %1183, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %1186 = stablehlo.subtract %1184, %1185 : tensor<1x12x16x16xf32>
-    %1187 = stablehlo.exponential %1186 : tensor<1x12x16x16xf32>
-    %1188 = stablehlo.reduce(%1187 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x16x16xf32>, tensor<f32>) -> tensor<1x12x16xf32>
-    %1189 = stablehlo.reshape %1188 : (tensor<1x12x16xf32>) -> tensor<1x12x16x1xf32>
-    %1190 = stablehlo.broadcast_in_dim %1187, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %1191 = stablehlo.broadcast_in_dim %1189, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xf32>) -> tensor<1x12x16x16xf32>
-    %1192 = stablehlo.divide %1190, %1191 : tensor<1x12x16x16xf32>
-    %1193 = stablehlo.compare  EQ, %1184, %133,  FLOAT : (tensor<1x12x16x16xf32>, tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xi1>
-    %1194 = stablehlo.reduce(%1193 init: %c) applies stablehlo.and across dimensions = [3] : (tensor<1x12x16x16xi1>, tensor<i1>) -> tensor<1x12x16xi1>
-    %1195 = stablehlo.reshape %1194 : (tensor<1x12x16xi1>) -> tensor<1x12x16x1xi1>
-    %1196 = stablehlo.broadcast_in_dim %1195, dims = [0, 1, 2, 3] : (tensor<1x12x16x1xi1>) -> tensor<1x12x16x16xi1>
-    %1197 = stablehlo.broadcast_in_dim %1192, dims = [0, 1, 2, 3] : (tensor<1x12x16x16xf32>) -> tensor<1x12x16x16xf32>
-    %1198 = stablehlo.select %1196, %139, %1197 : tensor<1x12x16x16xi1>, tensor<1x12x16x16xf32>
-    %1199 = stablehlo.reshape %1198 : (tensor<1x12x16x16xf32>) -> tensor<12x16x16xf32>
-    %1200 = stablehlo.reshape %1169 : (tensor<1x12x16x64xf32>) -> tensor<12x16x64xf32>
-    %1201 = stablehlo.broadcast_in_dim %1200, dims = [0, 1, 2] : (tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %1202 = stablehlo.dot_general %1199, %1201, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x16x16xf32>, tensor<12x16x64xf32>) -> tensor<12x16x64xf32>
-    %1203 = stablehlo.reshape %1202 : (tensor<12x16x64xf32>) -> tensor<1x12x16x64xf32>
-    %1204 = stablehlo.convert %1203 : (tensor<1x12x16x64xf32>) -> tensor<1x12x16x64xbf16>
-    %1205 = stablehlo.transpose %1204, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %1206 = stablehlo.transpose %1205, dims = [0, 2, 1, 3] : (tensor<1x16x12x64xbf16>) -> tensor<1x12x16x64xbf16>
-    %1207 = stablehlo.transpose %1206, dims = [0, 2, 1, 3] : (tensor<1x12x16x64xbf16>) -> tensor<1x16x12x64xbf16>
-    %1208 = stablehlo.reshape %1207 : (tensor<1x16x12x64xbf16>) -> tensor<1x16x768xbf16>
-    %1209 = stablehlo.reshape %1208 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %1210 = stablehlo.convert %1209 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %1211 = stablehlo.dot_general %1210, %arg97, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x768xf32>) -> tensor<16x768xf32>
-    %1212 = stablehlo.broadcast_in_dim %1211, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1213 = stablehlo.multiply %1212, %69 : tensor<16x768xf32>
-    %1214 = stablehlo.broadcast_in_dim %1213, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1215 = stablehlo.broadcast_in_dim %arg98, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %1216 = stablehlo.add %1214, %1215 : tensor<16x768xf32>
-    %1217 = stablehlo.convert %1216 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %1218 = stablehlo.reshape %1217 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %1219 = stablehlo.add %1218, %1134 : tensor<1x16x768xbf16>
-    %1220 = stablehlo.convert %1219 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %1221 = stablehlo.convert %1220 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %1222 = stablehlo.reduce(%1221 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1223 = stablehlo.reshape %1222 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1224 = stablehlo.broadcast_in_dim %1223, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1225 = stablehlo.divide %1224, %10 : tensor<1x16x1xf64>
-    %1226 = stablehlo.broadcast_in_dim %1221, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %1227 = stablehlo.broadcast_in_dim %1225, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %1228 = stablehlo.subtract %1226, %1227 : tensor<1x16x768xf64>
-    %1229 = stablehlo.multiply %1228, %1228 : tensor<1x16x768xf64>
-    %1230 = stablehlo.reduce(%1229 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1231 = stablehlo.reshape %1230 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1232 = stablehlo.broadcast_in_dim %1231, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1233 = stablehlo.divide %1232, %10 : tensor<1x16x1xf64>
-    %1234 = stablehlo.convert %1233 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %1235 = stablehlo.reduce(%1220 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %1236 = stablehlo.reshape %1235 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %1237 = stablehlo.broadcast_in_dim %1236, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1238 = stablehlo.divide %1237, %26 : tensor<1x16x1xf32>
-    %1239 = stablehlo.broadcast_in_dim %1234, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1240 = stablehlo.add %1239, %31 : tensor<1x16x1xf32>
-    %1241 = stablehlo.rsqrt %1240 : tensor<1x16x1xf32>
-    %1242 = stablehlo.broadcast_in_dim %1220, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1243 = stablehlo.broadcast_in_dim %1238, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1244 = stablehlo.subtract %1242, %1243 : tensor<1x16x768xf32>
-    %1245 = stablehlo.broadcast_in_dim %1244, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1246 = stablehlo.broadcast_in_dim %1241, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1247 = stablehlo.multiply %1245, %1246 : tensor<1x16x768xf32>
-    %1248 = stablehlo.convert %arg25 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1249 = stablehlo.broadcast_in_dim %1247, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1250 = stablehlo.broadcast_in_dim %1248, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1251 = stablehlo.multiply %1249, %1250 : tensor<1x16x768xf32>
-    %1252 = stablehlo.convert %arg26 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1253 = stablehlo.broadcast_in_dim %1251, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1254 = stablehlo.broadcast_in_dim %1252, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1255 = stablehlo.add %1253, %1254 : tensor<1x16x768xf32>
-    %1256 = stablehlo.convert %1255 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    %1257 = stablehlo.reshape %1256 : (tensor<1x16x768xbf16>) -> tensor<16x768xbf16>
-    %1258 = stablehlo.convert %1257 : (tensor<16x768xbf16>) -> tensor<16x768xf32>
-    %1259 = stablehlo.dot_general %1258, %arg99, contracting_dims = [1] x [0] : (tensor<16x768xf32>, tensor<768x3072xf32>) -> tensor<16x3072xf32>
-    %1260 = stablehlo.broadcast_in_dim %1259, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %1261 = stablehlo.multiply %1260, %204 : tensor<16x3072xf32>
-    %1262 = stablehlo.broadcast_in_dim %1261, dims = [0, 1] : (tensor<16x3072xf32>) -> tensor<16x3072xf32>
-    %1263 = stablehlo.broadcast_in_dim %arg100, dims = [1] : (tensor<3072xf32>) -> tensor<16x3072xf32>
-    %1264 = stablehlo.add %1262, %1263 : tensor<16x3072xf32>
-    %1265 = stablehlo.convert %1264 : (tensor<16x3072xf32>) -> tensor<16x3072xbf16>
-    %1266 = stablehlo.reshape %1265 : (tensor<16x3072xbf16>) -> tensor<1x16x3072xbf16>
-    %1267 = stablehlo.multiply %1266, %cst_4 : tensor<1x16x3072xbf16>
-    %1268 = stablehlo.multiply %1266, %212 : tensor<1x16x3072xbf16>
-    %1269 = stablehlo.convert %1268 : (tensor<1x16x3072xbf16>) -> tensor<1x16x3072xf32>
-    %1270 = stablehlo.clamp %cst_5, %1269, %cst_6 : tensor<1x16x3072xf32>
-    %1271 = stablehlo.multiply %1270, %1270 : tensor<1x16x3072xf32>
-    %1272 = stablehlo.multiply %cst_7, %1271 : tensor<1x16x3072xf32>
-    %1273 = stablehlo.add %1272, %cst_8 : tensor<1x16x3072xf32>
-    %1274 = stablehlo.multiply %1273, %1271 : tensor<1x16x3072xf32>
-    %1275 = stablehlo.add %1274, %cst_9 : tensor<1x16x3072xf32>
-    %1276 = stablehlo.multiply %1275, %1271 : tensor<1x16x3072xf32>
-    %1277 = stablehlo.add %1276, %cst_10 : tensor<1x16x3072xf32>
-    %1278 = stablehlo.multiply %1277, %1271 : tensor<1x16x3072xf32>
-    %1279 = stablehlo.add %1278, %cst_11 : tensor<1x16x3072xf32>
-    %1280 = stablehlo.multiply %1279, %1271 : tensor<1x16x3072xf32>
-    %1281 = stablehlo.add %1280, %cst_12 : tensor<1x16x3072xf32>
-    %1282 = stablehlo.multiply %1281, %1271 : tensor<1x16x3072xf32>
-    %1283 = stablehlo.add %1282, %cst_13 : tensor<1x16x3072xf32>
-    %1284 = stablehlo.multiply %cst_14, %1271 : tensor<1x16x3072xf32>
-    %1285 = stablehlo.add %1284, %cst_15 : tensor<1x16x3072xf32>
-    %1286 = stablehlo.multiply %1285, %1271 : tensor<1x16x3072xf32>
-    %1287 = stablehlo.add %1286, %cst_16 : tensor<1x16x3072xf32>
-    %1288 = stablehlo.multiply %1287, %1271 : tensor<1x16x3072xf32>
-    %1289 = stablehlo.add %1288, %cst_17 : tensor<1x16x3072xf32>
-    %1290 = stablehlo.multiply %1289, %1271 : tensor<1x16x3072xf32>
-    %1291 = stablehlo.add %1290, %cst_18 : tensor<1x16x3072xf32>
-    %1292 = stablehlo.multiply %1270, %1283 : tensor<1x16x3072xf32>
-    %1293 = stablehlo.divide %1292, %1291 : tensor<1x16x3072xf32>
-    %1294 = stablehlo.clamp %cst_19, %1293, %cst_20 : tensor<1x16x3072xf32>
-    %1295 = stablehlo.convert %1294 : (tensor<1x16x3072xf32>) -> tensor<1x16x3072xbf16>
-    %1296 = stablehlo.add %1295, %cst_2 : tensor<1x16x3072xbf16>
-    %1297 = stablehlo.multiply %1296, %1267 : tensor<1x16x3072xbf16>
-    %1298 = stablehlo.reshape %1297 : (tensor<1x16x3072xbf16>) -> tensor<16x3072xbf16>
-    %1299 = stablehlo.convert %1298 : (tensor<16x3072xbf16>) -> tensor<16x3072xf32>
-    %1300 = stablehlo.dot_general %1299, %arg101, contracting_dims = [1] x [0] : (tensor<16x3072xf32>, tensor<3072x768xf32>) -> tensor<16x768xf32>
-    %1301 = stablehlo.broadcast_in_dim %1300, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1302 = stablehlo.multiply %1301, %69 : tensor<16x768xf32>
-    %1303 = stablehlo.broadcast_in_dim %1302, dims = [0, 1] : (tensor<16x768xf32>) -> tensor<16x768xf32>
-    %1304 = stablehlo.broadcast_in_dim %arg102, dims = [1] : (tensor<768xf32>) -> tensor<16x768xf32>
-    %1305 = stablehlo.add %1303, %1304 : tensor<16x768xf32>
-    %1306 = stablehlo.convert %1305 : (tensor<16x768xf32>) -> tensor<16x768xbf16>
-    %1307 = stablehlo.reshape %1306 : (tensor<16x768xbf16>) -> tensor<1x16x768xbf16>
-    %1308 = stablehlo.add %1307, %1256 : tensor<1x16x768xbf16>
-    %1309 = stablehlo.convert %1308 : (tensor<1x16x768xbf16>) -> tensor<1x16x768xf32>
-    %1310 = stablehlo.convert %1309 : (tensor<1x16x768xf32>) -> tensor<1x16x768xf64>
-    %1311 = stablehlo.reduce(%1310 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1312 = stablehlo.reshape %1311 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1313 = stablehlo.broadcast_in_dim %1312, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1314 = stablehlo.divide %1313, %10 : tensor<1x16x1xf64>
-    %1315 = stablehlo.broadcast_in_dim %1310, dims = [0, 1, 2] : (tensor<1x16x768xf64>) -> tensor<1x16x768xf64>
-    %1316 = stablehlo.broadcast_in_dim %1314, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x768xf64>
-    %1317 = stablehlo.subtract %1315, %1316 : tensor<1x16x768xf64>
-    %1318 = stablehlo.multiply %1317, %1317 : tensor<1x16x768xf64>
-    %1319 = stablehlo.reduce(%1318 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf64>, tensor<f64>) -> tensor<1x16xf64>
-    %1320 = stablehlo.reshape %1319 : (tensor<1x16xf64>) -> tensor<1x16x1xf64>
-    %1321 = stablehlo.broadcast_in_dim %1320, dims = [0, 1, 2] : (tensor<1x16x1xf64>) -> tensor<1x16x1xf64>
-    %1322 = stablehlo.divide %1321, %10 : tensor<1x16x1xf64>
-    %1323 = stablehlo.convert %1322 : (tensor<1x16x1xf64>) -> tensor<1x16x1xf32>
-    %1324 = stablehlo.reduce(%1309 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x16x768xf32>, tensor<f32>) -> tensor<1x16xf32>
-    %1325 = stablehlo.reshape %1324 : (tensor<1x16xf32>) -> tensor<1x16x1xf32>
-    %1326 = stablehlo.broadcast_in_dim %1325, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1327 = stablehlo.divide %1326, %26 : tensor<1x16x1xf32>
-    %1328 = stablehlo.broadcast_in_dim %1323, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x1xf32>
-    %1329 = stablehlo.add %1328, %31 : tensor<1x16x1xf32>
-    %1330 = stablehlo.rsqrt %1329 : tensor<1x16x1xf32>
-    %1331 = stablehlo.broadcast_in_dim %1309, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1332 = stablehlo.broadcast_in_dim %1327, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1333 = stablehlo.subtract %1331, %1332 : tensor<1x16x768xf32>
-    %1334 = stablehlo.broadcast_in_dim %1333, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1335 = stablehlo.broadcast_in_dim %1330, dims = [0, 1, 2] : (tensor<1x16x1xf32>) -> tensor<1x16x768xf32>
-    %1336 = stablehlo.multiply %1334, %1335 : tensor<1x16x768xf32>
-    %1337 = stablehlo.convert %arg27 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1338 = stablehlo.broadcast_in_dim %1336, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1339 = stablehlo.broadcast_in_dim %1337, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1340 = stablehlo.multiply %1338, %1339 : tensor<1x16x768xf32>
-    %1341 = stablehlo.convert %arg28 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1342 = stablehlo.broadcast_in_dim %1340, dims = [0, 1, 2] : (tensor<1x16x768xf32>) -> tensor<1x16x768xf32>
-    %1343 = stablehlo.broadcast_in_dim %1341, dims = [2] : (tensor<768xf32>) -> tensor<1x16x768xf32>
-    %1344 = stablehlo.add %1342, %1343 : tensor<1x16x768xf32>
-    %1345 = stablehlo.convert %1344 : (tensor<1x16x768xf32>) -> tensor<1x16x768xbf16>
-    return %1345 : tensor<1x16x768xbf16>
-  }
-}
diff --git a/mlir_tests/microsoftbeit-base-patch16-224.mlir b/mlir_tests/microsoftbeit-base-patch16-224.mlir
deleted file mode 100644
index c0f0c0ab..00000000
--- a/mlir_tests/microsoftbeit-base-patch16-224.mlir
+++ /dev/null
@@ -1,2490 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x224x224xbf16>, %arg1: tensor<768x3x16x16xbf16>, %arg2: tensor<768xbf16>, %arg3: tensor<768xbf16>, %arg4: tensor<768xbf16>, %arg5: tensor<768xbf16>, %arg6: tensor<768xbf16>, %arg7: tensor<768xbf16>, %arg8: tensor<768xbf16>, %arg9: tensor<768xbf16>, %arg10: tensor<768xbf16>, %arg11: tensor<768xbf16>, %arg12: tensor<768xbf16>, %arg13: tensor<768xbf16>, %arg14: tensor<768xbf16>, %arg15: tensor<768xbf16>, %arg16: tensor<768xbf16>, %arg17: tensor<768xbf16>, %arg18: tensor<768xbf16>, %arg19: tensor<768xbf16>, %arg20: tensor<768xbf16>, %arg21: tensor<768xbf16>, %arg22: tensor<768xbf16>, %arg23: tensor<768xbf16>, %arg24: tensor<768xbf16>, %arg25: tensor<768xbf16>, %arg26: tensor<768xbf16>, %arg27: tensor<768xbf16>, %arg28: tensor<768xbf16>, %arg29: tensor<768xbf16>, %arg30: tensor<768xbf16>, %arg31: tensor<768xbf16>, %arg32: tensor<768xbf16>, %arg33: tensor<768xbf16>, %arg34: tensor<768xbf16>, %arg35: tensor<768xbf16>, %arg36: tensor<768xbf16>, %arg37: tensor<768xbf16>, %arg38: tensor<768xbf16>, %arg39: tensor<768xbf16>, %arg40: tensor<768xbf16>, %arg41: tensor<768xbf16>, %arg42: tensor<768xbf16>, %arg43: tensor<768xbf16>, %arg44: tensor<768xbf16>, %arg45: tensor<768xbf16>, %arg46: tensor<768xbf16>, %arg47: tensor<768xbf16>, %arg48: tensor<768xbf16>, %arg49: tensor<768xbf16>, %arg50: tensor<768xbf16>, %arg51: tensor<768xbf16>, %arg52: tensor<768xbf16>, %arg53: tensor<768xbf16>, %arg54: tensor<768xbf16>, %arg55: tensor<768xbf16>, %arg56: tensor<768xbf16>, %arg57: tensor<768xbf16>, %arg58: tensor<768xbf16>, %arg59: tensor<768xbf16>, %arg60: tensor<768xbf16>, %arg61: tensor<768xbf16>, %arg62: tensor<768xbf16>, %arg63: tensor<768xbf16>, %arg64: tensor<768xbf16>, %arg65: tensor<768xbf16>, %arg66: tensor<768xbf16>, %arg67: tensor<768xbf16>, %arg68: tensor<768xbf16>, %arg69: tensor<768xbf16>, %arg70: tensor<768xbf16>, %arg71: tensor<768xbf16>, %arg72: tensor<768xbf16>, %arg73: tensor<768xbf16>, %arg74: tensor<768xbf16>, %arg75: tensor<768xbf16>, %arg76: tensor<768xbf16>, %arg77: tensor<1x1x768xbf16>, %arg78: tensor<768x768xf32>, %arg79: tensor<768xf32>, %arg80: tensor<768x768xbf16>, %arg81: tensor<768x768xf32>, %arg82: tensor<768xf32>, %arg83: tensor<1x12x197x197xbf16>, %arg84: tensor<768x768xf32>, %arg85: tensor<768xf32>, %arg86: tensor<768x3072xf32>, %arg87: tensor<3072xf32>, %arg88: tensor<3072x768xf32>, %arg89: tensor<768xf32>, %arg90: tensor<768x768xf32>, %arg91: tensor<768xf32>, %arg92: tensor<768x768xbf16>, %arg93: tensor<768x768xf32>, %arg94: tensor<768xf32>, %arg95: tensor<1x12x197x197xbf16>, %arg96: tensor<768x768xf32>, %arg97: tensor<768xf32>, %arg98: tensor<768x3072xf32>, %arg99: tensor<3072xf32>, %arg100: tensor<3072x768xf32>, %arg101: tensor<768xf32>, %arg102: tensor<768x768xf32>, %arg103: tensor<768xf32>, %arg104: tensor<768x768xbf16>, %arg105: tensor<768x768xf32>, %arg106: tensor<768xf32>, %arg107: tensor<1x12x197x197xbf16>, %arg108: tensor<768x768xf32>, %arg109: tensor<768xf32>, %arg110: tensor<768x3072xf32>, %arg111: tensor<3072xf32>, %arg112: tensor<3072x768xf32>, %arg113: tensor<768xf32>, %arg114: tensor<768x768xf32>, %arg115: tensor<768xf32>, %arg116: tensor<768x768xbf16>, %arg117: tensor<768x768xf32>, %arg118: tensor<768xf32>, %arg119: tensor<1x12x197x197xbf16>, %arg120: tensor<768x768xf32>, %arg121: tensor<768xf32>, %arg122: tensor<768x3072xf32>, %arg123: tensor<3072xf32>, %arg124: tensor<3072x768xf32>, %arg125: tensor<768xf32>, %arg126: tensor<768x768xf32>, %arg127: tensor<768xf32>, %arg128: tensor<768x768xbf16>, %arg129: tensor<768x768xf32>, %arg130: tensor<768xf32>, %arg131: tensor<1x12x197x197xbf16>, %arg132: tensor<768x768xf32>, %arg133: tensor<768xf32>, %arg134: tensor<768x3072xf32>, %arg135: tensor<3072xf32>, %arg136: tensor<3072x768xf32>, %arg137: tensor<768xf32>, %arg138: tensor<768x768xf32>, %arg139: tensor<768xf32>, %arg140: tensor<768x768xbf16>, %arg141: tensor<768x768xf32>, %arg142: tensor<768xf32>, %arg143: tensor<1x12x197x197xbf16>, %arg144: tensor<768x768xf32>, %arg145: tensor<768xf32>, %arg146: tensor<768x3072xf32>, %arg147: tensor<3072xf32>, %arg148: tensor<3072x768xf32>, %arg149: tensor<768xf32>, %arg150: tensor<768x768xf32>, %arg151: tensor<768xf32>, %arg152: tensor<768x768xbf16>, %arg153: tensor<768x768xf32>, %arg154: tensor<768xf32>, %arg155: tensor<1x12x197x197xbf16>, %arg156: tensor<768x768xf32>, %arg157: tensor<768xf32>, %arg158: tensor<768x3072xf32>, %arg159: tensor<3072xf32>, %arg160: tensor<3072x768xf32>, %arg161: tensor<768xf32>, %arg162: tensor<768x768xf32>, %arg163: tensor<768xf32>, %arg164: tensor<768x768xbf16>, %arg165: tensor<768x768xf32>, %arg166: tensor<768xf32>, %arg167: tensor<1x12x197x197xbf16>, %arg168: tensor<768x768xf32>, %arg169: tensor<768xf32>, %arg170: tensor<768x3072xf32>, %arg171: tensor<3072xf32>, %arg172: tensor<3072x768xf32>, %arg173: tensor<768xf32>, %arg174: tensor<768x768xf32>, %arg175: tensor<768xf32>, %arg176: tensor<768x768xbf16>, %arg177: tensor<768x768xf32>, %arg178: tensor<768xf32>, %arg179: tensor<1x12x197x197xbf16>, %arg180: tensor<768x768xf32>, %arg181: tensor<768xf32>, %arg182: tensor<768x3072xf32>, %arg183: tensor<3072xf32>, %arg184: tensor<3072x768xf32>, %arg185: tensor<768xf32>, %arg186: tensor<768x768xf32>, %arg187: tensor<768xf32>, %arg188: tensor<768x768xbf16>, %arg189: tensor<768x768xf32>, %arg190: tensor<768xf32>, %arg191: tensor<1x12x197x197xbf16>, %arg192: tensor<768x768xf32>, %arg193: tensor<768xf32>, %arg194: tensor<768x3072xf32>, %arg195: tensor<3072xf32>, %arg196: tensor<3072x768xf32>, %arg197: tensor<768xf32>, %arg198: tensor<768x768xf32>, %arg199: tensor<768xf32>, %arg200: tensor<768x768xbf16>, %arg201: tensor<768x768xf32>, %arg202: tensor<768xf32>, %arg203: tensor<1x12x197x197xbf16>, %arg204: tensor<768x768xf32>, %arg205: tensor<768xf32>, %arg206: tensor<768x3072xf32>, %arg207: tensor<3072xf32>, %arg208: tensor<3072x768xf32>, %arg209: tensor<768xf32>, %arg210: tensor<768x768xf32>, %arg211: tensor<768xf32>, %arg212: tensor<768x768xbf16>, %arg213: tensor<768x768xf32>, %arg214: tensor<768xf32>, %arg215: tensor<1x12x197x197xbf16>, %arg216: tensor<768x768xf32>, %arg217: tensor<768xf32>, %arg218: tensor<768x3072xf32>, %arg219: tensor<3072xf32>, %arg220: tensor<3072x768xf32>, %arg221: tensor<768xf32>, %arg222: tensor<768x1000xf32>, %arg223: tensor<1000xf32>) -> tensor<1x1000xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_2 = stablehlo.constant dense<1.000000e+00> : tensor<1x197x3072xbf16>
-    %cst_3 = stablehlo.constant dense<2.000000e+00> : tensor<1x197x3072xbf16>
-    %cst_4 = stablehlo.constant dense<5.000000e-01> : tensor<1x197x3072xbf16>
-    %cst_5 = stablehlo.constant dense<-4.000000e+00> : tensor<1x197x3072xf32>
-    %cst_6 = stablehlo.constant dense<4.000000e+00> : tensor<1x197x3072xf32>
-    %cst_7 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x197x3072xf32>
-    %cst_8 = stablehlo.constant dense<2.77068146E-8> : tensor<1x197x3072xf32>
-    %cst_9 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x197x3072xf32>
-    %cst_10 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x197x3072xf32>
-    %cst_11 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x197x3072xf32>
-    %cst_12 = stablehlo.constant dense<-2.954600e-03> : tensor<1x197x3072xf32>
-    %cst_13 = stablehlo.constant dense<-0.0160960332> : tensor<1x197x3072xf32>
-    %cst_14 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x197x3072xf32>
-    %cst_15 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x197x3072xf32>
-    %cst_16 = stablehlo.constant dense<-0.00168282702> : tensor<1x197x3072xf32>
-    %cst_17 = stablehlo.constant dense<-0.00737332925> : tensor<1x197x3072xf32>
-    %cst_18 = stablehlo.constant dense<-0.0142647391> : tensor<1x197x3072xf32>
-    %cst_19 = stablehlo.constant dense<-1.000000e+00> : tensor<1x197x3072xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x197x3072xf32>
-    %cst_21 = stablehlo.constant dense<0.000000e+00> : tensor<bf16>
-    %cst_22 = arith.constant dense<768> : tensor<1xi64>
-    %cst_23 = arith.constant dense<9.9999999999999998E-13> : tensor<1xf64>
-    %cst_24 = arith.constant dense<1> : tensor<1xi64>
-    %cst_25 = arith.constant dense<8.000000e+00> : tensor<1xf64>
-    %cst_26 = arith.constant dense<196> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [16, 16], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x224x224xbf16>, tensor<768x3x16x16xbf16>) -> tensor<1x768x14x14xbf16>
-    %1 = stablehlo.reshape %arg2 : (tensor<768xbf16>) -> tensor<768x1x1xbf16>
-    %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2, 3] : (tensor<1x768x14x14xbf16>) -> tensor<1x768x14x14xbf16>
-    %3 = stablehlo.broadcast_in_dim %1, dims = [1, 2, 3] : (tensor<768x1x1xbf16>) -> tensor<1x768x14x14xbf16>
-    %4 = stablehlo.add %2, %3 : tensor<1x768x14x14xbf16>
-    %5 = stablehlo.reshape %4 : (tensor<1x768x14x14xbf16>) -> tensor<1x768x196xbf16>
-    %6 = stablehlo.transpose %5, dims = [0, 2, 1] : (tensor<1x768x196xbf16>) -> tensor<1x196x768xbf16>
-    %7 = stablehlo.concatenate %arg77, %6, dim = 1 : (tensor<1x1x768xbf16>, tensor<1x196x768xbf16>) -> tensor<1x197x768xbf16>
-    %8 = stablehlo.convert %7 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %9 = stablehlo.convert %8 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %10 = stablehlo.reduce(%9 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %11 = stablehlo.reshape %10 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %12 = stablehlo.convert %cst_22 : (tensor<1xi64>) -> tensor<1xf64>
-    %13 = stablehlo.reshape %12 : (tensor<1xf64>) -> tensor<f64>
-    %14 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %15 = stablehlo.broadcast_in_dim %13, dims = [] : (tensor<f64>) -> tensor<1x197x1xf64>
-    %16 = stablehlo.divide %14, %15 : tensor<1x197x1xf64>
-    %17 = stablehlo.broadcast_in_dim %9, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %18 = stablehlo.broadcast_in_dim %16, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %19 = stablehlo.subtract %17, %18 : tensor<1x197x768xf64>
-    %20 = stablehlo.multiply %19, %19 : tensor<1x197x768xf64>
-    %21 = stablehlo.reduce(%20 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %22 = stablehlo.reshape %21 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %23 = stablehlo.broadcast_in_dim %22, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %24 = stablehlo.divide %23, %15 : tensor<1x197x1xf64>
-    %25 = stablehlo.convert %24 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %26 = stablehlo.reduce(%8 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %27 = stablehlo.reshape %26 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %28 = stablehlo.convert %cst_22 : (tensor<1xi64>) -> tensor<1xf32>
-    %29 = stablehlo.reshape %28 : (tensor<1xf32>) -> tensor<f32>
-    %30 = stablehlo.broadcast_in_dim %27, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %31 = stablehlo.broadcast_in_dim %29, dims = [] : (tensor<f32>) -> tensor<1x197x1xf32>
-    %32 = stablehlo.divide %30, %31 : tensor<1x197x1xf32>
-    %33 = stablehlo.convert %cst_23 : (tensor<1xf64>) -> tensor<1xf32>
-    %34 = stablehlo.reshape %33 : (tensor<1xf32>) -> tensor<f32>
-    %35 = stablehlo.broadcast_in_dim %25, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %36 = stablehlo.broadcast_in_dim %34, dims = [] : (tensor<f32>) -> tensor<1x197x1xf32>
-    %37 = stablehlo.add %35, %36 : tensor<1x197x1xf32>
-    %38 = stablehlo.rsqrt %37 : tensor<1x197x1xf32>
-    %39 = stablehlo.broadcast_in_dim %8, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %40 = stablehlo.broadcast_in_dim %32, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %41 = stablehlo.subtract %39, %40 : tensor<1x197x768xf32>
-    %42 = stablehlo.broadcast_in_dim %41, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %43 = stablehlo.broadcast_in_dim %38, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %44 = stablehlo.multiply %42, %43 : tensor<1x197x768xf32>
-    %45 = stablehlo.convert %arg3 : (tensor<768xbf16>) -> tensor<768xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %47 = stablehlo.broadcast_in_dim %45, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %48 = stablehlo.multiply %46, %47 : tensor<1x197x768xf32>
-    %49 = stablehlo.convert %arg4 : (tensor<768xbf16>) -> tensor<768xf32>
-    %50 = stablehlo.broadcast_in_dim %48, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %51 = stablehlo.broadcast_in_dim %49, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %52 = stablehlo.add %50, %51 : tensor<1x197x768xf32>
-    %53 = stablehlo.convert %52 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %54 = stablehlo.reshape %53 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %55 = stablehlo.convert %54 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %56 = stablehlo.dot_general %55, %arg78, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %57 = stablehlo.convert %cst_24 : (tensor<1xi64>) -> tensor<1xf32>
-    %58 = stablehlo.reshape %57 : (tensor<1xf32>) -> tensor<f32>
-    %59 = stablehlo.broadcast_in_dim %56, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %60 = stablehlo.broadcast_in_dim %58, dims = [] : (tensor<f32>) -> tensor<197x768xf32>
-    %61 = stablehlo.multiply %59, %60 : tensor<197x768xf32>
-    %62 = stablehlo.broadcast_in_dim %61, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %63 = stablehlo.broadcast_in_dim %arg79, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %64 = stablehlo.add %62, %63 : tensor<197x768xf32>
-    %65 = stablehlo.convert %64 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %66 = stablehlo.reshape %65 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %67 = stablehlo.dot_general %54, %arg80, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %68 = stablehlo.reshape %67 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %69 = stablehlo.reshape %68 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %70 = stablehlo.transpose %69, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %71 = stablehlo.dot_general %55, %arg81, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %72 = stablehlo.broadcast_in_dim %71, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %73 = stablehlo.multiply %72, %60 : tensor<197x768xf32>
-    %74 = stablehlo.broadcast_in_dim %73, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %75 = stablehlo.broadcast_in_dim %arg82, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %76 = stablehlo.add %74, %75 : tensor<197x768xf32>
-    %77 = stablehlo.convert %76 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %78 = stablehlo.reshape %77 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %79 = stablehlo.reshape %78 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %80 = stablehlo.transpose %79, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %81 = stablehlo.reshape %66 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %82 = stablehlo.transpose %81, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %83 = stablehlo.transpose %70, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %84 = stablehlo.reshape %82 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %85 = stablehlo.reshape %83 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %86 = stablehlo.broadcast_in_dim %85, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %87 = stablehlo.dot_general %84, %86, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %88 = stablehlo.reshape %87 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %89 = stablehlo.convert %cst_25 : (tensor<1xf64>) -> tensor<1xbf16>
-    %90 = stablehlo.reshape %89 : (tensor<1xbf16>) -> tensor<bf16>
-    %91 = stablehlo.broadcast_in_dim %88, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %92 = stablehlo.broadcast_in_dim %90, dims = [] : (tensor<bf16>) -> tensor<1x12x197x197xbf16>
-    %93 = stablehlo.divide %91, %92 : tensor<1x12x197x197xbf16>
-    %94 = stablehlo.add %93, %arg83 : tensor<1x12x197x197xbf16>
-    %95 = stablehlo.convert %94 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %96 = stablehlo.reduce(%95 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %97 = stablehlo.reshape %96 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %98 = stablehlo.broadcast_in_dim %95, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %99 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %100 = stablehlo.subtract %98, %99 : tensor<1x12x197x197xf32>
-    %101 = stablehlo.exponential %100 : tensor<1x12x197x197xf32>
-    %102 = stablehlo.reduce(%101 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %103 = stablehlo.reshape %102 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %104 = stablehlo.broadcast_in_dim %101, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %105 = stablehlo.broadcast_in_dim %103, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %106 = stablehlo.divide %104, %105 : tensor<1x12x197x197xf32>
-    %107 = stablehlo.convert %106 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %108 = stablehlo.reshape %107 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %109 = stablehlo.reshape %80 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %110 = stablehlo.broadcast_in_dim %109, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %111 = stablehlo.dot_general %108, %110, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %112 = stablehlo.reshape %111 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %113 = stablehlo.transpose %112, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %114 = stablehlo.reshape %113 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %115 = stablehlo.reshape %114 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %116 = stablehlo.convert %115 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %117 = stablehlo.dot_general %116, %arg84, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %118 = stablehlo.broadcast_in_dim %117, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %119 = stablehlo.multiply %118, %60 : tensor<197x768xf32>
-    %120 = stablehlo.broadcast_in_dim %119, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %121 = stablehlo.broadcast_in_dim %arg85, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %122 = stablehlo.add %120, %121 : tensor<197x768xf32>
-    %123 = stablehlo.convert %122 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %124 = stablehlo.reshape %123 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %125 = stablehlo.broadcast_in_dim %arg5, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %126 = stablehlo.broadcast_in_dim %124, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %127 = stablehlo.multiply %125, %126 : tensor<1x197x768xbf16>
-    %128 = stablehlo.add %127, %7 : tensor<1x197x768xbf16>
-    %129 = stablehlo.convert %128 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %130 = stablehlo.convert %129 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %131 = stablehlo.reduce(%130 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %132 = stablehlo.reshape %131 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %133 = stablehlo.broadcast_in_dim %132, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %134 = stablehlo.divide %133, %15 : tensor<1x197x1xf64>
-    %135 = stablehlo.broadcast_in_dim %130, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %136 = stablehlo.broadcast_in_dim %134, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %137 = stablehlo.subtract %135, %136 : tensor<1x197x768xf64>
-    %138 = stablehlo.multiply %137, %137 : tensor<1x197x768xf64>
-    %139 = stablehlo.reduce(%138 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %140 = stablehlo.reshape %139 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %141 = stablehlo.broadcast_in_dim %140, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %142 = stablehlo.divide %141, %15 : tensor<1x197x1xf64>
-    %143 = stablehlo.convert %142 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %144 = stablehlo.reduce(%129 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %145 = stablehlo.reshape %144 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %146 = stablehlo.broadcast_in_dim %145, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %147 = stablehlo.divide %146, %31 : tensor<1x197x1xf32>
-    %148 = stablehlo.broadcast_in_dim %143, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %149 = stablehlo.add %148, %36 : tensor<1x197x1xf32>
-    %150 = stablehlo.rsqrt %149 : tensor<1x197x1xf32>
-    %151 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %152 = stablehlo.broadcast_in_dim %147, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %153 = stablehlo.subtract %151, %152 : tensor<1x197x768xf32>
-    %154 = stablehlo.broadcast_in_dim %153, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %155 = stablehlo.broadcast_in_dim %150, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %156 = stablehlo.multiply %154, %155 : tensor<1x197x768xf32>
-    %157 = stablehlo.convert %arg6 : (tensor<768xbf16>) -> tensor<768xf32>
-    %158 = stablehlo.broadcast_in_dim %156, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %159 = stablehlo.broadcast_in_dim %157, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %160 = stablehlo.multiply %158, %159 : tensor<1x197x768xf32>
-    %161 = stablehlo.convert %arg7 : (tensor<768xbf16>) -> tensor<768xf32>
-    %162 = stablehlo.broadcast_in_dim %160, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %163 = stablehlo.broadcast_in_dim %161, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %164 = stablehlo.add %162, %163 : tensor<1x197x768xf32>
-    %165 = stablehlo.convert %164 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %166 = stablehlo.reshape %165 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %167 = stablehlo.convert %166 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %168 = stablehlo.dot_general %167, %arg86, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %169 = stablehlo.broadcast_in_dim %168, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %170 = stablehlo.broadcast_in_dim %58, dims = [] : (tensor<f32>) -> tensor<197x3072xf32>
-    %171 = stablehlo.multiply %169, %170 : tensor<197x3072xf32>
-    %172 = stablehlo.broadcast_in_dim %171, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %173 = stablehlo.broadcast_in_dim %arg87, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %174 = stablehlo.add %172, %173 : tensor<197x3072xf32>
-    %175 = stablehlo.convert %174 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %176 = stablehlo.reshape %175 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %177 = stablehlo.multiply %176, %cst_4 : tensor<1x197x3072xbf16>
-    %178 = stablehlo.rsqrt %cst_3 : tensor<1x197x3072xbf16>
-    %179 = stablehlo.multiply %176, %178 : tensor<1x197x3072xbf16>
-    %180 = stablehlo.convert %179 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %181 = stablehlo.clamp %cst_5, %180, %cst_6 : tensor<1x197x3072xf32>
-    %182 = stablehlo.multiply %181, %181 : tensor<1x197x3072xf32>
-    %183 = stablehlo.multiply %cst_7, %182 : tensor<1x197x3072xf32>
-    %184 = stablehlo.add %183, %cst_8 : tensor<1x197x3072xf32>
-    %185 = stablehlo.multiply %184, %182 : tensor<1x197x3072xf32>
-    %186 = stablehlo.add %185, %cst_9 : tensor<1x197x3072xf32>
-    %187 = stablehlo.multiply %186, %182 : tensor<1x197x3072xf32>
-    %188 = stablehlo.add %187, %cst_10 : tensor<1x197x3072xf32>
-    %189 = stablehlo.multiply %188, %182 : tensor<1x197x3072xf32>
-    %190 = stablehlo.add %189, %cst_11 : tensor<1x197x3072xf32>
-    %191 = stablehlo.multiply %190, %182 : tensor<1x197x3072xf32>
-    %192 = stablehlo.add %191, %cst_12 : tensor<1x197x3072xf32>
-    %193 = stablehlo.multiply %192, %182 : tensor<1x197x3072xf32>
-    %194 = stablehlo.add %193, %cst_13 : tensor<1x197x3072xf32>
-    %195 = stablehlo.multiply %cst_14, %182 : tensor<1x197x3072xf32>
-    %196 = stablehlo.add %195, %cst_15 : tensor<1x197x3072xf32>
-    %197 = stablehlo.multiply %196, %182 : tensor<1x197x3072xf32>
-    %198 = stablehlo.add %197, %cst_16 : tensor<1x197x3072xf32>
-    %199 = stablehlo.multiply %198, %182 : tensor<1x197x3072xf32>
-    %200 = stablehlo.add %199, %cst_17 : tensor<1x197x3072xf32>
-    %201 = stablehlo.multiply %200, %182 : tensor<1x197x3072xf32>
-    %202 = stablehlo.add %201, %cst_18 : tensor<1x197x3072xf32>
-    %203 = stablehlo.multiply %181, %194 : tensor<1x197x3072xf32>
-    %204 = stablehlo.divide %203, %202 : tensor<1x197x3072xf32>
-    %205 = stablehlo.clamp %cst_19, %204, %cst_20 : tensor<1x197x3072xf32>
-    %206 = stablehlo.convert %205 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %207 = stablehlo.add %206, %cst_2 : tensor<1x197x3072xbf16>
-    %208 = stablehlo.multiply %207, %177 : tensor<1x197x3072xbf16>
-    %209 = stablehlo.reshape %208 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %210 = stablehlo.convert %209 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %211 = stablehlo.dot_general %210, %arg88, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %212 = stablehlo.broadcast_in_dim %211, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %213 = stablehlo.multiply %212, %60 : tensor<197x768xf32>
-    %214 = stablehlo.broadcast_in_dim %213, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %215 = stablehlo.broadcast_in_dim %arg89, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %216 = stablehlo.add %214, %215 : tensor<197x768xf32>
-    %217 = stablehlo.convert %216 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %218 = stablehlo.reshape %217 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %219 = stablehlo.broadcast_in_dim %arg8, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %220 = stablehlo.broadcast_in_dim %218, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %221 = stablehlo.multiply %219, %220 : tensor<1x197x768xbf16>
-    %222 = stablehlo.add %221, %128 : tensor<1x197x768xbf16>
-    %223 = stablehlo.convert %222 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %224 = stablehlo.convert %223 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %225 = stablehlo.reduce(%224 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %226 = stablehlo.reshape %225 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %227 = stablehlo.broadcast_in_dim %226, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %228 = stablehlo.divide %227, %15 : tensor<1x197x1xf64>
-    %229 = stablehlo.broadcast_in_dim %224, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %230 = stablehlo.broadcast_in_dim %228, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %231 = stablehlo.subtract %229, %230 : tensor<1x197x768xf64>
-    %232 = stablehlo.multiply %231, %231 : tensor<1x197x768xf64>
-    %233 = stablehlo.reduce(%232 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %234 = stablehlo.reshape %233 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %235 = stablehlo.broadcast_in_dim %234, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %236 = stablehlo.divide %235, %15 : tensor<1x197x1xf64>
-    %237 = stablehlo.convert %236 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %238 = stablehlo.reduce(%223 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %239 = stablehlo.reshape %238 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %240 = stablehlo.broadcast_in_dim %239, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %241 = stablehlo.divide %240, %31 : tensor<1x197x1xf32>
-    %242 = stablehlo.broadcast_in_dim %237, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %243 = stablehlo.add %242, %36 : tensor<1x197x1xf32>
-    %244 = stablehlo.rsqrt %243 : tensor<1x197x1xf32>
-    %245 = stablehlo.broadcast_in_dim %223, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %246 = stablehlo.broadcast_in_dim %241, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %247 = stablehlo.subtract %245, %246 : tensor<1x197x768xf32>
-    %248 = stablehlo.broadcast_in_dim %247, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %249 = stablehlo.broadcast_in_dim %244, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %250 = stablehlo.multiply %248, %249 : tensor<1x197x768xf32>
-    %251 = stablehlo.convert %arg9 : (tensor<768xbf16>) -> tensor<768xf32>
-    %252 = stablehlo.broadcast_in_dim %250, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %253 = stablehlo.broadcast_in_dim %251, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %254 = stablehlo.multiply %252, %253 : tensor<1x197x768xf32>
-    %255 = stablehlo.convert %arg10 : (tensor<768xbf16>) -> tensor<768xf32>
-    %256 = stablehlo.broadcast_in_dim %254, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %257 = stablehlo.broadcast_in_dim %255, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %258 = stablehlo.add %256, %257 : tensor<1x197x768xf32>
-    %259 = stablehlo.convert %258 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %260 = stablehlo.reshape %259 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %261 = stablehlo.convert %260 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %262 = stablehlo.dot_general %261, %arg90, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %263 = stablehlo.broadcast_in_dim %262, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %264 = stablehlo.multiply %263, %60 : tensor<197x768xf32>
-    %265 = stablehlo.broadcast_in_dim %264, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %266 = stablehlo.broadcast_in_dim %arg91, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %267 = stablehlo.add %265, %266 : tensor<197x768xf32>
-    %268 = stablehlo.convert %267 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %269 = stablehlo.reshape %268 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %270 = stablehlo.dot_general %260, %arg92, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %271 = stablehlo.reshape %270 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %272 = stablehlo.reshape %271 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %273 = stablehlo.transpose %272, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %274 = stablehlo.dot_general %261, %arg93, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %275 = stablehlo.broadcast_in_dim %274, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %276 = stablehlo.multiply %275, %60 : tensor<197x768xf32>
-    %277 = stablehlo.broadcast_in_dim %276, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %278 = stablehlo.broadcast_in_dim %arg94, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %279 = stablehlo.add %277, %278 : tensor<197x768xf32>
-    %280 = stablehlo.convert %279 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %281 = stablehlo.reshape %280 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %282 = stablehlo.reshape %281 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %283 = stablehlo.transpose %282, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %284 = stablehlo.reshape %269 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %285 = stablehlo.transpose %284, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %286 = stablehlo.transpose %273, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %287 = stablehlo.reshape %285 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %288 = stablehlo.reshape %286 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %289 = stablehlo.broadcast_in_dim %288, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %290 = stablehlo.dot_general %287, %289, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %291 = stablehlo.reshape %290 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %292 = stablehlo.broadcast_in_dim %291, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %293 = stablehlo.divide %292, %92 : tensor<1x12x197x197xbf16>
-    %294 = stablehlo.add %293, %arg95 : tensor<1x12x197x197xbf16>
-    %295 = stablehlo.convert %294 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %296 = stablehlo.reduce(%295 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %297 = stablehlo.reshape %296 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %298 = stablehlo.broadcast_in_dim %295, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %299 = stablehlo.broadcast_in_dim %297, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %300 = stablehlo.subtract %298, %299 : tensor<1x12x197x197xf32>
-    %301 = stablehlo.exponential %300 : tensor<1x12x197x197xf32>
-    %302 = stablehlo.reduce(%301 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %303 = stablehlo.reshape %302 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %304 = stablehlo.broadcast_in_dim %301, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %305 = stablehlo.broadcast_in_dim %303, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %306 = stablehlo.divide %304, %305 : tensor<1x12x197x197xf32>
-    %307 = stablehlo.convert %306 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %308 = stablehlo.reshape %307 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %309 = stablehlo.reshape %283 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %310 = stablehlo.broadcast_in_dim %309, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %311 = stablehlo.dot_general %308, %310, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %312 = stablehlo.reshape %311 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %313 = stablehlo.transpose %312, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %314 = stablehlo.reshape %313 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %315 = stablehlo.reshape %314 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %316 = stablehlo.convert %315 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %317 = stablehlo.dot_general %316, %arg96, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %318 = stablehlo.broadcast_in_dim %317, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %319 = stablehlo.multiply %318, %60 : tensor<197x768xf32>
-    %320 = stablehlo.broadcast_in_dim %319, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %321 = stablehlo.broadcast_in_dim %arg97, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %322 = stablehlo.add %320, %321 : tensor<197x768xf32>
-    %323 = stablehlo.convert %322 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %324 = stablehlo.reshape %323 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %325 = stablehlo.broadcast_in_dim %arg11, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %326 = stablehlo.broadcast_in_dim %324, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %327 = stablehlo.multiply %325, %326 : tensor<1x197x768xbf16>
-    %328 = stablehlo.add %327, %222 : tensor<1x197x768xbf16>
-    %329 = stablehlo.convert %328 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %330 = stablehlo.convert %329 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %331 = stablehlo.reduce(%330 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %332 = stablehlo.reshape %331 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %333 = stablehlo.broadcast_in_dim %332, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %334 = stablehlo.divide %333, %15 : tensor<1x197x1xf64>
-    %335 = stablehlo.broadcast_in_dim %330, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %336 = stablehlo.broadcast_in_dim %334, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %337 = stablehlo.subtract %335, %336 : tensor<1x197x768xf64>
-    %338 = stablehlo.multiply %337, %337 : tensor<1x197x768xf64>
-    %339 = stablehlo.reduce(%338 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %340 = stablehlo.reshape %339 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %341 = stablehlo.broadcast_in_dim %340, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %342 = stablehlo.divide %341, %15 : tensor<1x197x1xf64>
-    %343 = stablehlo.convert %342 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %344 = stablehlo.reduce(%329 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %345 = stablehlo.reshape %344 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %346 = stablehlo.broadcast_in_dim %345, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %347 = stablehlo.divide %346, %31 : tensor<1x197x1xf32>
-    %348 = stablehlo.broadcast_in_dim %343, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %349 = stablehlo.add %348, %36 : tensor<1x197x1xf32>
-    %350 = stablehlo.rsqrt %349 : tensor<1x197x1xf32>
-    %351 = stablehlo.broadcast_in_dim %329, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %352 = stablehlo.broadcast_in_dim %347, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %353 = stablehlo.subtract %351, %352 : tensor<1x197x768xf32>
-    %354 = stablehlo.broadcast_in_dim %353, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %355 = stablehlo.broadcast_in_dim %350, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %356 = stablehlo.multiply %354, %355 : tensor<1x197x768xf32>
-    %357 = stablehlo.convert %arg12 : (tensor<768xbf16>) -> tensor<768xf32>
-    %358 = stablehlo.broadcast_in_dim %356, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %359 = stablehlo.broadcast_in_dim %357, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %360 = stablehlo.multiply %358, %359 : tensor<1x197x768xf32>
-    %361 = stablehlo.convert %arg13 : (tensor<768xbf16>) -> tensor<768xf32>
-    %362 = stablehlo.broadcast_in_dim %360, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %363 = stablehlo.broadcast_in_dim %361, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %364 = stablehlo.add %362, %363 : tensor<1x197x768xf32>
-    %365 = stablehlo.convert %364 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %366 = stablehlo.reshape %365 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %367 = stablehlo.convert %366 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %368 = stablehlo.dot_general %367, %arg98, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %369 = stablehlo.broadcast_in_dim %368, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %370 = stablehlo.multiply %369, %170 : tensor<197x3072xf32>
-    %371 = stablehlo.broadcast_in_dim %370, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %372 = stablehlo.broadcast_in_dim %arg99, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %373 = stablehlo.add %371, %372 : tensor<197x3072xf32>
-    %374 = stablehlo.convert %373 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %375 = stablehlo.reshape %374 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %376 = stablehlo.multiply %375, %cst_4 : tensor<1x197x3072xbf16>
-    %377 = stablehlo.multiply %375, %178 : tensor<1x197x3072xbf16>
-    %378 = stablehlo.convert %377 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %379 = stablehlo.clamp %cst_5, %378, %cst_6 : tensor<1x197x3072xf32>
-    %380 = stablehlo.multiply %379, %379 : tensor<1x197x3072xf32>
-    %381 = stablehlo.multiply %cst_7, %380 : tensor<1x197x3072xf32>
-    %382 = stablehlo.add %381, %cst_8 : tensor<1x197x3072xf32>
-    %383 = stablehlo.multiply %382, %380 : tensor<1x197x3072xf32>
-    %384 = stablehlo.add %383, %cst_9 : tensor<1x197x3072xf32>
-    %385 = stablehlo.multiply %384, %380 : tensor<1x197x3072xf32>
-    %386 = stablehlo.add %385, %cst_10 : tensor<1x197x3072xf32>
-    %387 = stablehlo.multiply %386, %380 : tensor<1x197x3072xf32>
-    %388 = stablehlo.add %387, %cst_11 : tensor<1x197x3072xf32>
-    %389 = stablehlo.multiply %388, %380 : tensor<1x197x3072xf32>
-    %390 = stablehlo.add %389, %cst_12 : tensor<1x197x3072xf32>
-    %391 = stablehlo.multiply %390, %380 : tensor<1x197x3072xf32>
-    %392 = stablehlo.add %391, %cst_13 : tensor<1x197x3072xf32>
-    %393 = stablehlo.multiply %cst_14, %380 : tensor<1x197x3072xf32>
-    %394 = stablehlo.add %393, %cst_15 : tensor<1x197x3072xf32>
-    %395 = stablehlo.multiply %394, %380 : tensor<1x197x3072xf32>
-    %396 = stablehlo.add %395, %cst_16 : tensor<1x197x3072xf32>
-    %397 = stablehlo.multiply %396, %380 : tensor<1x197x3072xf32>
-    %398 = stablehlo.add %397, %cst_17 : tensor<1x197x3072xf32>
-    %399 = stablehlo.multiply %398, %380 : tensor<1x197x3072xf32>
-    %400 = stablehlo.add %399, %cst_18 : tensor<1x197x3072xf32>
-    %401 = stablehlo.multiply %379, %392 : tensor<1x197x3072xf32>
-    %402 = stablehlo.divide %401, %400 : tensor<1x197x3072xf32>
-    %403 = stablehlo.clamp %cst_19, %402, %cst_20 : tensor<1x197x3072xf32>
-    %404 = stablehlo.convert %403 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %405 = stablehlo.add %404, %cst_2 : tensor<1x197x3072xbf16>
-    %406 = stablehlo.multiply %405, %376 : tensor<1x197x3072xbf16>
-    %407 = stablehlo.reshape %406 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %408 = stablehlo.convert %407 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %409 = stablehlo.dot_general %408, %arg100, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %410 = stablehlo.broadcast_in_dim %409, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %411 = stablehlo.multiply %410, %60 : tensor<197x768xf32>
-    %412 = stablehlo.broadcast_in_dim %411, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %413 = stablehlo.broadcast_in_dim %arg101, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %414 = stablehlo.add %412, %413 : tensor<197x768xf32>
-    %415 = stablehlo.convert %414 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %416 = stablehlo.reshape %415 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %417 = stablehlo.broadcast_in_dim %arg14, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %418 = stablehlo.broadcast_in_dim %416, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %419 = stablehlo.multiply %417, %418 : tensor<1x197x768xbf16>
-    %420 = stablehlo.add %419, %328 : tensor<1x197x768xbf16>
-    %421 = stablehlo.convert %420 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %422 = stablehlo.convert %421 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %423 = stablehlo.reduce(%422 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %424 = stablehlo.reshape %423 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %425 = stablehlo.broadcast_in_dim %424, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %426 = stablehlo.divide %425, %15 : tensor<1x197x1xf64>
-    %427 = stablehlo.broadcast_in_dim %422, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %428 = stablehlo.broadcast_in_dim %426, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %429 = stablehlo.subtract %427, %428 : tensor<1x197x768xf64>
-    %430 = stablehlo.multiply %429, %429 : tensor<1x197x768xf64>
-    %431 = stablehlo.reduce(%430 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %432 = stablehlo.reshape %431 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %433 = stablehlo.broadcast_in_dim %432, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %434 = stablehlo.divide %433, %15 : tensor<1x197x1xf64>
-    %435 = stablehlo.convert %434 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %436 = stablehlo.reduce(%421 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %437 = stablehlo.reshape %436 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %438 = stablehlo.broadcast_in_dim %437, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %439 = stablehlo.divide %438, %31 : tensor<1x197x1xf32>
-    %440 = stablehlo.broadcast_in_dim %435, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %441 = stablehlo.add %440, %36 : tensor<1x197x1xf32>
-    %442 = stablehlo.rsqrt %441 : tensor<1x197x1xf32>
-    %443 = stablehlo.broadcast_in_dim %421, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %444 = stablehlo.broadcast_in_dim %439, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %445 = stablehlo.subtract %443, %444 : tensor<1x197x768xf32>
-    %446 = stablehlo.broadcast_in_dim %445, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %447 = stablehlo.broadcast_in_dim %442, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %448 = stablehlo.multiply %446, %447 : tensor<1x197x768xf32>
-    %449 = stablehlo.convert %arg15 : (tensor<768xbf16>) -> tensor<768xf32>
-    %450 = stablehlo.broadcast_in_dim %448, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %451 = stablehlo.broadcast_in_dim %449, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %452 = stablehlo.multiply %450, %451 : tensor<1x197x768xf32>
-    %453 = stablehlo.convert %arg16 : (tensor<768xbf16>) -> tensor<768xf32>
-    %454 = stablehlo.broadcast_in_dim %452, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %455 = stablehlo.broadcast_in_dim %453, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %456 = stablehlo.add %454, %455 : tensor<1x197x768xf32>
-    %457 = stablehlo.convert %456 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %458 = stablehlo.reshape %457 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %459 = stablehlo.convert %458 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %460 = stablehlo.dot_general %459, %arg102, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %461 = stablehlo.broadcast_in_dim %460, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %462 = stablehlo.multiply %461, %60 : tensor<197x768xf32>
-    %463 = stablehlo.broadcast_in_dim %462, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %464 = stablehlo.broadcast_in_dim %arg103, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %465 = stablehlo.add %463, %464 : tensor<197x768xf32>
-    %466 = stablehlo.convert %465 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %467 = stablehlo.reshape %466 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %468 = stablehlo.dot_general %458, %arg104, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %469 = stablehlo.reshape %468 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %470 = stablehlo.reshape %469 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %471 = stablehlo.transpose %470, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %472 = stablehlo.dot_general %459, %arg105, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %473 = stablehlo.broadcast_in_dim %472, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %474 = stablehlo.multiply %473, %60 : tensor<197x768xf32>
-    %475 = stablehlo.broadcast_in_dim %474, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %476 = stablehlo.broadcast_in_dim %arg106, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %477 = stablehlo.add %475, %476 : tensor<197x768xf32>
-    %478 = stablehlo.convert %477 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %479 = stablehlo.reshape %478 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %480 = stablehlo.reshape %479 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %481 = stablehlo.transpose %480, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %482 = stablehlo.reshape %467 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %483 = stablehlo.transpose %482, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %484 = stablehlo.transpose %471, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %485 = stablehlo.reshape %483 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %486 = stablehlo.reshape %484 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %487 = stablehlo.broadcast_in_dim %486, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %488 = stablehlo.dot_general %485, %487, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %489 = stablehlo.reshape %488 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %490 = stablehlo.broadcast_in_dim %489, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %491 = stablehlo.divide %490, %92 : tensor<1x12x197x197xbf16>
-    %492 = stablehlo.add %491, %arg107 : tensor<1x12x197x197xbf16>
-    %493 = stablehlo.convert %492 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %494 = stablehlo.reduce(%493 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %495 = stablehlo.reshape %494 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %496 = stablehlo.broadcast_in_dim %493, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %497 = stablehlo.broadcast_in_dim %495, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %498 = stablehlo.subtract %496, %497 : tensor<1x12x197x197xf32>
-    %499 = stablehlo.exponential %498 : tensor<1x12x197x197xf32>
-    %500 = stablehlo.reduce(%499 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %501 = stablehlo.reshape %500 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %502 = stablehlo.broadcast_in_dim %499, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %503 = stablehlo.broadcast_in_dim %501, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %504 = stablehlo.divide %502, %503 : tensor<1x12x197x197xf32>
-    %505 = stablehlo.convert %504 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %506 = stablehlo.reshape %505 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %507 = stablehlo.reshape %481 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %508 = stablehlo.broadcast_in_dim %507, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %509 = stablehlo.dot_general %506, %508, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %510 = stablehlo.reshape %509 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %511 = stablehlo.transpose %510, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %512 = stablehlo.reshape %511 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %513 = stablehlo.reshape %512 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %514 = stablehlo.convert %513 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %515 = stablehlo.dot_general %514, %arg108, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %516 = stablehlo.broadcast_in_dim %515, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %517 = stablehlo.multiply %516, %60 : tensor<197x768xf32>
-    %518 = stablehlo.broadcast_in_dim %517, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %519 = stablehlo.broadcast_in_dim %arg109, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %520 = stablehlo.add %518, %519 : tensor<197x768xf32>
-    %521 = stablehlo.convert %520 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %522 = stablehlo.reshape %521 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %523 = stablehlo.broadcast_in_dim %arg17, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %524 = stablehlo.broadcast_in_dim %522, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %525 = stablehlo.multiply %523, %524 : tensor<1x197x768xbf16>
-    %526 = stablehlo.add %525, %420 : tensor<1x197x768xbf16>
-    %527 = stablehlo.convert %526 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %528 = stablehlo.convert %527 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %529 = stablehlo.reduce(%528 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %530 = stablehlo.reshape %529 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %531 = stablehlo.broadcast_in_dim %530, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %532 = stablehlo.divide %531, %15 : tensor<1x197x1xf64>
-    %533 = stablehlo.broadcast_in_dim %528, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %534 = stablehlo.broadcast_in_dim %532, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %535 = stablehlo.subtract %533, %534 : tensor<1x197x768xf64>
-    %536 = stablehlo.multiply %535, %535 : tensor<1x197x768xf64>
-    %537 = stablehlo.reduce(%536 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %538 = stablehlo.reshape %537 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %539 = stablehlo.broadcast_in_dim %538, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %540 = stablehlo.divide %539, %15 : tensor<1x197x1xf64>
-    %541 = stablehlo.convert %540 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %542 = stablehlo.reduce(%527 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %543 = stablehlo.reshape %542 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %544 = stablehlo.broadcast_in_dim %543, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %545 = stablehlo.divide %544, %31 : tensor<1x197x1xf32>
-    %546 = stablehlo.broadcast_in_dim %541, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %547 = stablehlo.add %546, %36 : tensor<1x197x1xf32>
-    %548 = stablehlo.rsqrt %547 : tensor<1x197x1xf32>
-    %549 = stablehlo.broadcast_in_dim %527, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %550 = stablehlo.broadcast_in_dim %545, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %551 = stablehlo.subtract %549, %550 : tensor<1x197x768xf32>
-    %552 = stablehlo.broadcast_in_dim %551, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %553 = stablehlo.broadcast_in_dim %548, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %554 = stablehlo.multiply %552, %553 : tensor<1x197x768xf32>
-    %555 = stablehlo.convert %arg18 : (tensor<768xbf16>) -> tensor<768xf32>
-    %556 = stablehlo.broadcast_in_dim %554, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %557 = stablehlo.broadcast_in_dim %555, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %558 = stablehlo.multiply %556, %557 : tensor<1x197x768xf32>
-    %559 = stablehlo.convert %arg19 : (tensor<768xbf16>) -> tensor<768xf32>
-    %560 = stablehlo.broadcast_in_dim %558, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %561 = stablehlo.broadcast_in_dim %559, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %562 = stablehlo.add %560, %561 : tensor<1x197x768xf32>
-    %563 = stablehlo.convert %562 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %564 = stablehlo.reshape %563 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %565 = stablehlo.convert %564 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %566 = stablehlo.dot_general %565, %arg110, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %567 = stablehlo.broadcast_in_dim %566, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %568 = stablehlo.multiply %567, %170 : tensor<197x3072xf32>
-    %569 = stablehlo.broadcast_in_dim %568, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %570 = stablehlo.broadcast_in_dim %arg111, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %571 = stablehlo.add %569, %570 : tensor<197x3072xf32>
-    %572 = stablehlo.convert %571 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %573 = stablehlo.reshape %572 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %574 = stablehlo.multiply %573, %cst_4 : tensor<1x197x3072xbf16>
-    %575 = stablehlo.multiply %573, %178 : tensor<1x197x3072xbf16>
-    %576 = stablehlo.convert %575 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %577 = stablehlo.clamp %cst_5, %576, %cst_6 : tensor<1x197x3072xf32>
-    %578 = stablehlo.multiply %577, %577 : tensor<1x197x3072xf32>
-    %579 = stablehlo.multiply %cst_7, %578 : tensor<1x197x3072xf32>
-    %580 = stablehlo.add %579, %cst_8 : tensor<1x197x3072xf32>
-    %581 = stablehlo.multiply %580, %578 : tensor<1x197x3072xf32>
-    %582 = stablehlo.add %581, %cst_9 : tensor<1x197x3072xf32>
-    %583 = stablehlo.multiply %582, %578 : tensor<1x197x3072xf32>
-    %584 = stablehlo.add %583, %cst_10 : tensor<1x197x3072xf32>
-    %585 = stablehlo.multiply %584, %578 : tensor<1x197x3072xf32>
-    %586 = stablehlo.add %585, %cst_11 : tensor<1x197x3072xf32>
-    %587 = stablehlo.multiply %586, %578 : tensor<1x197x3072xf32>
-    %588 = stablehlo.add %587, %cst_12 : tensor<1x197x3072xf32>
-    %589 = stablehlo.multiply %588, %578 : tensor<1x197x3072xf32>
-    %590 = stablehlo.add %589, %cst_13 : tensor<1x197x3072xf32>
-    %591 = stablehlo.multiply %cst_14, %578 : tensor<1x197x3072xf32>
-    %592 = stablehlo.add %591, %cst_15 : tensor<1x197x3072xf32>
-    %593 = stablehlo.multiply %592, %578 : tensor<1x197x3072xf32>
-    %594 = stablehlo.add %593, %cst_16 : tensor<1x197x3072xf32>
-    %595 = stablehlo.multiply %594, %578 : tensor<1x197x3072xf32>
-    %596 = stablehlo.add %595, %cst_17 : tensor<1x197x3072xf32>
-    %597 = stablehlo.multiply %596, %578 : tensor<1x197x3072xf32>
-    %598 = stablehlo.add %597, %cst_18 : tensor<1x197x3072xf32>
-    %599 = stablehlo.multiply %577, %590 : tensor<1x197x3072xf32>
-    %600 = stablehlo.divide %599, %598 : tensor<1x197x3072xf32>
-    %601 = stablehlo.clamp %cst_19, %600, %cst_20 : tensor<1x197x3072xf32>
-    %602 = stablehlo.convert %601 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %603 = stablehlo.add %602, %cst_2 : tensor<1x197x3072xbf16>
-    %604 = stablehlo.multiply %603, %574 : tensor<1x197x3072xbf16>
-    %605 = stablehlo.reshape %604 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %606 = stablehlo.convert %605 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %607 = stablehlo.dot_general %606, %arg112, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %608 = stablehlo.broadcast_in_dim %607, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %609 = stablehlo.multiply %608, %60 : tensor<197x768xf32>
-    %610 = stablehlo.broadcast_in_dim %609, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %611 = stablehlo.broadcast_in_dim %arg113, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %612 = stablehlo.add %610, %611 : tensor<197x768xf32>
-    %613 = stablehlo.convert %612 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %614 = stablehlo.reshape %613 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %615 = stablehlo.broadcast_in_dim %arg20, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %616 = stablehlo.broadcast_in_dim %614, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %617 = stablehlo.multiply %615, %616 : tensor<1x197x768xbf16>
-    %618 = stablehlo.add %617, %526 : tensor<1x197x768xbf16>
-    %619 = stablehlo.convert %618 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %620 = stablehlo.convert %619 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %621 = stablehlo.reduce(%620 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %622 = stablehlo.reshape %621 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %623 = stablehlo.broadcast_in_dim %622, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %624 = stablehlo.divide %623, %15 : tensor<1x197x1xf64>
-    %625 = stablehlo.broadcast_in_dim %620, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %626 = stablehlo.broadcast_in_dim %624, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %627 = stablehlo.subtract %625, %626 : tensor<1x197x768xf64>
-    %628 = stablehlo.multiply %627, %627 : tensor<1x197x768xf64>
-    %629 = stablehlo.reduce(%628 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %630 = stablehlo.reshape %629 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %631 = stablehlo.broadcast_in_dim %630, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %632 = stablehlo.divide %631, %15 : tensor<1x197x1xf64>
-    %633 = stablehlo.convert %632 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %634 = stablehlo.reduce(%619 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %635 = stablehlo.reshape %634 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %636 = stablehlo.broadcast_in_dim %635, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %637 = stablehlo.divide %636, %31 : tensor<1x197x1xf32>
-    %638 = stablehlo.broadcast_in_dim %633, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %639 = stablehlo.add %638, %36 : tensor<1x197x1xf32>
-    %640 = stablehlo.rsqrt %639 : tensor<1x197x1xf32>
-    %641 = stablehlo.broadcast_in_dim %619, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %642 = stablehlo.broadcast_in_dim %637, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %643 = stablehlo.subtract %641, %642 : tensor<1x197x768xf32>
-    %644 = stablehlo.broadcast_in_dim %643, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %645 = stablehlo.broadcast_in_dim %640, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %646 = stablehlo.multiply %644, %645 : tensor<1x197x768xf32>
-    %647 = stablehlo.convert %arg21 : (tensor<768xbf16>) -> tensor<768xf32>
-    %648 = stablehlo.broadcast_in_dim %646, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %649 = stablehlo.broadcast_in_dim %647, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %650 = stablehlo.multiply %648, %649 : tensor<1x197x768xf32>
-    %651 = stablehlo.convert %arg22 : (tensor<768xbf16>) -> tensor<768xf32>
-    %652 = stablehlo.broadcast_in_dim %650, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %653 = stablehlo.broadcast_in_dim %651, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %654 = stablehlo.add %652, %653 : tensor<1x197x768xf32>
-    %655 = stablehlo.convert %654 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %656 = stablehlo.reshape %655 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %657 = stablehlo.convert %656 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %658 = stablehlo.dot_general %657, %arg114, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %659 = stablehlo.broadcast_in_dim %658, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %660 = stablehlo.multiply %659, %60 : tensor<197x768xf32>
-    %661 = stablehlo.broadcast_in_dim %660, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %662 = stablehlo.broadcast_in_dim %arg115, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %663 = stablehlo.add %661, %662 : tensor<197x768xf32>
-    %664 = stablehlo.convert %663 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %665 = stablehlo.reshape %664 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %666 = stablehlo.dot_general %656, %arg116, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %667 = stablehlo.reshape %666 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %668 = stablehlo.reshape %667 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %669 = stablehlo.transpose %668, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %670 = stablehlo.dot_general %657, %arg117, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %671 = stablehlo.broadcast_in_dim %670, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %672 = stablehlo.multiply %671, %60 : tensor<197x768xf32>
-    %673 = stablehlo.broadcast_in_dim %672, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %674 = stablehlo.broadcast_in_dim %arg118, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %675 = stablehlo.add %673, %674 : tensor<197x768xf32>
-    %676 = stablehlo.convert %675 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %677 = stablehlo.reshape %676 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %678 = stablehlo.reshape %677 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %679 = stablehlo.transpose %678, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %680 = stablehlo.reshape %665 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %681 = stablehlo.transpose %680, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %682 = stablehlo.transpose %669, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %683 = stablehlo.reshape %681 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %684 = stablehlo.reshape %682 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %685 = stablehlo.broadcast_in_dim %684, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %686 = stablehlo.dot_general %683, %685, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %687 = stablehlo.reshape %686 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %688 = stablehlo.broadcast_in_dim %687, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %689 = stablehlo.divide %688, %92 : tensor<1x12x197x197xbf16>
-    %690 = stablehlo.add %689, %arg119 : tensor<1x12x197x197xbf16>
-    %691 = stablehlo.convert %690 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %692 = stablehlo.reduce(%691 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %693 = stablehlo.reshape %692 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %694 = stablehlo.broadcast_in_dim %691, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %695 = stablehlo.broadcast_in_dim %693, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %696 = stablehlo.subtract %694, %695 : tensor<1x12x197x197xf32>
-    %697 = stablehlo.exponential %696 : tensor<1x12x197x197xf32>
-    %698 = stablehlo.reduce(%697 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %699 = stablehlo.reshape %698 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %700 = stablehlo.broadcast_in_dim %697, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %701 = stablehlo.broadcast_in_dim %699, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %702 = stablehlo.divide %700, %701 : tensor<1x12x197x197xf32>
-    %703 = stablehlo.convert %702 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %704 = stablehlo.reshape %703 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %705 = stablehlo.reshape %679 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %706 = stablehlo.broadcast_in_dim %705, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %707 = stablehlo.dot_general %704, %706, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %708 = stablehlo.reshape %707 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %709 = stablehlo.transpose %708, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %710 = stablehlo.reshape %709 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %711 = stablehlo.reshape %710 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %712 = stablehlo.convert %711 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %713 = stablehlo.dot_general %712, %arg120, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %714 = stablehlo.broadcast_in_dim %713, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %715 = stablehlo.multiply %714, %60 : tensor<197x768xf32>
-    %716 = stablehlo.broadcast_in_dim %715, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %717 = stablehlo.broadcast_in_dim %arg121, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %718 = stablehlo.add %716, %717 : tensor<197x768xf32>
-    %719 = stablehlo.convert %718 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %720 = stablehlo.reshape %719 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %721 = stablehlo.broadcast_in_dim %arg23, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %722 = stablehlo.broadcast_in_dim %720, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %723 = stablehlo.multiply %721, %722 : tensor<1x197x768xbf16>
-    %724 = stablehlo.add %723, %618 : tensor<1x197x768xbf16>
-    %725 = stablehlo.convert %724 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %726 = stablehlo.convert %725 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %727 = stablehlo.reduce(%726 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %728 = stablehlo.reshape %727 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %729 = stablehlo.broadcast_in_dim %728, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %730 = stablehlo.divide %729, %15 : tensor<1x197x1xf64>
-    %731 = stablehlo.broadcast_in_dim %726, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %732 = stablehlo.broadcast_in_dim %730, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %733 = stablehlo.subtract %731, %732 : tensor<1x197x768xf64>
-    %734 = stablehlo.multiply %733, %733 : tensor<1x197x768xf64>
-    %735 = stablehlo.reduce(%734 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %736 = stablehlo.reshape %735 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %737 = stablehlo.broadcast_in_dim %736, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %738 = stablehlo.divide %737, %15 : tensor<1x197x1xf64>
-    %739 = stablehlo.convert %738 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %740 = stablehlo.reduce(%725 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %741 = stablehlo.reshape %740 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %742 = stablehlo.broadcast_in_dim %741, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %743 = stablehlo.divide %742, %31 : tensor<1x197x1xf32>
-    %744 = stablehlo.broadcast_in_dim %739, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %745 = stablehlo.add %744, %36 : tensor<1x197x1xf32>
-    %746 = stablehlo.rsqrt %745 : tensor<1x197x1xf32>
-    %747 = stablehlo.broadcast_in_dim %725, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %748 = stablehlo.broadcast_in_dim %743, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %749 = stablehlo.subtract %747, %748 : tensor<1x197x768xf32>
-    %750 = stablehlo.broadcast_in_dim %749, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %751 = stablehlo.broadcast_in_dim %746, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %752 = stablehlo.multiply %750, %751 : tensor<1x197x768xf32>
-    %753 = stablehlo.convert %arg24 : (tensor<768xbf16>) -> tensor<768xf32>
-    %754 = stablehlo.broadcast_in_dim %752, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %755 = stablehlo.broadcast_in_dim %753, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %756 = stablehlo.multiply %754, %755 : tensor<1x197x768xf32>
-    %757 = stablehlo.convert %arg25 : (tensor<768xbf16>) -> tensor<768xf32>
-    %758 = stablehlo.broadcast_in_dim %756, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %759 = stablehlo.broadcast_in_dim %757, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %760 = stablehlo.add %758, %759 : tensor<1x197x768xf32>
-    %761 = stablehlo.convert %760 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %762 = stablehlo.reshape %761 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %763 = stablehlo.convert %762 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %764 = stablehlo.dot_general %763, %arg122, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %765 = stablehlo.broadcast_in_dim %764, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %766 = stablehlo.multiply %765, %170 : tensor<197x3072xf32>
-    %767 = stablehlo.broadcast_in_dim %766, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %768 = stablehlo.broadcast_in_dim %arg123, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %769 = stablehlo.add %767, %768 : tensor<197x3072xf32>
-    %770 = stablehlo.convert %769 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %771 = stablehlo.reshape %770 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %772 = stablehlo.multiply %771, %cst_4 : tensor<1x197x3072xbf16>
-    %773 = stablehlo.multiply %771, %178 : tensor<1x197x3072xbf16>
-    %774 = stablehlo.convert %773 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %775 = stablehlo.clamp %cst_5, %774, %cst_6 : tensor<1x197x3072xf32>
-    %776 = stablehlo.multiply %775, %775 : tensor<1x197x3072xf32>
-    %777 = stablehlo.multiply %cst_7, %776 : tensor<1x197x3072xf32>
-    %778 = stablehlo.add %777, %cst_8 : tensor<1x197x3072xf32>
-    %779 = stablehlo.multiply %778, %776 : tensor<1x197x3072xf32>
-    %780 = stablehlo.add %779, %cst_9 : tensor<1x197x3072xf32>
-    %781 = stablehlo.multiply %780, %776 : tensor<1x197x3072xf32>
-    %782 = stablehlo.add %781, %cst_10 : tensor<1x197x3072xf32>
-    %783 = stablehlo.multiply %782, %776 : tensor<1x197x3072xf32>
-    %784 = stablehlo.add %783, %cst_11 : tensor<1x197x3072xf32>
-    %785 = stablehlo.multiply %784, %776 : tensor<1x197x3072xf32>
-    %786 = stablehlo.add %785, %cst_12 : tensor<1x197x3072xf32>
-    %787 = stablehlo.multiply %786, %776 : tensor<1x197x3072xf32>
-    %788 = stablehlo.add %787, %cst_13 : tensor<1x197x3072xf32>
-    %789 = stablehlo.multiply %cst_14, %776 : tensor<1x197x3072xf32>
-    %790 = stablehlo.add %789, %cst_15 : tensor<1x197x3072xf32>
-    %791 = stablehlo.multiply %790, %776 : tensor<1x197x3072xf32>
-    %792 = stablehlo.add %791, %cst_16 : tensor<1x197x3072xf32>
-    %793 = stablehlo.multiply %792, %776 : tensor<1x197x3072xf32>
-    %794 = stablehlo.add %793, %cst_17 : tensor<1x197x3072xf32>
-    %795 = stablehlo.multiply %794, %776 : tensor<1x197x3072xf32>
-    %796 = stablehlo.add %795, %cst_18 : tensor<1x197x3072xf32>
-    %797 = stablehlo.multiply %775, %788 : tensor<1x197x3072xf32>
-    %798 = stablehlo.divide %797, %796 : tensor<1x197x3072xf32>
-    %799 = stablehlo.clamp %cst_19, %798, %cst_20 : tensor<1x197x3072xf32>
-    %800 = stablehlo.convert %799 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %801 = stablehlo.add %800, %cst_2 : tensor<1x197x3072xbf16>
-    %802 = stablehlo.multiply %801, %772 : tensor<1x197x3072xbf16>
-    %803 = stablehlo.reshape %802 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %804 = stablehlo.convert %803 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %805 = stablehlo.dot_general %804, %arg124, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %806 = stablehlo.broadcast_in_dim %805, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %807 = stablehlo.multiply %806, %60 : tensor<197x768xf32>
-    %808 = stablehlo.broadcast_in_dim %807, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %809 = stablehlo.broadcast_in_dim %arg125, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %810 = stablehlo.add %808, %809 : tensor<197x768xf32>
-    %811 = stablehlo.convert %810 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %812 = stablehlo.reshape %811 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %813 = stablehlo.broadcast_in_dim %arg26, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %814 = stablehlo.broadcast_in_dim %812, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %815 = stablehlo.multiply %813, %814 : tensor<1x197x768xbf16>
-    %816 = stablehlo.add %815, %724 : tensor<1x197x768xbf16>
-    %817 = stablehlo.convert %816 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %818 = stablehlo.convert %817 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %819 = stablehlo.reduce(%818 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %820 = stablehlo.reshape %819 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %821 = stablehlo.broadcast_in_dim %820, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %822 = stablehlo.divide %821, %15 : tensor<1x197x1xf64>
-    %823 = stablehlo.broadcast_in_dim %818, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %824 = stablehlo.broadcast_in_dim %822, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %825 = stablehlo.subtract %823, %824 : tensor<1x197x768xf64>
-    %826 = stablehlo.multiply %825, %825 : tensor<1x197x768xf64>
-    %827 = stablehlo.reduce(%826 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %828 = stablehlo.reshape %827 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %829 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %830 = stablehlo.divide %829, %15 : tensor<1x197x1xf64>
-    %831 = stablehlo.convert %830 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %832 = stablehlo.reduce(%817 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %833 = stablehlo.reshape %832 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %834 = stablehlo.broadcast_in_dim %833, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %835 = stablehlo.divide %834, %31 : tensor<1x197x1xf32>
-    %836 = stablehlo.broadcast_in_dim %831, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %837 = stablehlo.add %836, %36 : tensor<1x197x1xf32>
-    %838 = stablehlo.rsqrt %837 : tensor<1x197x1xf32>
-    %839 = stablehlo.broadcast_in_dim %817, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %840 = stablehlo.broadcast_in_dim %835, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %841 = stablehlo.subtract %839, %840 : tensor<1x197x768xf32>
-    %842 = stablehlo.broadcast_in_dim %841, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %843 = stablehlo.broadcast_in_dim %838, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %844 = stablehlo.multiply %842, %843 : tensor<1x197x768xf32>
-    %845 = stablehlo.convert %arg27 : (tensor<768xbf16>) -> tensor<768xf32>
-    %846 = stablehlo.broadcast_in_dim %844, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %847 = stablehlo.broadcast_in_dim %845, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %848 = stablehlo.multiply %846, %847 : tensor<1x197x768xf32>
-    %849 = stablehlo.convert %arg28 : (tensor<768xbf16>) -> tensor<768xf32>
-    %850 = stablehlo.broadcast_in_dim %848, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %851 = stablehlo.broadcast_in_dim %849, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %852 = stablehlo.add %850, %851 : tensor<1x197x768xf32>
-    %853 = stablehlo.convert %852 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %854 = stablehlo.reshape %853 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %855 = stablehlo.convert %854 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %856 = stablehlo.dot_general %855, %arg126, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %857 = stablehlo.broadcast_in_dim %856, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %858 = stablehlo.multiply %857, %60 : tensor<197x768xf32>
-    %859 = stablehlo.broadcast_in_dim %858, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %860 = stablehlo.broadcast_in_dim %arg127, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %861 = stablehlo.add %859, %860 : tensor<197x768xf32>
-    %862 = stablehlo.convert %861 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %863 = stablehlo.reshape %862 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %864 = stablehlo.dot_general %854, %arg128, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %865 = stablehlo.reshape %864 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %866 = stablehlo.reshape %865 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %867 = stablehlo.transpose %866, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %868 = stablehlo.dot_general %855, %arg129, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %869 = stablehlo.broadcast_in_dim %868, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %870 = stablehlo.multiply %869, %60 : tensor<197x768xf32>
-    %871 = stablehlo.broadcast_in_dim %870, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %872 = stablehlo.broadcast_in_dim %arg130, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %873 = stablehlo.add %871, %872 : tensor<197x768xf32>
-    %874 = stablehlo.convert %873 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %875 = stablehlo.reshape %874 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %876 = stablehlo.reshape %875 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %877 = stablehlo.transpose %876, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %878 = stablehlo.reshape %863 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %879 = stablehlo.transpose %878, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %880 = stablehlo.transpose %867, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %881 = stablehlo.reshape %879 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %882 = stablehlo.reshape %880 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %883 = stablehlo.broadcast_in_dim %882, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %884 = stablehlo.dot_general %881, %883, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %885 = stablehlo.reshape %884 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %886 = stablehlo.broadcast_in_dim %885, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %887 = stablehlo.divide %886, %92 : tensor<1x12x197x197xbf16>
-    %888 = stablehlo.add %887, %arg131 : tensor<1x12x197x197xbf16>
-    %889 = stablehlo.convert %888 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %890 = stablehlo.reduce(%889 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %891 = stablehlo.reshape %890 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %892 = stablehlo.broadcast_in_dim %889, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %893 = stablehlo.broadcast_in_dim %891, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %894 = stablehlo.subtract %892, %893 : tensor<1x12x197x197xf32>
-    %895 = stablehlo.exponential %894 : tensor<1x12x197x197xf32>
-    %896 = stablehlo.reduce(%895 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %897 = stablehlo.reshape %896 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %898 = stablehlo.broadcast_in_dim %895, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %899 = stablehlo.broadcast_in_dim %897, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %900 = stablehlo.divide %898, %899 : tensor<1x12x197x197xf32>
-    %901 = stablehlo.convert %900 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %902 = stablehlo.reshape %901 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %903 = stablehlo.reshape %877 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %904 = stablehlo.broadcast_in_dim %903, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %905 = stablehlo.dot_general %902, %904, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %906 = stablehlo.reshape %905 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %907 = stablehlo.transpose %906, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %908 = stablehlo.reshape %907 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %909 = stablehlo.reshape %908 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %910 = stablehlo.convert %909 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %911 = stablehlo.dot_general %910, %arg132, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %912 = stablehlo.broadcast_in_dim %911, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %913 = stablehlo.multiply %912, %60 : tensor<197x768xf32>
-    %914 = stablehlo.broadcast_in_dim %913, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %915 = stablehlo.broadcast_in_dim %arg133, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %916 = stablehlo.add %914, %915 : tensor<197x768xf32>
-    %917 = stablehlo.convert %916 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %918 = stablehlo.reshape %917 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %919 = stablehlo.broadcast_in_dim %arg29, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %920 = stablehlo.broadcast_in_dim %918, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %921 = stablehlo.multiply %919, %920 : tensor<1x197x768xbf16>
-    %922 = stablehlo.add %921, %816 : tensor<1x197x768xbf16>
-    %923 = stablehlo.convert %922 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %924 = stablehlo.convert %923 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %925 = stablehlo.reduce(%924 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %926 = stablehlo.reshape %925 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %927 = stablehlo.broadcast_in_dim %926, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %928 = stablehlo.divide %927, %15 : tensor<1x197x1xf64>
-    %929 = stablehlo.broadcast_in_dim %924, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %930 = stablehlo.broadcast_in_dim %928, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %931 = stablehlo.subtract %929, %930 : tensor<1x197x768xf64>
-    %932 = stablehlo.multiply %931, %931 : tensor<1x197x768xf64>
-    %933 = stablehlo.reduce(%932 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %934 = stablehlo.reshape %933 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %935 = stablehlo.broadcast_in_dim %934, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %936 = stablehlo.divide %935, %15 : tensor<1x197x1xf64>
-    %937 = stablehlo.convert %936 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %938 = stablehlo.reduce(%923 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %939 = stablehlo.reshape %938 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %940 = stablehlo.broadcast_in_dim %939, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %941 = stablehlo.divide %940, %31 : tensor<1x197x1xf32>
-    %942 = stablehlo.broadcast_in_dim %937, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %943 = stablehlo.add %942, %36 : tensor<1x197x1xf32>
-    %944 = stablehlo.rsqrt %943 : tensor<1x197x1xf32>
-    %945 = stablehlo.broadcast_in_dim %923, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %946 = stablehlo.broadcast_in_dim %941, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %947 = stablehlo.subtract %945, %946 : tensor<1x197x768xf32>
-    %948 = stablehlo.broadcast_in_dim %947, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %949 = stablehlo.broadcast_in_dim %944, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %950 = stablehlo.multiply %948, %949 : tensor<1x197x768xf32>
-    %951 = stablehlo.convert %arg30 : (tensor<768xbf16>) -> tensor<768xf32>
-    %952 = stablehlo.broadcast_in_dim %950, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %953 = stablehlo.broadcast_in_dim %951, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %954 = stablehlo.multiply %952, %953 : tensor<1x197x768xf32>
-    %955 = stablehlo.convert %arg31 : (tensor<768xbf16>) -> tensor<768xf32>
-    %956 = stablehlo.broadcast_in_dim %954, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %957 = stablehlo.broadcast_in_dim %955, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %958 = stablehlo.add %956, %957 : tensor<1x197x768xf32>
-    %959 = stablehlo.convert %958 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %960 = stablehlo.reshape %959 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %961 = stablehlo.convert %960 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %962 = stablehlo.dot_general %961, %arg134, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %963 = stablehlo.broadcast_in_dim %962, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %964 = stablehlo.multiply %963, %170 : tensor<197x3072xf32>
-    %965 = stablehlo.broadcast_in_dim %964, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %966 = stablehlo.broadcast_in_dim %arg135, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %967 = stablehlo.add %965, %966 : tensor<197x3072xf32>
-    %968 = stablehlo.convert %967 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %969 = stablehlo.reshape %968 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %970 = stablehlo.multiply %969, %cst_4 : tensor<1x197x3072xbf16>
-    %971 = stablehlo.multiply %969, %178 : tensor<1x197x3072xbf16>
-    %972 = stablehlo.convert %971 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %973 = stablehlo.clamp %cst_5, %972, %cst_6 : tensor<1x197x3072xf32>
-    %974 = stablehlo.multiply %973, %973 : tensor<1x197x3072xf32>
-    %975 = stablehlo.multiply %cst_7, %974 : tensor<1x197x3072xf32>
-    %976 = stablehlo.add %975, %cst_8 : tensor<1x197x3072xf32>
-    %977 = stablehlo.multiply %976, %974 : tensor<1x197x3072xf32>
-    %978 = stablehlo.add %977, %cst_9 : tensor<1x197x3072xf32>
-    %979 = stablehlo.multiply %978, %974 : tensor<1x197x3072xf32>
-    %980 = stablehlo.add %979, %cst_10 : tensor<1x197x3072xf32>
-    %981 = stablehlo.multiply %980, %974 : tensor<1x197x3072xf32>
-    %982 = stablehlo.add %981, %cst_11 : tensor<1x197x3072xf32>
-    %983 = stablehlo.multiply %982, %974 : tensor<1x197x3072xf32>
-    %984 = stablehlo.add %983, %cst_12 : tensor<1x197x3072xf32>
-    %985 = stablehlo.multiply %984, %974 : tensor<1x197x3072xf32>
-    %986 = stablehlo.add %985, %cst_13 : tensor<1x197x3072xf32>
-    %987 = stablehlo.multiply %cst_14, %974 : tensor<1x197x3072xf32>
-    %988 = stablehlo.add %987, %cst_15 : tensor<1x197x3072xf32>
-    %989 = stablehlo.multiply %988, %974 : tensor<1x197x3072xf32>
-    %990 = stablehlo.add %989, %cst_16 : tensor<1x197x3072xf32>
-    %991 = stablehlo.multiply %990, %974 : tensor<1x197x3072xf32>
-    %992 = stablehlo.add %991, %cst_17 : tensor<1x197x3072xf32>
-    %993 = stablehlo.multiply %992, %974 : tensor<1x197x3072xf32>
-    %994 = stablehlo.add %993, %cst_18 : tensor<1x197x3072xf32>
-    %995 = stablehlo.multiply %973, %986 : tensor<1x197x3072xf32>
-    %996 = stablehlo.divide %995, %994 : tensor<1x197x3072xf32>
-    %997 = stablehlo.clamp %cst_19, %996, %cst_20 : tensor<1x197x3072xf32>
-    %998 = stablehlo.convert %997 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %999 = stablehlo.add %998, %cst_2 : tensor<1x197x3072xbf16>
-    %1000 = stablehlo.multiply %999, %970 : tensor<1x197x3072xbf16>
-    %1001 = stablehlo.reshape %1000 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %1002 = stablehlo.convert %1001 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %1003 = stablehlo.dot_general %1002, %arg136, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %1004 = stablehlo.broadcast_in_dim %1003, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1005 = stablehlo.multiply %1004, %60 : tensor<197x768xf32>
-    %1006 = stablehlo.broadcast_in_dim %1005, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1007 = stablehlo.broadcast_in_dim %arg137, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1008 = stablehlo.add %1006, %1007 : tensor<197x768xf32>
-    %1009 = stablehlo.convert %1008 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1010 = stablehlo.reshape %1009 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1011 = stablehlo.broadcast_in_dim %arg32, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1012 = stablehlo.broadcast_in_dim %1010, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1013 = stablehlo.multiply %1011, %1012 : tensor<1x197x768xbf16>
-    %1014 = stablehlo.add %1013, %922 : tensor<1x197x768xbf16>
-    %1015 = stablehlo.convert %1014 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1016 = stablehlo.convert %1015 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1017 = stablehlo.reduce(%1016 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1018 = stablehlo.reshape %1017 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1019 = stablehlo.broadcast_in_dim %1018, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1020 = stablehlo.divide %1019, %15 : tensor<1x197x1xf64>
-    %1021 = stablehlo.broadcast_in_dim %1016, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1022 = stablehlo.broadcast_in_dim %1020, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1023 = stablehlo.subtract %1021, %1022 : tensor<1x197x768xf64>
-    %1024 = stablehlo.multiply %1023, %1023 : tensor<1x197x768xf64>
-    %1025 = stablehlo.reduce(%1024 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1026 = stablehlo.reshape %1025 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1027 = stablehlo.broadcast_in_dim %1026, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1028 = stablehlo.divide %1027, %15 : tensor<1x197x1xf64>
-    %1029 = stablehlo.convert %1028 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1030 = stablehlo.reduce(%1015 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1031 = stablehlo.reshape %1030 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1032 = stablehlo.broadcast_in_dim %1031, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1033 = stablehlo.divide %1032, %31 : tensor<1x197x1xf32>
-    %1034 = stablehlo.broadcast_in_dim %1029, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1035 = stablehlo.add %1034, %36 : tensor<1x197x1xf32>
-    %1036 = stablehlo.rsqrt %1035 : tensor<1x197x1xf32>
-    %1037 = stablehlo.broadcast_in_dim %1015, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1038 = stablehlo.broadcast_in_dim %1033, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1039 = stablehlo.subtract %1037, %1038 : tensor<1x197x768xf32>
-    %1040 = stablehlo.broadcast_in_dim %1039, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1041 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1042 = stablehlo.multiply %1040, %1041 : tensor<1x197x768xf32>
-    %1043 = stablehlo.convert %arg33 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1044 = stablehlo.broadcast_in_dim %1042, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1045 = stablehlo.broadcast_in_dim %1043, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1046 = stablehlo.multiply %1044, %1045 : tensor<1x197x768xf32>
-    %1047 = stablehlo.convert %arg34 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1048 = stablehlo.broadcast_in_dim %1046, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1049 = stablehlo.broadcast_in_dim %1047, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1050 = stablehlo.add %1048, %1049 : tensor<1x197x768xf32>
-    %1051 = stablehlo.convert %1050 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1052 = stablehlo.reshape %1051 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1053 = stablehlo.convert %1052 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1054 = stablehlo.dot_general %1053, %arg138, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1055 = stablehlo.broadcast_in_dim %1054, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1056 = stablehlo.multiply %1055, %60 : tensor<197x768xf32>
-    %1057 = stablehlo.broadcast_in_dim %1056, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1058 = stablehlo.broadcast_in_dim %arg139, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1059 = stablehlo.add %1057, %1058 : tensor<197x768xf32>
-    %1060 = stablehlo.convert %1059 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1061 = stablehlo.reshape %1060 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1062 = stablehlo.dot_general %1052, %arg140, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %1063 = stablehlo.reshape %1062 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1064 = stablehlo.reshape %1063 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1065 = stablehlo.transpose %1064, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1066 = stablehlo.dot_general %1053, %arg141, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1067 = stablehlo.broadcast_in_dim %1066, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1068 = stablehlo.multiply %1067, %60 : tensor<197x768xf32>
-    %1069 = stablehlo.broadcast_in_dim %1068, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1070 = stablehlo.broadcast_in_dim %arg142, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1071 = stablehlo.add %1069, %1070 : tensor<197x768xf32>
-    %1072 = stablehlo.convert %1071 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1073 = stablehlo.reshape %1072 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1074 = stablehlo.reshape %1073 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1075 = stablehlo.transpose %1074, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1076 = stablehlo.reshape %1061 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1077 = stablehlo.transpose %1076, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1078 = stablehlo.transpose %1065, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %1079 = stablehlo.reshape %1077 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1080 = stablehlo.reshape %1078 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1081 = stablehlo.broadcast_in_dim %1080, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1082 = stablehlo.dot_general %1079, %1081, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %1083 = stablehlo.reshape %1082 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1084 = stablehlo.broadcast_in_dim %1083, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1085 = stablehlo.divide %1084, %92 : tensor<1x12x197x197xbf16>
-    %1086 = stablehlo.add %1085, %arg143 : tensor<1x12x197x197xbf16>
-    %1087 = stablehlo.convert %1086 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %1088 = stablehlo.reduce(%1087 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1089 = stablehlo.reshape %1088 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1090 = stablehlo.broadcast_in_dim %1087, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1091 = stablehlo.broadcast_in_dim %1089, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1092 = stablehlo.subtract %1090, %1091 : tensor<1x12x197x197xf32>
-    %1093 = stablehlo.exponential %1092 : tensor<1x12x197x197xf32>
-    %1094 = stablehlo.reduce(%1093 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1095 = stablehlo.reshape %1094 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1096 = stablehlo.broadcast_in_dim %1093, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1097 = stablehlo.broadcast_in_dim %1095, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1098 = stablehlo.divide %1096, %1097 : tensor<1x12x197x197xf32>
-    %1099 = stablehlo.convert %1098 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %1100 = stablehlo.reshape %1099 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %1101 = stablehlo.reshape %1075 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1102 = stablehlo.broadcast_in_dim %1101, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1103 = stablehlo.dot_general %1100, %1102, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1104 = stablehlo.reshape %1103 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1105 = stablehlo.transpose %1104, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %1106 = stablehlo.reshape %1105 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %1107 = stablehlo.reshape %1106 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1108 = stablehlo.convert %1107 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1109 = stablehlo.dot_general %1108, %arg144, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1110 = stablehlo.broadcast_in_dim %1109, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1111 = stablehlo.multiply %1110, %60 : tensor<197x768xf32>
-    %1112 = stablehlo.broadcast_in_dim %1111, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1113 = stablehlo.broadcast_in_dim %arg145, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1114 = stablehlo.add %1112, %1113 : tensor<197x768xf32>
-    %1115 = stablehlo.convert %1114 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1116 = stablehlo.reshape %1115 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1117 = stablehlo.broadcast_in_dim %arg35, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1118 = stablehlo.broadcast_in_dim %1116, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1119 = stablehlo.multiply %1117, %1118 : tensor<1x197x768xbf16>
-    %1120 = stablehlo.add %1119, %1014 : tensor<1x197x768xbf16>
-    %1121 = stablehlo.convert %1120 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1122 = stablehlo.convert %1121 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1123 = stablehlo.reduce(%1122 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1124 = stablehlo.reshape %1123 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1125 = stablehlo.broadcast_in_dim %1124, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1126 = stablehlo.divide %1125, %15 : tensor<1x197x1xf64>
-    %1127 = stablehlo.broadcast_in_dim %1122, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1128 = stablehlo.broadcast_in_dim %1126, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1129 = stablehlo.subtract %1127, %1128 : tensor<1x197x768xf64>
-    %1130 = stablehlo.multiply %1129, %1129 : tensor<1x197x768xf64>
-    %1131 = stablehlo.reduce(%1130 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1132 = stablehlo.reshape %1131 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1133 = stablehlo.broadcast_in_dim %1132, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1134 = stablehlo.divide %1133, %15 : tensor<1x197x1xf64>
-    %1135 = stablehlo.convert %1134 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1136 = stablehlo.reduce(%1121 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1137 = stablehlo.reshape %1136 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1138 = stablehlo.broadcast_in_dim %1137, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1139 = stablehlo.divide %1138, %31 : tensor<1x197x1xf32>
-    %1140 = stablehlo.broadcast_in_dim %1135, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1141 = stablehlo.add %1140, %36 : tensor<1x197x1xf32>
-    %1142 = stablehlo.rsqrt %1141 : tensor<1x197x1xf32>
-    %1143 = stablehlo.broadcast_in_dim %1121, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1144 = stablehlo.broadcast_in_dim %1139, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1145 = stablehlo.subtract %1143, %1144 : tensor<1x197x768xf32>
-    %1146 = stablehlo.broadcast_in_dim %1145, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1147 = stablehlo.broadcast_in_dim %1142, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1148 = stablehlo.multiply %1146, %1147 : tensor<1x197x768xf32>
-    %1149 = stablehlo.convert %arg36 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1150 = stablehlo.broadcast_in_dim %1148, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1151 = stablehlo.broadcast_in_dim %1149, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1152 = stablehlo.multiply %1150, %1151 : tensor<1x197x768xf32>
-    %1153 = stablehlo.convert %arg37 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1154 = stablehlo.broadcast_in_dim %1152, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1155 = stablehlo.broadcast_in_dim %1153, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1156 = stablehlo.add %1154, %1155 : tensor<1x197x768xf32>
-    %1157 = stablehlo.convert %1156 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1158 = stablehlo.reshape %1157 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1159 = stablehlo.convert %1158 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1160 = stablehlo.dot_general %1159, %arg146, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %1161 = stablehlo.broadcast_in_dim %1160, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1162 = stablehlo.multiply %1161, %170 : tensor<197x3072xf32>
-    %1163 = stablehlo.broadcast_in_dim %1162, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1164 = stablehlo.broadcast_in_dim %arg147, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %1165 = stablehlo.add %1163, %1164 : tensor<197x3072xf32>
-    %1166 = stablehlo.convert %1165 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %1167 = stablehlo.reshape %1166 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %1168 = stablehlo.multiply %1167, %cst_4 : tensor<1x197x3072xbf16>
-    %1169 = stablehlo.multiply %1167, %178 : tensor<1x197x3072xbf16>
-    %1170 = stablehlo.convert %1169 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %1171 = stablehlo.clamp %cst_5, %1170, %cst_6 : tensor<1x197x3072xf32>
-    %1172 = stablehlo.multiply %1171, %1171 : tensor<1x197x3072xf32>
-    %1173 = stablehlo.multiply %cst_7, %1172 : tensor<1x197x3072xf32>
-    %1174 = stablehlo.add %1173, %cst_8 : tensor<1x197x3072xf32>
-    %1175 = stablehlo.multiply %1174, %1172 : tensor<1x197x3072xf32>
-    %1176 = stablehlo.add %1175, %cst_9 : tensor<1x197x3072xf32>
-    %1177 = stablehlo.multiply %1176, %1172 : tensor<1x197x3072xf32>
-    %1178 = stablehlo.add %1177, %cst_10 : tensor<1x197x3072xf32>
-    %1179 = stablehlo.multiply %1178, %1172 : tensor<1x197x3072xf32>
-    %1180 = stablehlo.add %1179, %cst_11 : tensor<1x197x3072xf32>
-    %1181 = stablehlo.multiply %1180, %1172 : tensor<1x197x3072xf32>
-    %1182 = stablehlo.add %1181, %cst_12 : tensor<1x197x3072xf32>
-    %1183 = stablehlo.multiply %1182, %1172 : tensor<1x197x3072xf32>
-    %1184 = stablehlo.add %1183, %cst_13 : tensor<1x197x3072xf32>
-    %1185 = stablehlo.multiply %cst_14, %1172 : tensor<1x197x3072xf32>
-    %1186 = stablehlo.add %1185, %cst_15 : tensor<1x197x3072xf32>
-    %1187 = stablehlo.multiply %1186, %1172 : tensor<1x197x3072xf32>
-    %1188 = stablehlo.add %1187, %cst_16 : tensor<1x197x3072xf32>
-    %1189 = stablehlo.multiply %1188, %1172 : tensor<1x197x3072xf32>
-    %1190 = stablehlo.add %1189, %cst_17 : tensor<1x197x3072xf32>
-    %1191 = stablehlo.multiply %1190, %1172 : tensor<1x197x3072xf32>
-    %1192 = stablehlo.add %1191, %cst_18 : tensor<1x197x3072xf32>
-    %1193 = stablehlo.multiply %1171, %1184 : tensor<1x197x3072xf32>
-    %1194 = stablehlo.divide %1193, %1192 : tensor<1x197x3072xf32>
-    %1195 = stablehlo.clamp %cst_19, %1194, %cst_20 : tensor<1x197x3072xf32>
-    %1196 = stablehlo.convert %1195 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %1197 = stablehlo.add %1196, %cst_2 : tensor<1x197x3072xbf16>
-    %1198 = stablehlo.multiply %1197, %1168 : tensor<1x197x3072xbf16>
-    %1199 = stablehlo.reshape %1198 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %1200 = stablehlo.convert %1199 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %1201 = stablehlo.dot_general %1200, %arg148, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %1202 = stablehlo.broadcast_in_dim %1201, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1203 = stablehlo.multiply %1202, %60 : tensor<197x768xf32>
-    %1204 = stablehlo.broadcast_in_dim %1203, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1205 = stablehlo.broadcast_in_dim %arg149, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1206 = stablehlo.add %1204, %1205 : tensor<197x768xf32>
-    %1207 = stablehlo.convert %1206 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1208 = stablehlo.reshape %1207 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1209 = stablehlo.broadcast_in_dim %arg38, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1210 = stablehlo.broadcast_in_dim %1208, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1211 = stablehlo.multiply %1209, %1210 : tensor<1x197x768xbf16>
-    %1212 = stablehlo.add %1211, %1120 : tensor<1x197x768xbf16>
-    %1213 = stablehlo.convert %1212 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1214 = stablehlo.convert %1213 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1215 = stablehlo.reduce(%1214 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1216 = stablehlo.reshape %1215 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1217 = stablehlo.broadcast_in_dim %1216, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1218 = stablehlo.divide %1217, %15 : tensor<1x197x1xf64>
-    %1219 = stablehlo.broadcast_in_dim %1214, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1220 = stablehlo.broadcast_in_dim %1218, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1221 = stablehlo.subtract %1219, %1220 : tensor<1x197x768xf64>
-    %1222 = stablehlo.multiply %1221, %1221 : tensor<1x197x768xf64>
-    %1223 = stablehlo.reduce(%1222 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1224 = stablehlo.reshape %1223 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1225 = stablehlo.broadcast_in_dim %1224, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1226 = stablehlo.divide %1225, %15 : tensor<1x197x1xf64>
-    %1227 = stablehlo.convert %1226 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1228 = stablehlo.reduce(%1213 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1229 = stablehlo.reshape %1228 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1230 = stablehlo.broadcast_in_dim %1229, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1231 = stablehlo.divide %1230, %31 : tensor<1x197x1xf32>
-    %1232 = stablehlo.broadcast_in_dim %1227, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1233 = stablehlo.add %1232, %36 : tensor<1x197x1xf32>
-    %1234 = stablehlo.rsqrt %1233 : tensor<1x197x1xf32>
-    %1235 = stablehlo.broadcast_in_dim %1213, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1236 = stablehlo.broadcast_in_dim %1231, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1237 = stablehlo.subtract %1235, %1236 : tensor<1x197x768xf32>
-    %1238 = stablehlo.broadcast_in_dim %1237, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1239 = stablehlo.broadcast_in_dim %1234, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1240 = stablehlo.multiply %1238, %1239 : tensor<1x197x768xf32>
-    %1241 = stablehlo.convert %arg39 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1242 = stablehlo.broadcast_in_dim %1240, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1243 = stablehlo.broadcast_in_dim %1241, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1244 = stablehlo.multiply %1242, %1243 : tensor<1x197x768xf32>
-    %1245 = stablehlo.convert %arg40 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1246 = stablehlo.broadcast_in_dim %1244, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1247 = stablehlo.broadcast_in_dim %1245, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1248 = stablehlo.add %1246, %1247 : tensor<1x197x768xf32>
-    %1249 = stablehlo.convert %1248 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1250 = stablehlo.reshape %1249 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1251 = stablehlo.convert %1250 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1252 = stablehlo.dot_general %1251, %arg150, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1253 = stablehlo.broadcast_in_dim %1252, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1254 = stablehlo.multiply %1253, %60 : tensor<197x768xf32>
-    %1255 = stablehlo.broadcast_in_dim %1254, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1256 = stablehlo.broadcast_in_dim %arg151, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1257 = stablehlo.add %1255, %1256 : tensor<197x768xf32>
-    %1258 = stablehlo.convert %1257 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1259 = stablehlo.reshape %1258 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1260 = stablehlo.dot_general %1250, %arg152, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %1261 = stablehlo.reshape %1260 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1262 = stablehlo.reshape %1261 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1263 = stablehlo.transpose %1262, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1264 = stablehlo.dot_general %1251, %arg153, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1265 = stablehlo.broadcast_in_dim %1264, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1266 = stablehlo.multiply %1265, %60 : tensor<197x768xf32>
-    %1267 = stablehlo.broadcast_in_dim %1266, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1268 = stablehlo.broadcast_in_dim %arg154, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1269 = stablehlo.add %1267, %1268 : tensor<197x768xf32>
-    %1270 = stablehlo.convert %1269 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1271 = stablehlo.reshape %1270 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1272 = stablehlo.reshape %1271 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1273 = stablehlo.transpose %1272, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1274 = stablehlo.reshape %1259 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1275 = stablehlo.transpose %1274, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1276 = stablehlo.transpose %1263, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %1277 = stablehlo.reshape %1275 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1278 = stablehlo.reshape %1276 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1279 = stablehlo.broadcast_in_dim %1278, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1280 = stablehlo.dot_general %1277, %1279, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %1281 = stablehlo.reshape %1280 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1282 = stablehlo.broadcast_in_dim %1281, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1283 = stablehlo.divide %1282, %92 : tensor<1x12x197x197xbf16>
-    %1284 = stablehlo.add %1283, %arg155 : tensor<1x12x197x197xbf16>
-    %1285 = stablehlo.convert %1284 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %1286 = stablehlo.reduce(%1285 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1287 = stablehlo.reshape %1286 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1288 = stablehlo.broadcast_in_dim %1285, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1289 = stablehlo.broadcast_in_dim %1287, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1290 = stablehlo.subtract %1288, %1289 : tensor<1x12x197x197xf32>
-    %1291 = stablehlo.exponential %1290 : tensor<1x12x197x197xf32>
-    %1292 = stablehlo.reduce(%1291 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1293 = stablehlo.reshape %1292 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1294 = stablehlo.broadcast_in_dim %1291, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1295 = stablehlo.broadcast_in_dim %1293, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1296 = stablehlo.divide %1294, %1295 : tensor<1x12x197x197xf32>
-    %1297 = stablehlo.convert %1296 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %1298 = stablehlo.reshape %1297 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %1299 = stablehlo.reshape %1273 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1300 = stablehlo.broadcast_in_dim %1299, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1301 = stablehlo.dot_general %1298, %1300, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1302 = stablehlo.reshape %1301 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1303 = stablehlo.transpose %1302, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %1304 = stablehlo.reshape %1303 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %1305 = stablehlo.reshape %1304 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1306 = stablehlo.convert %1305 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1307 = stablehlo.dot_general %1306, %arg156, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1308 = stablehlo.broadcast_in_dim %1307, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1309 = stablehlo.multiply %1308, %60 : tensor<197x768xf32>
-    %1310 = stablehlo.broadcast_in_dim %1309, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1311 = stablehlo.broadcast_in_dim %arg157, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1312 = stablehlo.add %1310, %1311 : tensor<197x768xf32>
-    %1313 = stablehlo.convert %1312 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1314 = stablehlo.reshape %1313 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1315 = stablehlo.broadcast_in_dim %arg41, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1316 = stablehlo.broadcast_in_dim %1314, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1317 = stablehlo.multiply %1315, %1316 : tensor<1x197x768xbf16>
-    %1318 = stablehlo.add %1317, %1212 : tensor<1x197x768xbf16>
-    %1319 = stablehlo.convert %1318 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1320 = stablehlo.convert %1319 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1321 = stablehlo.reduce(%1320 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1322 = stablehlo.reshape %1321 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1323 = stablehlo.broadcast_in_dim %1322, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1324 = stablehlo.divide %1323, %15 : tensor<1x197x1xf64>
-    %1325 = stablehlo.broadcast_in_dim %1320, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1326 = stablehlo.broadcast_in_dim %1324, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1327 = stablehlo.subtract %1325, %1326 : tensor<1x197x768xf64>
-    %1328 = stablehlo.multiply %1327, %1327 : tensor<1x197x768xf64>
-    %1329 = stablehlo.reduce(%1328 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1330 = stablehlo.reshape %1329 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1331 = stablehlo.broadcast_in_dim %1330, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1332 = stablehlo.divide %1331, %15 : tensor<1x197x1xf64>
-    %1333 = stablehlo.convert %1332 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1334 = stablehlo.reduce(%1319 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1335 = stablehlo.reshape %1334 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1336 = stablehlo.broadcast_in_dim %1335, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1337 = stablehlo.divide %1336, %31 : tensor<1x197x1xf32>
-    %1338 = stablehlo.broadcast_in_dim %1333, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1339 = stablehlo.add %1338, %36 : tensor<1x197x1xf32>
-    %1340 = stablehlo.rsqrt %1339 : tensor<1x197x1xf32>
-    %1341 = stablehlo.broadcast_in_dim %1319, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1342 = stablehlo.broadcast_in_dim %1337, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1343 = stablehlo.subtract %1341, %1342 : tensor<1x197x768xf32>
-    %1344 = stablehlo.broadcast_in_dim %1343, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1345 = stablehlo.broadcast_in_dim %1340, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1346 = stablehlo.multiply %1344, %1345 : tensor<1x197x768xf32>
-    %1347 = stablehlo.convert %arg42 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1348 = stablehlo.broadcast_in_dim %1346, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1349 = stablehlo.broadcast_in_dim %1347, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1350 = stablehlo.multiply %1348, %1349 : tensor<1x197x768xf32>
-    %1351 = stablehlo.convert %arg43 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1352 = stablehlo.broadcast_in_dim %1350, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1353 = stablehlo.broadcast_in_dim %1351, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1354 = stablehlo.add %1352, %1353 : tensor<1x197x768xf32>
-    %1355 = stablehlo.convert %1354 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1356 = stablehlo.reshape %1355 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1357 = stablehlo.convert %1356 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1358 = stablehlo.dot_general %1357, %arg158, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %1359 = stablehlo.broadcast_in_dim %1358, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1360 = stablehlo.multiply %1359, %170 : tensor<197x3072xf32>
-    %1361 = stablehlo.broadcast_in_dim %1360, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1362 = stablehlo.broadcast_in_dim %arg159, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %1363 = stablehlo.add %1361, %1362 : tensor<197x3072xf32>
-    %1364 = stablehlo.convert %1363 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %1365 = stablehlo.reshape %1364 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %1366 = stablehlo.multiply %1365, %cst_4 : tensor<1x197x3072xbf16>
-    %1367 = stablehlo.multiply %1365, %178 : tensor<1x197x3072xbf16>
-    %1368 = stablehlo.convert %1367 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %1369 = stablehlo.clamp %cst_5, %1368, %cst_6 : tensor<1x197x3072xf32>
-    %1370 = stablehlo.multiply %1369, %1369 : tensor<1x197x3072xf32>
-    %1371 = stablehlo.multiply %cst_7, %1370 : tensor<1x197x3072xf32>
-    %1372 = stablehlo.add %1371, %cst_8 : tensor<1x197x3072xf32>
-    %1373 = stablehlo.multiply %1372, %1370 : tensor<1x197x3072xf32>
-    %1374 = stablehlo.add %1373, %cst_9 : tensor<1x197x3072xf32>
-    %1375 = stablehlo.multiply %1374, %1370 : tensor<1x197x3072xf32>
-    %1376 = stablehlo.add %1375, %cst_10 : tensor<1x197x3072xf32>
-    %1377 = stablehlo.multiply %1376, %1370 : tensor<1x197x3072xf32>
-    %1378 = stablehlo.add %1377, %cst_11 : tensor<1x197x3072xf32>
-    %1379 = stablehlo.multiply %1378, %1370 : tensor<1x197x3072xf32>
-    %1380 = stablehlo.add %1379, %cst_12 : tensor<1x197x3072xf32>
-    %1381 = stablehlo.multiply %1380, %1370 : tensor<1x197x3072xf32>
-    %1382 = stablehlo.add %1381, %cst_13 : tensor<1x197x3072xf32>
-    %1383 = stablehlo.multiply %cst_14, %1370 : tensor<1x197x3072xf32>
-    %1384 = stablehlo.add %1383, %cst_15 : tensor<1x197x3072xf32>
-    %1385 = stablehlo.multiply %1384, %1370 : tensor<1x197x3072xf32>
-    %1386 = stablehlo.add %1385, %cst_16 : tensor<1x197x3072xf32>
-    %1387 = stablehlo.multiply %1386, %1370 : tensor<1x197x3072xf32>
-    %1388 = stablehlo.add %1387, %cst_17 : tensor<1x197x3072xf32>
-    %1389 = stablehlo.multiply %1388, %1370 : tensor<1x197x3072xf32>
-    %1390 = stablehlo.add %1389, %cst_18 : tensor<1x197x3072xf32>
-    %1391 = stablehlo.multiply %1369, %1382 : tensor<1x197x3072xf32>
-    %1392 = stablehlo.divide %1391, %1390 : tensor<1x197x3072xf32>
-    %1393 = stablehlo.clamp %cst_19, %1392, %cst_20 : tensor<1x197x3072xf32>
-    %1394 = stablehlo.convert %1393 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %1395 = stablehlo.add %1394, %cst_2 : tensor<1x197x3072xbf16>
-    %1396 = stablehlo.multiply %1395, %1366 : tensor<1x197x3072xbf16>
-    %1397 = stablehlo.reshape %1396 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %1398 = stablehlo.convert %1397 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %1399 = stablehlo.dot_general %1398, %arg160, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %1400 = stablehlo.broadcast_in_dim %1399, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1401 = stablehlo.multiply %1400, %60 : tensor<197x768xf32>
-    %1402 = stablehlo.broadcast_in_dim %1401, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1403 = stablehlo.broadcast_in_dim %arg161, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1404 = stablehlo.add %1402, %1403 : tensor<197x768xf32>
-    %1405 = stablehlo.convert %1404 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1406 = stablehlo.reshape %1405 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1407 = stablehlo.broadcast_in_dim %arg44, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1408 = stablehlo.broadcast_in_dim %1406, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1409 = stablehlo.multiply %1407, %1408 : tensor<1x197x768xbf16>
-    %1410 = stablehlo.add %1409, %1318 : tensor<1x197x768xbf16>
-    %1411 = stablehlo.convert %1410 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1412 = stablehlo.convert %1411 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1413 = stablehlo.reduce(%1412 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1414 = stablehlo.reshape %1413 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1415 = stablehlo.broadcast_in_dim %1414, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1416 = stablehlo.divide %1415, %15 : tensor<1x197x1xf64>
-    %1417 = stablehlo.broadcast_in_dim %1412, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1418 = stablehlo.broadcast_in_dim %1416, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1419 = stablehlo.subtract %1417, %1418 : tensor<1x197x768xf64>
-    %1420 = stablehlo.multiply %1419, %1419 : tensor<1x197x768xf64>
-    %1421 = stablehlo.reduce(%1420 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1422 = stablehlo.reshape %1421 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1423 = stablehlo.broadcast_in_dim %1422, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1424 = stablehlo.divide %1423, %15 : tensor<1x197x1xf64>
-    %1425 = stablehlo.convert %1424 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1426 = stablehlo.reduce(%1411 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1427 = stablehlo.reshape %1426 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1428 = stablehlo.broadcast_in_dim %1427, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1429 = stablehlo.divide %1428, %31 : tensor<1x197x1xf32>
-    %1430 = stablehlo.broadcast_in_dim %1425, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1431 = stablehlo.add %1430, %36 : tensor<1x197x1xf32>
-    %1432 = stablehlo.rsqrt %1431 : tensor<1x197x1xf32>
-    %1433 = stablehlo.broadcast_in_dim %1411, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1434 = stablehlo.broadcast_in_dim %1429, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1435 = stablehlo.subtract %1433, %1434 : tensor<1x197x768xf32>
-    %1436 = stablehlo.broadcast_in_dim %1435, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1437 = stablehlo.broadcast_in_dim %1432, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1438 = stablehlo.multiply %1436, %1437 : tensor<1x197x768xf32>
-    %1439 = stablehlo.convert %arg45 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1440 = stablehlo.broadcast_in_dim %1438, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1441 = stablehlo.broadcast_in_dim %1439, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1442 = stablehlo.multiply %1440, %1441 : tensor<1x197x768xf32>
-    %1443 = stablehlo.convert %arg46 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1444 = stablehlo.broadcast_in_dim %1442, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1445 = stablehlo.broadcast_in_dim %1443, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1446 = stablehlo.add %1444, %1445 : tensor<1x197x768xf32>
-    %1447 = stablehlo.convert %1446 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1448 = stablehlo.reshape %1447 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1449 = stablehlo.convert %1448 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1450 = stablehlo.dot_general %1449, %arg162, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1451 = stablehlo.broadcast_in_dim %1450, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1452 = stablehlo.multiply %1451, %60 : tensor<197x768xf32>
-    %1453 = stablehlo.broadcast_in_dim %1452, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1454 = stablehlo.broadcast_in_dim %arg163, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1455 = stablehlo.add %1453, %1454 : tensor<197x768xf32>
-    %1456 = stablehlo.convert %1455 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1457 = stablehlo.reshape %1456 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1458 = stablehlo.dot_general %1448, %arg164, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %1459 = stablehlo.reshape %1458 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1460 = stablehlo.reshape %1459 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1461 = stablehlo.transpose %1460, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1462 = stablehlo.dot_general %1449, %arg165, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1463 = stablehlo.broadcast_in_dim %1462, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1464 = stablehlo.multiply %1463, %60 : tensor<197x768xf32>
-    %1465 = stablehlo.broadcast_in_dim %1464, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1466 = stablehlo.broadcast_in_dim %arg166, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1467 = stablehlo.add %1465, %1466 : tensor<197x768xf32>
-    %1468 = stablehlo.convert %1467 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1469 = stablehlo.reshape %1468 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1470 = stablehlo.reshape %1469 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1471 = stablehlo.transpose %1470, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1472 = stablehlo.reshape %1457 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1473 = stablehlo.transpose %1472, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1474 = stablehlo.transpose %1461, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %1475 = stablehlo.reshape %1473 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1476 = stablehlo.reshape %1474 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1477 = stablehlo.broadcast_in_dim %1476, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1478 = stablehlo.dot_general %1475, %1477, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %1479 = stablehlo.reshape %1478 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1480 = stablehlo.broadcast_in_dim %1479, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1481 = stablehlo.divide %1480, %92 : tensor<1x12x197x197xbf16>
-    %1482 = stablehlo.add %1481, %arg167 : tensor<1x12x197x197xbf16>
-    %1483 = stablehlo.convert %1482 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %1484 = stablehlo.reduce(%1483 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1485 = stablehlo.reshape %1484 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1486 = stablehlo.broadcast_in_dim %1483, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1487 = stablehlo.broadcast_in_dim %1485, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1488 = stablehlo.subtract %1486, %1487 : tensor<1x12x197x197xf32>
-    %1489 = stablehlo.exponential %1488 : tensor<1x12x197x197xf32>
-    %1490 = stablehlo.reduce(%1489 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1491 = stablehlo.reshape %1490 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1492 = stablehlo.broadcast_in_dim %1489, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1493 = stablehlo.broadcast_in_dim %1491, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1494 = stablehlo.divide %1492, %1493 : tensor<1x12x197x197xf32>
-    %1495 = stablehlo.convert %1494 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %1496 = stablehlo.reshape %1495 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %1497 = stablehlo.reshape %1471 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1498 = stablehlo.broadcast_in_dim %1497, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1499 = stablehlo.dot_general %1496, %1498, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1500 = stablehlo.reshape %1499 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1501 = stablehlo.transpose %1500, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %1502 = stablehlo.reshape %1501 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %1503 = stablehlo.reshape %1502 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1504 = stablehlo.convert %1503 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1505 = stablehlo.dot_general %1504, %arg168, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1506 = stablehlo.broadcast_in_dim %1505, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1507 = stablehlo.multiply %1506, %60 : tensor<197x768xf32>
-    %1508 = stablehlo.broadcast_in_dim %1507, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1509 = stablehlo.broadcast_in_dim %arg169, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1510 = stablehlo.add %1508, %1509 : tensor<197x768xf32>
-    %1511 = stablehlo.convert %1510 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1512 = stablehlo.reshape %1511 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1513 = stablehlo.broadcast_in_dim %arg47, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1514 = stablehlo.broadcast_in_dim %1512, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1515 = stablehlo.multiply %1513, %1514 : tensor<1x197x768xbf16>
-    %1516 = stablehlo.add %1515, %1410 : tensor<1x197x768xbf16>
-    %1517 = stablehlo.convert %1516 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1518 = stablehlo.convert %1517 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1519 = stablehlo.reduce(%1518 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1520 = stablehlo.reshape %1519 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1521 = stablehlo.broadcast_in_dim %1520, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1522 = stablehlo.divide %1521, %15 : tensor<1x197x1xf64>
-    %1523 = stablehlo.broadcast_in_dim %1518, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1524 = stablehlo.broadcast_in_dim %1522, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1525 = stablehlo.subtract %1523, %1524 : tensor<1x197x768xf64>
-    %1526 = stablehlo.multiply %1525, %1525 : tensor<1x197x768xf64>
-    %1527 = stablehlo.reduce(%1526 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1528 = stablehlo.reshape %1527 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1529 = stablehlo.broadcast_in_dim %1528, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1530 = stablehlo.divide %1529, %15 : tensor<1x197x1xf64>
-    %1531 = stablehlo.convert %1530 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1532 = stablehlo.reduce(%1517 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1533 = stablehlo.reshape %1532 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1534 = stablehlo.broadcast_in_dim %1533, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1535 = stablehlo.divide %1534, %31 : tensor<1x197x1xf32>
-    %1536 = stablehlo.broadcast_in_dim %1531, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1537 = stablehlo.add %1536, %36 : tensor<1x197x1xf32>
-    %1538 = stablehlo.rsqrt %1537 : tensor<1x197x1xf32>
-    %1539 = stablehlo.broadcast_in_dim %1517, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1540 = stablehlo.broadcast_in_dim %1535, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1541 = stablehlo.subtract %1539, %1540 : tensor<1x197x768xf32>
-    %1542 = stablehlo.broadcast_in_dim %1541, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1543 = stablehlo.broadcast_in_dim %1538, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1544 = stablehlo.multiply %1542, %1543 : tensor<1x197x768xf32>
-    %1545 = stablehlo.convert %arg48 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1546 = stablehlo.broadcast_in_dim %1544, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1547 = stablehlo.broadcast_in_dim %1545, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1548 = stablehlo.multiply %1546, %1547 : tensor<1x197x768xf32>
-    %1549 = stablehlo.convert %arg49 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1550 = stablehlo.broadcast_in_dim %1548, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1551 = stablehlo.broadcast_in_dim %1549, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1552 = stablehlo.add %1550, %1551 : tensor<1x197x768xf32>
-    %1553 = stablehlo.convert %1552 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1554 = stablehlo.reshape %1553 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1555 = stablehlo.convert %1554 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1556 = stablehlo.dot_general %1555, %arg170, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %1557 = stablehlo.broadcast_in_dim %1556, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1558 = stablehlo.multiply %1557, %170 : tensor<197x3072xf32>
-    %1559 = stablehlo.broadcast_in_dim %1558, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1560 = stablehlo.broadcast_in_dim %arg171, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %1561 = stablehlo.add %1559, %1560 : tensor<197x3072xf32>
-    %1562 = stablehlo.convert %1561 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %1563 = stablehlo.reshape %1562 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %1564 = stablehlo.multiply %1563, %cst_4 : tensor<1x197x3072xbf16>
-    %1565 = stablehlo.multiply %1563, %178 : tensor<1x197x3072xbf16>
-    %1566 = stablehlo.convert %1565 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %1567 = stablehlo.clamp %cst_5, %1566, %cst_6 : tensor<1x197x3072xf32>
-    %1568 = stablehlo.multiply %1567, %1567 : tensor<1x197x3072xf32>
-    %1569 = stablehlo.multiply %cst_7, %1568 : tensor<1x197x3072xf32>
-    %1570 = stablehlo.add %1569, %cst_8 : tensor<1x197x3072xf32>
-    %1571 = stablehlo.multiply %1570, %1568 : tensor<1x197x3072xf32>
-    %1572 = stablehlo.add %1571, %cst_9 : tensor<1x197x3072xf32>
-    %1573 = stablehlo.multiply %1572, %1568 : tensor<1x197x3072xf32>
-    %1574 = stablehlo.add %1573, %cst_10 : tensor<1x197x3072xf32>
-    %1575 = stablehlo.multiply %1574, %1568 : tensor<1x197x3072xf32>
-    %1576 = stablehlo.add %1575, %cst_11 : tensor<1x197x3072xf32>
-    %1577 = stablehlo.multiply %1576, %1568 : tensor<1x197x3072xf32>
-    %1578 = stablehlo.add %1577, %cst_12 : tensor<1x197x3072xf32>
-    %1579 = stablehlo.multiply %1578, %1568 : tensor<1x197x3072xf32>
-    %1580 = stablehlo.add %1579, %cst_13 : tensor<1x197x3072xf32>
-    %1581 = stablehlo.multiply %cst_14, %1568 : tensor<1x197x3072xf32>
-    %1582 = stablehlo.add %1581, %cst_15 : tensor<1x197x3072xf32>
-    %1583 = stablehlo.multiply %1582, %1568 : tensor<1x197x3072xf32>
-    %1584 = stablehlo.add %1583, %cst_16 : tensor<1x197x3072xf32>
-    %1585 = stablehlo.multiply %1584, %1568 : tensor<1x197x3072xf32>
-    %1586 = stablehlo.add %1585, %cst_17 : tensor<1x197x3072xf32>
-    %1587 = stablehlo.multiply %1586, %1568 : tensor<1x197x3072xf32>
-    %1588 = stablehlo.add %1587, %cst_18 : tensor<1x197x3072xf32>
-    %1589 = stablehlo.multiply %1567, %1580 : tensor<1x197x3072xf32>
-    %1590 = stablehlo.divide %1589, %1588 : tensor<1x197x3072xf32>
-    %1591 = stablehlo.clamp %cst_19, %1590, %cst_20 : tensor<1x197x3072xf32>
-    %1592 = stablehlo.convert %1591 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %1593 = stablehlo.add %1592, %cst_2 : tensor<1x197x3072xbf16>
-    %1594 = stablehlo.multiply %1593, %1564 : tensor<1x197x3072xbf16>
-    %1595 = stablehlo.reshape %1594 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %1596 = stablehlo.convert %1595 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %1597 = stablehlo.dot_general %1596, %arg172, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %1598 = stablehlo.broadcast_in_dim %1597, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1599 = stablehlo.multiply %1598, %60 : tensor<197x768xf32>
-    %1600 = stablehlo.broadcast_in_dim %1599, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1601 = stablehlo.broadcast_in_dim %arg173, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1602 = stablehlo.add %1600, %1601 : tensor<197x768xf32>
-    %1603 = stablehlo.convert %1602 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1604 = stablehlo.reshape %1603 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1605 = stablehlo.broadcast_in_dim %arg50, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1606 = stablehlo.broadcast_in_dim %1604, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1607 = stablehlo.multiply %1605, %1606 : tensor<1x197x768xbf16>
-    %1608 = stablehlo.add %1607, %1516 : tensor<1x197x768xbf16>
-    %1609 = stablehlo.convert %1608 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1610 = stablehlo.convert %1609 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1611 = stablehlo.reduce(%1610 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1612 = stablehlo.reshape %1611 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1613 = stablehlo.broadcast_in_dim %1612, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1614 = stablehlo.divide %1613, %15 : tensor<1x197x1xf64>
-    %1615 = stablehlo.broadcast_in_dim %1610, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1616 = stablehlo.broadcast_in_dim %1614, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1617 = stablehlo.subtract %1615, %1616 : tensor<1x197x768xf64>
-    %1618 = stablehlo.multiply %1617, %1617 : tensor<1x197x768xf64>
-    %1619 = stablehlo.reduce(%1618 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1620 = stablehlo.reshape %1619 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1621 = stablehlo.broadcast_in_dim %1620, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1622 = stablehlo.divide %1621, %15 : tensor<1x197x1xf64>
-    %1623 = stablehlo.convert %1622 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1624 = stablehlo.reduce(%1609 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1625 = stablehlo.reshape %1624 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1626 = stablehlo.broadcast_in_dim %1625, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1627 = stablehlo.divide %1626, %31 : tensor<1x197x1xf32>
-    %1628 = stablehlo.broadcast_in_dim %1623, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1629 = stablehlo.add %1628, %36 : tensor<1x197x1xf32>
-    %1630 = stablehlo.rsqrt %1629 : tensor<1x197x1xf32>
-    %1631 = stablehlo.broadcast_in_dim %1609, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1632 = stablehlo.broadcast_in_dim %1627, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1633 = stablehlo.subtract %1631, %1632 : tensor<1x197x768xf32>
-    %1634 = stablehlo.broadcast_in_dim %1633, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1635 = stablehlo.broadcast_in_dim %1630, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1636 = stablehlo.multiply %1634, %1635 : tensor<1x197x768xf32>
-    %1637 = stablehlo.convert %arg51 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1638 = stablehlo.broadcast_in_dim %1636, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1639 = stablehlo.broadcast_in_dim %1637, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1640 = stablehlo.multiply %1638, %1639 : tensor<1x197x768xf32>
-    %1641 = stablehlo.convert %arg52 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1642 = stablehlo.broadcast_in_dim %1640, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1643 = stablehlo.broadcast_in_dim %1641, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1644 = stablehlo.add %1642, %1643 : tensor<1x197x768xf32>
-    %1645 = stablehlo.convert %1644 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1646 = stablehlo.reshape %1645 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1647 = stablehlo.convert %1646 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1648 = stablehlo.dot_general %1647, %arg174, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1649 = stablehlo.broadcast_in_dim %1648, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1650 = stablehlo.multiply %1649, %60 : tensor<197x768xf32>
-    %1651 = stablehlo.broadcast_in_dim %1650, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1652 = stablehlo.broadcast_in_dim %arg175, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1653 = stablehlo.add %1651, %1652 : tensor<197x768xf32>
-    %1654 = stablehlo.convert %1653 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1655 = stablehlo.reshape %1654 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1656 = stablehlo.dot_general %1646, %arg176, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %1657 = stablehlo.reshape %1656 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1658 = stablehlo.reshape %1657 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1659 = stablehlo.transpose %1658, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1660 = stablehlo.dot_general %1647, %arg177, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1661 = stablehlo.broadcast_in_dim %1660, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1662 = stablehlo.multiply %1661, %60 : tensor<197x768xf32>
-    %1663 = stablehlo.broadcast_in_dim %1662, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1664 = stablehlo.broadcast_in_dim %arg178, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1665 = stablehlo.add %1663, %1664 : tensor<197x768xf32>
-    %1666 = stablehlo.convert %1665 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1667 = stablehlo.reshape %1666 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1668 = stablehlo.reshape %1667 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1669 = stablehlo.transpose %1668, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1670 = stablehlo.reshape %1655 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1671 = stablehlo.transpose %1670, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1672 = stablehlo.transpose %1659, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %1673 = stablehlo.reshape %1671 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1674 = stablehlo.reshape %1672 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1675 = stablehlo.broadcast_in_dim %1674, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1676 = stablehlo.dot_general %1673, %1675, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %1677 = stablehlo.reshape %1676 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1678 = stablehlo.broadcast_in_dim %1677, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1679 = stablehlo.divide %1678, %92 : tensor<1x12x197x197xbf16>
-    %1680 = stablehlo.add %1679, %arg179 : tensor<1x12x197x197xbf16>
-    %1681 = stablehlo.convert %1680 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %1682 = stablehlo.reduce(%1681 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1683 = stablehlo.reshape %1682 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1684 = stablehlo.broadcast_in_dim %1681, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1685 = stablehlo.broadcast_in_dim %1683, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1686 = stablehlo.subtract %1684, %1685 : tensor<1x12x197x197xf32>
-    %1687 = stablehlo.exponential %1686 : tensor<1x12x197x197xf32>
-    %1688 = stablehlo.reduce(%1687 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1689 = stablehlo.reshape %1688 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1690 = stablehlo.broadcast_in_dim %1687, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1691 = stablehlo.broadcast_in_dim %1689, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1692 = stablehlo.divide %1690, %1691 : tensor<1x12x197x197xf32>
-    %1693 = stablehlo.convert %1692 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %1694 = stablehlo.reshape %1693 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %1695 = stablehlo.reshape %1669 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1696 = stablehlo.broadcast_in_dim %1695, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1697 = stablehlo.dot_general %1694, %1696, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1698 = stablehlo.reshape %1697 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1699 = stablehlo.transpose %1698, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %1700 = stablehlo.reshape %1699 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %1701 = stablehlo.reshape %1700 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1702 = stablehlo.convert %1701 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1703 = stablehlo.dot_general %1702, %arg180, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1704 = stablehlo.broadcast_in_dim %1703, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1705 = stablehlo.multiply %1704, %60 : tensor<197x768xf32>
-    %1706 = stablehlo.broadcast_in_dim %1705, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1707 = stablehlo.broadcast_in_dim %arg181, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1708 = stablehlo.add %1706, %1707 : tensor<197x768xf32>
-    %1709 = stablehlo.convert %1708 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1710 = stablehlo.reshape %1709 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1711 = stablehlo.broadcast_in_dim %arg53, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1712 = stablehlo.broadcast_in_dim %1710, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1713 = stablehlo.multiply %1711, %1712 : tensor<1x197x768xbf16>
-    %1714 = stablehlo.add %1713, %1608 : tensor<1x197x768xbf16>
-    %1715 = stablehlo.convert %1714 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1716 = stablehlo.convert %1715 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1717 = stablehlo.reduce(%1716 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1718 = stablehlo.reshape %1717 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1719 = stablehlo.broadcast_in_dim %1718, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1720 = stablehlo.divide %1719, %15 : tensor<1x197x1xf64>
-    %1721 = stablehlo.broadcast_in_dim %1716, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1722 = stablehlo.broadcast_in_dim %1720, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1723 = stablehlo.subtract %1721, %1722 : tensor<1x197x768xf64>
-    %1724 = stablehlo.multiply %1723, %1723 : tensor<1x197x768xf64>
-    %1725 = stablehlo.reduce(%1724 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1726 = stablehlo.reshape %1725 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1727 = stablehlo.broadcast_in_dim %1726, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1728 = stablehlo.divide %1727, %15 : tensor<1x197x1xf64>
-    %1729 = stablehlo.convert %1728 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1730 = stablehlo.reduce(%1715 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1731 = stablehlo.reshape %1730 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1732 = stablehlo.broadcast_in_dim %1731, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1733 = stablehlo.divide %1732, %31 : tensor<1x197x1xf32>
-    %1734 = stablehlo.broadcast_in_dim %1729, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1735 = stablehlo.add %1734, %36 : tensor<1x197x1xf32>
-    %1736 = stablehlo.rsqrt %1735 : tensor<1x197x1xf32>
-    %1737 = stablehlo.broadcast_in_dim %1715, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1738 = stablehlo.broadcast_in_dim %1733, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1739 = stablehlo.subtract %1737, %1738 : tensor<1x197x768xf32>
-    %1740 = stablehlo.broadcast_in_dim %1739, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1741 = stablehlo.broadcast_in_dim %1736, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1742 = stablehlo.multiply %1740, %1741 : tensor<1x197x768xf32>
-    %1743 = stablehlo.convert %arg54 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1744 = stablehlo.broadcast_in_dim %1742, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1745 = stablehlo.broadcast_in_dim %1743, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1746 = stablehlo.multiply %1744, %1745 : tensor<1x197x768xf32>
-    %1747 = stablehlo.convert %arg55 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1748 = stablehlo.broadcast_in_dim %1746, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1749 = stablehlo.broadcast_in_dim %1747, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1750 = stablehlo.add %1748, %1749 : tensor<1x197x768xf32>
-    %1751 = stablehlo.convert %1750 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1752 = stablehlo.reshape %1751 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1753 = stablehlo.convert %1752 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1754 = stablehlo.dot_general %1753, %arg182, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %1755 = stablehlo.broadcast_in_dim %1754, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1756 = stablehlo.multiply %1755, %170 : tensor<197x3072xf32>
-    %1757 = stablehlo.broadcast_in_dim %1756, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1758 = stablehlo.broadcast_in_dim %arg183, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %1759 = stablehlo.add %1757, %1758 : tensor<197x3072xf32>
-    %1760 = stablehlo.convert %1759 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %1761 = stablehlo.reshape %1760 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %1762 = stablehlo.multiply %1761, %cst_4 : tensor<1x197x3072xbf16>
-    %1763 = stablehlo.multiply %1761, %178 : tensor<1x197x3072xbf16>
-    %1764 = stablehlo.convert %1763 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %1765 = stablehlo.clamp %cst_5, %1764, %cst_6 : tensor<1x197x3072xf32>
-    %1766 = stablehlo.multiply %1765, %1765 : tensor<1x197x3072xf32>
-    %1767 = stablehlo.multiply %cst_7, %1766 : tensor<1x197x3072xf32>
-    %1768 = stablehlo.add %1767, %cst_8 : tensor<1x197x3072xf32>
-    %1769 = stablehlo.multiply %1768, %1766 : tensor<1x197x3072xf32>
-    %1770 = stablehlo.add %1769, %cst_9 : tensor<1x197x3072xf32>
-    %1771 = stablehlo.multiply %1770, %1766 : tensor<1x197x3072xf32>
-    %1772 = stablehlo.add %1771, %cst_10 : tensor<1x197x3072xf32>
-    %1773 = stablehlo.multiply %1772, %1766 : tensor<1x197x3072xf32>
-    %1774 = stablehlo.add %1773, %cst_11 : tensor<1x197x3072xf32>
-    %1775 = stablehlo.multiply %1774, %1766 : tensor<1x197x3072xf32>
-    %1776 = stablehlo.add %1775, %cst_12 : tensor<1x197x3072xf32>
-    %1777 = stablehlo.multiply %1776, %1766 : tensor<1x197x3072xf32>
-    %1778 = stablehlo.add %1777, %cst_13 : tensor<1x197x3072xf32>
-    %1779 = stablehlo.multiply %cst_14, %1766 : tensor<1x197x3072xf32>
-    %1780 = stablehlo.add %1779, %cst_15 : tensor<1x197x3072xf32>
-    %1781 = stablehlo.multiply %1780, %1766 : tensor<1x197x3072xf32>
-    %1782 = stablehlo.add %1781, %cst_16 : tensor<1x197x3072xf32>
-    %1783 = stablehlo.multiply %1782, %1766 : tensor<1x197x3072xf32>
-    %1784 = stablehlo.add %1783, %cst_17 : tensor<1x197x3072xf32>
-    %1785 = stablehlo.multiply %1784, %1766 : tensor<1x197x3072xf32>
-    %1786 = stablehlo.add %1785, %cst_18 : tensor<1x197x3072xf32>
-    %1787 = stablehlo.multiply %1765, %1778 : tensor<1x197x3072xf32>
-    %1788 = stablehlo.divide %1787, %1786 : tensor<1x197x3072xf32>
-    %1789 = stablehlo.clamp %cst_19, %1788, %cst_20 : tensor<1x197x3072xf32>
-    %1790 = stablehlo.convert %1789 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %1791 = stablehlo.add %1790, %cst_2 : tensor<1x197x3072xbf16>
-    %1792 = stablehlo.multiply %1791, %1762 : tensor<1x197x3072xbf16>
-    %1793 = stablehlo.reshape %1792 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %1794 = stablehlo.convert %1793 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %1795 = stablehlo.dot_general %1794, %arg184, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %1796 = stablehlo.broadcast_in_dim %1795, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1797 = stablehlo.multiply %1796, %60 : tensor<197x768xf32>
-    %1798 = stablehlo.broadcast_in_dim %1797, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1799 = stablehlo.broadcast_in_dim %arg185, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1800 = stablehlo.add %1798, %1799 : tensor<197x768xf32>
-    %1801 = stablehlo.convert %1800 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1802 = stablehlo.reshape %1801 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1803 = stablehlo.broadcast_in_dim %arg56, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1804 = stablehlo.broadcast_in_dim %1802, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1805 = stablehlo.multiply %1803, %1804 : tensor<1x197x768xbf16>
-    %1806 = stablehlo.add %1805, %1714 : tensor<1x197x768xbf16>
-    %1807 = stablehlo.convert %1806 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1808 = stablehlo.convert %1807 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1809 = stablehlo.reduce(%1808 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1810 = stablehlo.reshape %1809 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1811 = stablehlo.broadcast_in_dim %1810, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1812 = stablehlo.divide %1811, %15 : tensor<1x197x1xf64>
-    %1813 = stablehlo.broadcast_in_dim %1808, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1814 = stablehlo.broadcast_in_dim %1812, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1815 = stablehlo.subtract %1813, %1814 : tensor<1x197x768xf64>
-    %1816 = stablehlo.multiply %1815, %1815 : tensor<1x197x768xf64>
-    %1817 = stablehlo.reduce(%1816 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1818 = stablehlo.reshape %1817 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1819 = stablehlo.broadcast_in_dim %1818, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1820 = stablehlo.divide %1819, %15 : tensor<1x197x1xf64>
-    %1821 = stablehlo.convert %1820 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1822 = stablehlo.reduce(%1807 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1823 = stablehlo.reshape %1822 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1824 = stablehlo.broadcast_in_dim %1823, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1825 = stablehlo.divide %1824, %31 : tensor<1x197x1xf32>
-    %1826 = stablehlo.broadcast_in_dim %1821, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1827 = stablehlo.add %1826, %36 : tensor<1x197x1xf32>
-    %1828 = stablehlo.rsqrt %1827 : tensor<1x197x1xf32>
-    %1829 = stablehlo.broadcast_in_dim %1807, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1830 = stablehlo.broadcast_in_dim %1825, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1831 = stablehlo.subtract %1829, %1830 : tensor<1x197x768xf32>
-    %1832 = stablehlo.broadcast_in_dim %1831, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1833 = stablehlo.broadcast_in_dim %1828, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1834 = stablehlo.multiply %1832, %1833 : tensor<1x197x768xf32>
-    %1835 = stablehlo.convert %arg57 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1836 = stablehlo.broadcast_in_dim %1834, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1837 = stablehlo.broadcast_in_dim %1835, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1838 = stablehlo.multiply %1836, %1837 : tensor<1x197x768xf32>
-    %1839 = stablehlo.convert %arg58 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1840 = stablehlo.broadcast_in_dim %1838, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1841 = stablehlo.broadcast_in_dim %1839, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1842 = stablehlo.add %1840, %1841 : tensor<1x197x768xf32>
-    %1843 = stablehlo.convert %1842 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1844 = stablehlo.reshape %1843 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1845 = stablehlo.convert %1844 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1846 = stablehlo.dot_general %1845, %arg186, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1847 = stablehlo.broadcast_in_dim %1846, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1848 = stablehlo.multiply %1847, %60 : tensor<197x768xf32>
-    %1849 = stablehlo.broadcast_in_dim %1848, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1850 = stablehlo.broadcast_in_dim %arg187, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1851 = stablehlo.add %1849, %1850 : tensor<197x768xf32>
-    %1852 = stablehlo.convert %1851 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1853 = stablehlo.reshape %1852 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1854 = stablehlo.dot_general %1844, %arg188, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %1855 = stablehlo.reshape %1854 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1856 = stablehlo.reshape %1855 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1857 = stablehlo.transpose %1856, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1858 = stablehlo.dot_general %1845, %arg189, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1859 = stablehlo.broadcast_in_dim %1858, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1860 = stablehlo.multiply %1859, %60 : tensor<197x768xf32>
-    %1861 = stablehlo.broadcast_in_dim %1860, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1862 = stablehlo.broadcast_in_dim %arg190, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1863 = stablehlo.add %1861, %1862 : tensor<197x768xf32>
-    %1864 = stablehlo.convert %1863 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1865 = stablehlo.reshape %1864 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1866 = stablehlo.reshape %1865 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1867 = stablehlo.transpose %1866, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1868 = stablehlo.reshape %1853 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %1869 = stablehlo.transpose %1868, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1870 = stablehlo.transpose %1857, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %1871 = stablehlo.reshape %1869 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1872 = stablehlo.reshape %1870 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1873 = stablehlo.broadcast_in_dim %1872, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %1874 = stablehlo.dot_general %1871, %1873, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %1875 = stablehlo.reshape %1874 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1876 = stablehlo.broadcast_in_dim %1875, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %1877 = stablehlo.divide %1876, %92 : tensor<1x12x197x197xbf16>
-    %1878 = stablehlo.add %1877, %arg191 : tensor<1x12x197x197xbf16>
-    %1879 = stablehlo.convert %1878 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %1880 = stablehlo.reduce(%1879 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1881 = stablehlo.reshape %1880 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1882 = stablehlo.broadcast_in_dim %1879, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1883 = stablehlo.broadcast_in_dim %1881, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1884 = stablehlo.subtract %1882, %1883 : tensor<1x12x197x197xf32>
-    %1885 = stablehlo.exponential %1884 : tensor<1x12x197x197xf32>
-    %1886 = stablehlo.reduce(%1885 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %1887 = stablehlo.reshape %1886 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %1888 = stablehlo.broadcast_in_dim %1885, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %1889 = stablehlo.broadcast_in_dim %1887, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %1890 = stablehlo.divide %1888, %1889 : tensor<1x12x197x197xf32>
-    %1891 = stablehlo.convert %1890 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %1892 = stablehlo.reshape %1891 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %1893 = stablehlo.reshape %1867 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1894 = stablehlo.broadcast_in_dim %1893, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1895 = stablehlo.dot_general %1892, %1894, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %1896 = stablehlo.reshape %1895 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %1897 = stablehlo.transpose %1896, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %1898 = stablehlo.reshape %1897 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %1899 = stablehlo.reshape %1898 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1900 = stablehlo.convert %1899 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1901 = stablehlo.dot_general %1900, %arg192, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %1902 = stablehlo.broadcast_in_dim %1901, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1903 = stablehlo.multiply %1902, %60 : tensor<197x768xf32>
-    %1904 = stablehlo.broadcast_in_dim %1903, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1905 = stablehlo.broadcast_in_dim %arg193, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1906 = stablehlo.add %1904, %1905 : tensor<197x768xf32>
-    %1907 = stablehlo.convert %1906 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %1908 = stablehlo.reshape %1907 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1909 = stablehlo.broadcast_in_dim %arg59, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %1910 = stablehlo.broadcast_in_dim %1908, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %1911 = stablehlo.multiply %1909, %1910 : tensor<1x197x768xbf16>
-    %1912 = stablehlo.add %1911, %1806 : tensor<1x197x768xbf16>
-    %1913 = stablehlo.convert %1912 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %1914 = stablehlo.convert %1913 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %1915 = stablehlo.reduce(%1914 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1916 = stablehlo.reshape %1915 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1917 = stablehlo.broadcast_in_dim %1916, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1918 = stablehlo.divide %1917, %15 : tensor<1x197x1xf64>
-    %1919 = stablehlo.broadcast_in_dim %1914, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %1920 = stablehlo.broadcast_in_dim %1918, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %1921 = stablehlo.subtract %1919, %1920 : tensor<1x197x768xf64>
-    %1922 = stablehlo.multiply %1921, %1921 : tensor<1x197x768xf64>
-    %1923 = stablehlo.reduce(%1922 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1924 = stablehlo.reshape %1923 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1925 = stablehlo.broadcast_in_dim %1924, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1926 = stablehlo.divide %1925, %15 : tensor<1x197x1xf64>
-    %1927 = stablehlo.convert %1926 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1928 = stablehlo.reduce(%1913 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1929 = stablehlo.reshape %1928 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1930 = stablehlo.broadcast_in_dim %1929, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1931 = stablehlo.divide %1930, %31 : tensor<1x197x1xf32>
-    %1932 = stablehlo.broadcast_in_dim %1927, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1933 = stablehlo.add %1932, %36 : tensor<1x197x1xf32>
-    %1934 = stablehlo.rsqrt %1933 : tensor<1x197x1xf32>
-    %1935 = stablehlo.broadcast_in_dim %1913, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1936 = stablehlo.broadcast_in_dim %1931, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1937 = stablehlo.subtract %1935, %1936 : tensor<1x197x768xf32>
-    %1938 = stablehlo.broadcast_in_dim %1937, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1939 = stablehlo.broadcast_in_dim %1934, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %1940 = stablehlo.multiply %1938, %1939 : tensor<1x197x768xf32>
-    %1941 = stablehlo.convert %arg60 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1942 = stablehlo.broadcast_in_dim %1940, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1943 = stablehlo.broadcast_in_dim %1941, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1944 = stablehlo.multiply %1942, %1943 : tensor<1x197x768xf32>
-    %1945 = stablehlo.convert %arg61 : (tensor<768xbf16>) -> tensor<768xf32>
-    %1946 = stablehlo.broadcast_in_dim %1944, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %1947 = stablehlo.broadcast_in_dim %1945, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %1948 = stablehlo.add %1946, %1947 : tensor<1x197x768xf32>
-    %1949 = stablehlo.convert %1948 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %1950 = stablehlo.reshape %1949 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %1951 = stablehlo.convert %1950 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %1952 = stablehlo.dot_general %1951, %arg194, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %1953 = stablehlo.broadcast_in_dim %1952, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1954 = stablehlo.multiply %1953, %170 : tensor<197x3072xf32>
-    %1955 = stablehlo.broadcast_in_dim %1954, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %1956 = stablehlo.broadcast_in_dim %arg195, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %1957 = stablehlo.add %1955, %1956 : tensor<197x3072xf32>
-    %1958 = stablehlo.convert %1957 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %1959 = stablehlo.reshape %1958 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %1960 = stablehlo.multiply %1959, %cst_4 : tensor<1x197x3072xbf16>
-    %1961 = stablehlo.multiply %1959, %178 : tensor<1x197x3072xbf16>
-    %1962 = stablehlo.convert %1961 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %1963 = stablehlo.clamp %cst_5, %1962, %cst_6 : tensor<1x197x3072xf32>
-    %1964 = stablehlo.multiply %1963, %1963 : tensor<1x197x3072xf32>
-    %1965 = stablehlo.multiply %cst_7, %1964 : tensor<1x197x3072xf32>
-    %1966 = stablehlo.add %1965, %cst_8 : tensor<1x197x3072xf32>
-    %1967 = stablehlo.multiply %1966, %1964 : tensor<1x197x3072xf32>
-    %1968 = stablehlo.add %1967, %cst_9 : tensor<1x197x3072xf32>
-    %1969 = stablehlo.multiply %1968, %1964 : tensor<1x197x3072xf32>
-    %1970 = stablehlo.add %1969, %cst_10 : tensor<1x197x3072xf32>
-    %1971 = stablehlo.multiply %1970, %1964 : tensor<1x197x3072xf32>
-    %1972 = stablehlo.add %1971, %cst_11 : tensor<1x197x3072xf32>
-    %1973 = stablehlo.multiply %1972, %1964 : tensor<1x197x3072xf32>
-    %1974 = stablehlo.add %1973, %cst_12 : tensor<1x197x3072xf32>
-    %1975 = stablehlo.multiply %1974, %1964 : tensor<1x197x3072xf32>
-    %1976 = stablehlo.add %1975, %cst_13 : tensor<1x197x3072xf32>
-    %1977 = stablehlo.multiply %cst_14, %1964 : tensor<1x197x3072xf32>
-    %1978 = stablehlo.add %1977, %cst_15 : tensor<1x197x3072xf32>
-    %1979 = stablehlo.multiply %1978, %1964 : tensor<1x197x3072xf32>
-    %1980 = stablehlo.add %1979, %cst_16 : tensor<1x197x3072xf32>
-    %1981 = stablehlo.multiply %1980, %1964 : tensor<1x197x3072xf32>
-    %1982 = stablehlo.add %1981, %cst_17 : tensor<1x197x3072xf32>
-    %1983 = stablehlo.multiply %1982, %1964 : tensor<1x197x3072xf32>
-    %1984 = stablehlo.add %1983, %cst_18 : tensor<1x197x3072xf32>
-    %1985 = stablehlo.multiply %1963, %1976 : tensor<1x197x3072xf32>
-    %1986 = stablehlo.divide %1985, %1984 : tensor<1x197x3072xf32>
-    %1987 = stablehlo.clamp %cst_19, %1986, %cst_20 : tensor<1x197x3072xf32>
-    %1988 = stablehlo.convert %1987 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %1989 = stablehlo.add %1988, %cst_2 : tensor<1x197x3072xbf16>
-    %1990 = stablehlo.multiply %1989, %1960 : tensor<1x197x3072xbf16>
-    %1991 = stablehlo.reshape %1990 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %1992 = stablehlo.convert %1991 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %1993 = stablehlo.dot_general %1992, %arg196, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %1994 = stablehlo.broadcast_in_dim %1993, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1995 = stablehlo.multiply %1994, %60 : tensor<197x768xf32>
-    %1996 = stablehlo.broadcast_in_dim %1995, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %1997 = stablehlo.broadcast_in_dim %arg197, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %1998 = stablehlo.add %1996, %1997 : tensor<197x768xf32>
-    %1999 = stablehlo.convert %1998 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2000 = stablehlo.reshape %1999 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2001 = stablehlo.broadcast_in_dim %arg62, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %2002 = stablehlo.broadcast_in_dim %2000, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2003 = stablehlo.multiply %2001, %2002 : tensor<1x197x768xbf16>
-    %2004 = stablehlo.add %2003, %1912 : tensor<1x197x768xbf16>
-    %2005 = stablehlo.convert %2004 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %2006 = stablehlo.convert %2005 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %2007 = stablehlo.reduce(%2006 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2008 = stablehlo.reshape %2007 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2009 = stablehlo.broadcast_in_dim %2008, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2010 = stablehlo.divide %2009, %15 : tensor<1x197x1xf64>
-    %2011 = stablehlo.broadcast_in_dim %2006, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %2012 = stablehlo.broadcast_in_dim %2010, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %2013 = stablehlo.subtract %2011, %2012 : tensor<1x197x768xf64>
-    %2014 = stablehlo.multiply %2013, %2013 : tensor<1x197x768xf64>
-    %2015 = stablehlo.reduce(%2014 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2016 = stablehlo.reshape %2015 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2017 = stablehlo.broadcast_in_dim %2016, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2018 = stablehlo.divide %2017, %15 : tensor<1x197x1xf64>
-    %2019 = stablehlo.convert %2018 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2020 = stablehlo.reduce(%2005 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2021 = stablehlo.reshape %2020 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2022 = stablehlo.broadcast_in_dim %2021, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2023 = stablehlo.divide %2022, %31 : tensor<1x197x1xf32>
-    %2024 = stablehlo.broadcast_in_dim %2019, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2025 = stablehlo.add %2024, %36 : tensor<1x197x1xf32>
-    %2026 = stablehlo.rsqrt %2025 : tensor<1x197x1xf32>
-    %2027 = stablehlo.broadcast_in_dim %2005, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2028 = stablehlo.broadcast_in_dim %2023, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2029 = stablehlo.subtract %2027, %2028 : tensor<1x197x768xf32>
-    %2030 = stablehlo.broadcast_in_dim %2029, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2031 = stablehlo.broadcast_in_dim %2026, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2032 = stablehlo.multiply %2030, %2031 : tensor<1x197x768xf32>
-    %2033 = stablehlo.convert %arg63 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2034 = stablehlo.broadcast_in_dim %2032, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2035 = stablehlo.broadcast_in_dim %2033, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2036 = stablehlo.multiply %2034, %2035 : tensor<1x197x768xf32>
-    %2037 = stablehlo.convert %arg64 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2038 = stablehlo.broadcast_in_dim %2036, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2039 = stablehlo.broadcast_in_dim %2037, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2040 = stablehlo.add %2038, %2039 : tensor<1x197x768xf32>
-    %2041 = stablehlo.convert %2040 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %2042 = stablehlo.reshape %2041 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %2043 = stablehlo.convert %2042 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %2044 = stablehlo.dot_general %2043, %arg198, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %2045 = stablehlo.broadcast_in_dim %2044, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2046 = stablehlo.multiply %2045, %60 : tensor<197x768xf32>
-    %2047 = stablehlo.broadcast_in_dim %2046, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2048 = stablehlo.broadcast_in_dim %arg199, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2049 = stablehlo.add %2047, %2048 : tensor<197x768xf32>
-    %2050 = stablehlo.convert %2049 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2051 = stablehlo.reshape %2050 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2052 = stablehlo.dot_general %2042, %arg200, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %2053 = stablehlo.reshape %2052 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2054 = stablehlo.reshape %2053 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %2055 = stablehlo.transpose %2054, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2056 = stablehlo.dot_general %2043, %arg201, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %2057 = stablehlo.broadcast_in_dim %2056, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2058 = stablehlo.multiply %2057, %60 : tensor<197x768xf32>
-    %2059 = stablehlo.broadcast_in_dim %2058, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2060 = stablehlo.broadcast_in_dim %arg202, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2061 = stablehlo.add %2059, %2060 : tensor<197x768xf32>
-    %2062 = stablehlo.convert %2061 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2063 = stablehlo.reshape %2062 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2064 = stablehlo.reshape %2063 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %2065 = stablehlo.transpose %2064, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2066 = stablehlo.reshape %2051 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %2067 = stablehlo.transpose %2066, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2068 = stablehlo.transpose %2055, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %2069 = stablehlo.reshape %2067 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2070 = stablehlo.reshape %2068 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %2071 = stablehlo.broadcast_in_dim %2070, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %2072 = stablehlo.dot_general %2069, %2071, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %2073 = stablehlo.reshape %2072 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %2074 = stablehlo.broadcast_in_dim %2073, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %2075 = stablehlo.divide %2074, %92 : tensor<1x12x197x197xbf16>
-    %2076 = stablehlo.add %2075, %arg203 : tensor<1x12x197x197xbf16>
-    %2077 = stablehlo.convert %2076 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %2078 = stablehlo.reduce(%2077 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %2079 = stablehlo.reshape %2078 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %2080 = stablehlo.broadcast_in_dim %2077, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %2081 = stablehlo.broadcast_in_dim %2079, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %2082 = stablehlo.subtract %2080, %2081 : tensor<1x12x197x197xf32>
-    %2083 = stablehlo.exponential %2082 : tensor<1x12x197x197xf32>
-    %2084 = stablehlo.reduce(%2083 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %2085 = stablehlo.reshape %2084 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %2086 = stablehlo.broadcast_in_dim %2083, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %2087 = stablehlo.broadcast_in_dim %2085, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %2088 = stablehlo.divide %2086, %2087 : tensor<1x12x197x197xf32>
-    %2089 = stablehlo.convert %2088 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %2090 = stablehlo.reshape %2089 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %2091 = stablehlo.reshape %2065 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2092 = stablehlo.broadcast_in_dim %2091, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2093 = stablehlo.dot_general %2090, %2092, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2094 = stablehlo.reshape %2093 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2095 = stablehlo.transpose %2094, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %2096 = stablehlo.reshape %2095 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %2097 = stablehlo.reshape %2096 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %2098 = stablehlo.convert %2097 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %2099 = stablehlo.dot_general %2098, %arg204, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %2100 = stablehlo.broadcast_in_dim %2099, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2101 = stablehlo.multiply %2100, %60 : tensor<197x768xf32>
-    %2102 = stablehlo.broadcast_in_dim %2101, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2103 = stablehlo.broadcast_in_dim %arg205, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2104 = stablehlo.add %2102, %2103 : tensor<197x768xf32>
-    %2105 = stablehlo.convert %2104 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2106 = stablehlo.reshape %2105 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2107 = stablehlo.broadcast_in_dim %arg65, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %2108 = stablehlo.broadcast_in_dim %2106, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2109 = stablehlo.multiply %2107, %2108 : tensor<1x197x768xbf16>
-    %2110 = stablehlo.add %2109, %2004 : tensor<1x197x768xbf16>
-    %2111 = stablehlo.convert %2110 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %2112 = stablehlo.convert %2111 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %2113 = stablehlo.reduce(%2112 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2114 = stablehlo.reshape %2113 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2115 = stablehlo.broadcast_in_dim %2114, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2116 = stablehlo.divide %2115, %15 : tensor<1x197x1xf64>
-    %2117 = stablehlo.broadcast_in_dim %2112, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %2118 = stablehlo.broadcast_in_dim %2116, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %2119 = stablehlo.subtract %2117, %2118 : tensor<1x197x768xf64>
-    %2120 = stablehlo.multiply %2119, %2119 : tensor<1x197x768xf64>
-    %2121 = stablehlo.reduce(%2120 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2122 = stablehlo.reshape %2121 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2123 = stablehlo.broadcast_in_dim %2122, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2124 = stablehlo.divide %2123, %15 : tensor<1x197x1xf64>
-    %2125 = stablehlo.convert %2124 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2126 = stablehlo.reduce(%2111 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2127 = stablehlo.reshape %2126 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2128 = stablehlo.broadcast_in_dim %2127, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2129 = stablehlo.divide %2128, %31 : tensor<1x197x1xf32>
-    %2130 = stablehlo.broadcast_in_dim %2125, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2131 = stablehlo.add %2130, %36 : tensor<1x197x1xf32>
-    %2132 = stablehlo.rsqrt %2131 : tensor<1x197x1xf32>
-    %2133 = stablehlo.broadcast_in_dim %2111, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2134 = stablehlo.broadcast_in_dim %2129, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2135 = stablehlo.subtract %2133, %2134 : tensor<1x197x768xf32>
-    %2136 = stablehlo.broadcast_in_dim %2135, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2137 = stablehlo.broadcast_in_dim %2132, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2138 = stablehlo.multiply %2136, %2137 : tensor<1x197x768xf32>
-    %2139 = stablehlo.convert %arg66 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2140 = stablehlo.broadcast_in_dim %2138, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2141 = stablehlo.broadcast_in_dim %2139, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2142 = stablehlo.multiply %2140, %2141 : tensor<1x197x768xf32>
-    %2143 = stablehlo.convert %arg67 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2144 = stablehlo.broadcast_in_dim %2142, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2145 = stablehlo.broadcast_in_dim %2143, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2146 = stablehlo.add %2144, %2145 : tensor<1x197x768xf32>
-    %2147 = stablehlo.convert %2146 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %2148 = stablehlo.reshape %2147 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %2149 = stablehlo.convert %2148 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %2150 = stablehlo.dot_general %2149, %arg206, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %2151 = stablehlo.broadcast_in_dim %2150, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %2152 = stablehlo.multiply %2151, %170 : tensor<197x3072xf32>
-    %2153 = stablehlo.broadcast_in_dim %2152, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %2154 = stablehlo.broadcast_in_dim %arg207, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %2155 = stablehlo.add %2153, %2154 : tensor<197x3072xf32>
-    %2156 = stablehlo.convert %2155 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %2157 = stablehlo.reshape %2156 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %2158 = stablehlo.multiply %2157, %cst_4 : tensor<1x197x3072xbf16>
-    %2159 = stablehlo.multiply %2157, %178 : tensor<1x197x3072xbf16>
-    %2160 = stablehlo.convert %2159 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %2161 = stablehlo.clamp %cst_5, %2160, %cst_6 : tensor<1x197x3072xf32>
-    %2162 = stablehlo.multiply %2161, %2161 : tensor<1x197x3072xf32>
-    %2163 = stablehlo.multiply %cst_7, %2162 : tensor<1x197x3072xf32>
-    %2164 = stablehlo.add %2163, %cst_8 : tensor<1x197x3072xf32>
-    %2165 = stablehlo.multiply %2164, %2162 : tensor<1x197x3072xf32>
-    %2166 = stablehlo.add %2165, %cst_9 : tensor<1x197x3072xf32>
-    %2167 = stablehlo.multiply %2166, %2162 : tensor<1x197x3072xf32>
-    %2168 = stablehlo.add %2167, %cst_10 : tensor<1x197x3072xf32>
-    %2169 = stablehlo.multiply %2168, %2162 : tensor<1x197x3072xf32>
-    %2170 = stablehlo.add %2169, %cst_11 : tensor<1x197x3072xf32>
-    %2171 = stablehlo.multiply %2170, %2162 : tensor<1x197x3072xf32>
-    %2172 = stablehlo.add %2171, %cst_12 : tensor<1x197x3072xf32>
-    %2173 = stablehlo.multiply %2172, %2162 : tensor<1x197x3072xf32>
-    %2174 = stablehlo.add %2173, %cst_13 : tensor<1x197x3072xf32>
-    %2175 = stablehlo.multiply %cst_14, %2162 : tensor<1x197x3072xf32>
-    %2176 = stablehlo.add %2175, %cst_15 : tensor<1x197x3072xf32>
-    %2177 = stablehlo.multiply %2176, %2162 : tensor<1x197x3072xf32>
-    %2178 = stablehlo.add %2177, %cst_16 : tensor<1x197x3072xf32>
-    %2179 = stablehlo.multiply %2178, %2162 : tensor<1x197x3072xf32>
-    %2180 = stablehlo.add %2179, %cst_17 : tensor<1x197x3072xf32>
-    %2181 = stablehlo.multiply %2180, %2162 : tensor<1x197x3072xf32>
-    %2182 = stablehlo.add %2181, %cst_18 : tensor<1x197x3072xf32>
-    %2183 = stablehlo.multiply %2161, %2174 : tensor<1x197x3072xf32>
-    %2184 = stablehlo.divide %2183, %2182 : tensor<1x197x3072xf32>
-    %2185 = stablehlo.clamp %cst_19, %2184, %cst_20 : tensor<1x197x3072xf32>
-    %2186 = stablehlo.convert %2185 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %2187 = stablehlo.add %2186, %cst_2 : tensor<1x197x3072xbf16>
-    %2188 = stablehlo.multiply %2187, %2158 : tensor<1x197x3072xbf16>
-    %2189 = stablehlo.reshape %2188 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %2190 = stablehlo.convert %2189 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %2191 = stablehlo.dot_general %2190, %arg208, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %2192 = stablehlo.broadcast_in_dim %2191, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2193 = stablehlo.multiply %2192, %60 : tensor<197x768xf32>
-    %2194 = stablehlo.broadcast_in_dim %2193, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2195 = stablehlo.broadcast_in_dim %arg209, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2196 = stablehlo.add %2194, %2195 : tensor<197x768xf32>
-    %2197 = stablehlo.convert %2196 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2198 = stablehlo.reshape %2197 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2199 = stablehlo.broadcast_in_dim %arg68, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %2200 = stablehlo.broadcast_in_dim %2198, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2201 = stablehlo.multiply %2199, %2200 : tensor<1x197x768xbf16>
-    %2202 = stablehlo.add %2201, %2110 : tensor<1x197x768xbf16>
-    %2203 = stablehlo.convert %2202 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %2204 = stablehlo.convert %2203 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %2205 = stablehlo.reduce(%2204 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2206 = stablehlo.reshape %2205 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2207 = stablehlo.broadcast_in_dim %2206, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2208 = stablehlo.divide %2207, %15 : tensor<1x197x1xf64>
-    %2209 = stablehlo.broadcast_in_dim %2204, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %2210 = stablehlo.broadcast_in_dim %2208, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %2211 = stablehlo.subtract %2209, %2210 : tensor<1x197x768xf64>
-    %2212 = stablehlo.multiply %2211, %2211 : tensor<1x197x768xf64>
-    %2213 = stablehlo.reduce(%2212 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2214 = stablehlo.reshape %2213 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2215 = stablehlo.broadcast_in_dim %2214, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2216 = stablehlo.divide %2215, %15 : tensor<1x197x1xf64>
-    %2217 = stablehlo.convert %2216 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2218 = stablehlo.reduce(%2203 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2219 = stablehlo.reshape %2218 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2220 = stablehlo.broadcast_in_dim %2219, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2221 = stablehlo.divide %2220, %31 : tensor<1x197x1xf32>
-    %2222 = stablehlo.broadcast_in_dim %2217, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2223 = stablehlo.add %2222, %36 : tensor<1x197x1xf32>
-    %2224 = stablehlo.rsqrt %2223 : tensor<1x197x1xf32>
-    %2225 = stablehlo.broadcast_in_dim %2203, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2226 = stablehlo.broadcast_in_dim %2221, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2227 = stablehlo.subtract %2225, %2226 : tensor<1x197x768xf32>
-    %2228 = stablehlo.broadcast_in_dim %2227, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2229 = stablehlo.broadcast_in_dim %2224, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2230 = stablehlo.multiply %2228, %2229 : tensor<1x197x768xf32>
-    %2231 = stablehlo.convert %arg69 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2232 = stablehlo.broadcast_in_dim %2230, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2233 = stablehlo.broadcast_in_dim %2231, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2234 = stablehlo.multiply %2232, %2233 : tensor<1x197x768xf32>
-    %2235 = stablehlo.convert %arg70 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2236 = stablehlo.broadcast_in_dim %2234, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2237 = stablehlo.broadcast_in_dim %2235, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2238 = stablehlo.add %2236, %2237 : tensor<1x197x768xf32>
-    %2239 = stablehlo.convert %2238 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %2240 = stablehlo.reshape %2239 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %2241 = stablehlo.convert %2240 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %2242 = stablehlo.dot_general %2241, %arg210, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %2243 = stablehlo.broadcast_in_dim %2242, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2244 = stablehlo.multiply %2243, %60 : tensor<197x768xf32>
-    %2245 = stablehlo.broadcast_in_dim %2244, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2246 = stablehlo.broadcast_in_dim %arg211, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2247 = stablehlo.add %2245, %2246 : tensor<197x768xf32>
-    %2248 = stablehlo.convert %2247 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2249 = stablehlo.reshape %2248 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2250 = stablehlo.dot_general %2240, %arg212, contracting_dims = [1] x [0] : (tensor<197x768xbf16>, tensor<768x768xbf16>) -> tensor<197x768xbf16>
-    %2251 = stablehlo.reshape %2250 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2252 = stablehlo.reshape %2251 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %2253 = stablehlo.transpose %2252, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2254 = stablehlo.dot_general %2241, %arg213, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %2255 = stablehlo.broadcast_in_dim %2254, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2256 = stablehlo.multiply %2255, %60 : tensor<197x768xf32>
-    %2257 = stablehlo.broadcast_in_dim %2256, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2258 = stablehlo.broadcast_in_dim %arg214, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2259 = stablehlo.add %2257, %2258 : tensor<197x768xf32>
-    %2260 = stablehlo.convert %2259 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2261 = stablehlo.reshape %2260 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2262 = stablehlo.reshape %2261 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %2263 = stablehlo.transpose %2262, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2264 = stablehlo.reshape %2249 : (tensor<1x197x768xbf16>) -> tensor<1x197x12x64xbf16>
-    %2265 = stablehlo.transpose %2264, dims = [0, 2, 1, 3] : (tensor<1x197x12x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2266 = stablehlo.transpose %2253, dims = [0, 1, 3, 2] : (tensor<1x12x197x64xbf16>) -> tensor<1x12x64x197xbf16>
-    %2267 = stablehlo.reshape %2265 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2268 = stablehlo.reshape %2266 : (tensor<1x12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %2269 = stablehlo.broadcast_in_dim %2268, dims = [0, 1, 2] : (tensor<12x64x197xbf16>) -> tensor<12x64x197xbf16>
-    %2270 = stablehlo.dot_general %2267, %2269, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x64xbf16>, tensor<12x64x197xbf16>) -> tensor<12x197x197xbf16>
-    %2271 = stablehlo.reshape %2270 : (tensor<12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %2272 = stablehlo.broadcast_in_dim %2271, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xbf16>
-    %2273 = stablehlo.divide %2272, %92 : tensor<1x12x197x197xbf16>
-    %2274 = stablehlo.add %2273, %arg215 : tensor<1x12x197x197xbf16>
-    %2275 = stablehlo.convert %2274 : (tensor<1x12x197x197xbf16>) -> tensor<1x12x197x197xf32>
-    %2276 = stablehlo.reduce(%2275 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %2277 = stablehlo.reshape %2276 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %2278 = stablehlo.broadcast_in_dim %2275, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %2279 = stablehlo.broadcast_in_dim %2277, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %2280 = stablehlo.subtract %2278, %2279 : tensor<1x12x197x197xf32>
-    %2281 = stablehlo.exponential %2280 : tensor<1x12x197x197xf32>
-    %2282 = stablehlo.reduce(%2281 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x12x197x197xf32>, tensor<f32>) -> tensor<1x12x197xf32>
-    %2283 = stablehlo.reshape %2282 : (tensor<1x12x197xf32>) -> tensor<1x12x197x1xf32>
-    %2284 = stablehlo.broadcast_in_dim %2281, dims = [0, 1, 2, 3] : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xf32>
-    %2285 = stablehlo.broadcast_in_dim %2283, dims = [0, 1, 2, 3] : (tensor<1x12x197x1xf32>) -> tensor<1x12x197x197xf32>
-    %2286 = stablehlo.divide %2284, %2285 : tensor<1x12x197x197xf32>
-    %2287 = stablehlo.convert %2286 : (tensor<1x12x197x197xf32>) -> tensor<1x12x197x197xbf16>
-    %2288 = stablehlo.reshape %2287 : (tensor<1x12x197x197xbf16>) -> tensor<12x197x197xbf16>
-    %2289 = stablehlo.reshape %2263 : (tensor<1x12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2290 = stablehlo.broadcast_in_dim %2289, dims = [0, 1, 2] : (tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2291 = stablehlo.dot_general %2288, %2290, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<12x197x197xbf16>, tensor<12x197x64xbf16>) -> tensor<12x197x64xbf16>
-    %2292 = stablehlo.reshape %2291 : (tensor<12x197x64xbf16>) -> tensor<1x12x197x64xbf16>
-    %2293 = stablehlo.transpose %2292, dims = [0, 2, 1, 3] : (tensor<1x12x197x64xbf16>) -> tensor<1x197x12x64xbf16>
-    %2294 = stablehlo.reshape %2293 : (tensor<1x197x12x64xbf16>) -> tensor<1x197x768xbf16>
-    %2295 = stablehlo.reshape %2294 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %2296 = stablehlo.convert %2295 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %2297 = stablehlo.dot_general %2296, %arg216, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x768xf32>) -> tensor<197x768xf32>
-    %2298 = stablehlo.broadcast_in_dim %2297, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2299 = stablehlo.multiply %2298, %60 : tensor<197x768xf32>
-    %2300 = stablehlo.broadcast_in_dim %2299, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2301 = stablehlo.broadcast_in_dim %arg217, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2302 = stablehlo.add %2300, %2301 : tensor<197x768xf32>
-    %2303 = stablehlo.convert %2302 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2304 = stablehlo.reshape %2303 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2305 = stablehlo.broadcast_in_dim %arg71, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %2306 = stablehlo.broadcast_in_dim %2304, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2307 = stablehlo.multiply %2305, %2306 : tensor<1x197x768xbf16>
-    %2308 = stablehlo.add %2307, %2202 : tensor<1x197x768xbf16>
-    %2309 = stablehlo.convert %2308 : (tensor<1x197x768xbf16>) -> tensor<1x197x768xf32>
-    %2310 = stablehlo.convert %2309 : (tensor<1x197x768xf32>) -> tensor<1x197x768xf64>
-    %2311 = stablehlo.reduce(%2310 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2312 = stablehlo.reshape %2311 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2313 = stablehlo.broadcast_in_dim %2312, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2314 = stablehlo.divide %2313, %15 : tensor<1x197x1xf64>
-    %2315 = stablehlo.broadcast_in_dim %2310, dims = [0, 1, 2] : (tensor<1x197x768xf64>) -> tensor<1x197x768xf64>
-    %2316 = stablehlo.broadcast_in_dim %2314, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x768xf64>
-    %2317 = stablehlo.subtract %2315, %2316 : tensor<1x197x768xf64>
-    %2318 = stablehlo.multiply %2317, %2317 : tensor<1x197x768xf64>
-    %2319 = stablehlo.reduce(%2318 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2320 = stablehlo.reshape %2319 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2321 = stablehlo.broadcast_in_dim %2320, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2322 = stablehlo.divide %2321, %15 : tensor<1x197x1xf64>
-    %2323 = stablehlo.convert %2322 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2324 = stablehlo.reduce(%2309 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x768xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2325 = stablehlo.reshape %2324 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2326 = stablehlo.broadcast_in_dim %2325, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2327 = stablehlo.divide %2326, %31 : tensor<1x197x1xf32>
-    %2328 = stablehlo.broadcast_in_dim %2323, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2329 = stablehlo.add %2328, %36 : tensor<1x197x1xf32>
-    %2330 = stablehlo.rsqrt %2329 : tensor<1x197x1xf32>
-    %2331 = stablehlo.broadcast_in_dim %2309, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2332 = stablehlo.broadcast_in_dim %2327, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2333 = stablehlo.subtract %2331, %2332 : tensor<1x197x768xf32>
-    %2334 = stablehlo.broadcast_in_dim %2333, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2335 = stablehlo.broadcast_in_dim %2330, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x768xf32>
-    %2336 = stablehlo.multiply %2334, %2335 : tensor<1x197x768xf32>
-    %2337 = stablehlo.convert %arg72 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2338 = stablehlo.broadcast_in_dim %2336, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2339 = stablehlo.broadcast_in_dim %2337, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2340 = stablehlo.multiply %2338, %2339 : tensor<1x197x768xf32>
-    %2341 = stablehlo.convert %arg73 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2342 = stablehlo.broadcast_in_dim %2340, dims = [0, 1, 2] : (tensor<1x197x768xf32>) -> tensor<1x197x768xf32>
-    %2343 = stablehlo.broadcast_in_dim %2341, dims = [2] : (tensor<768xf32>) -> tensor<1x197x768xf32>
-    %2344 = stablehlo.add %2342, %2343 : tensor<1x197x768xf32>
-    %2345 = stablehlo.convert %2344 : (tensor<1x197x768xf32>) -> tensor<1x197x768xbf16>
-    %2346 = stablehlo.reshape %2345 : (tensor<1x197x768xbf16>) -> tensor<197x768xbf16>
-    %2347 = stablehlo.convert %2346 : (tensor<197x768xbf16>) -> tensor<197x768xf32>
-    %2348 = stablehlo.dot_general %2347, %arg218, contracting_dims = [1] x [0] : (tensor<197x768xf32>, tensor<768x3072xf32>) -> tensor<197x3072xf32>
-    %2349 = stablehlo.broadcast_in_dim %2348, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %2350 = stablehlo.multiply %2349, %170 : tensor<197x3072xf32>
-    %2351 = stablehlo.broadcast_in_dim %2350, dims = [0, 1] : (tensor<197x3072xf32>) -> tensor<197x3072xf32>
-    %2352 = stablehlo.broadcast_in_dim %arg219, dims = [1] : (tensor<3072xf32>) -> tensor<197x3072xf32>
-    %2353 = stablehlo.add %2351, %2352 : tensor<197x3072xf32>
-    %2354 = stablehlo.convert %2353 : (tensor<197x3072xf32>) -> tensor<197x3072xbf16>
-    %2355 = stablehlo.reshape %2354 : (tensor<197x3072xbf16>) -> tensor<1x197x3072xbf16>
-    %2356 = stablehlo.multiply %2355, %cst_4 : tensor<1x197x3072xbf16>
-    %2357 = stablehlo.multiply %2355, %178 : tensor<1x197x3072xbf16>
-    %2358 = stablehlo.convert %2357 : (tensor<1x197x3072xbf16>) -> tensor<1x197x3072xf32>
-    %2359 = stablehlo.clamp %cst_5, %2358, %cst_6 : tensor<1x197x3072xf32>
-    %2360 = stablehlo.multiply %2359, %2359 : tensor<1x197x3072xf32>
-    %2361 = stablehlo.multiply %cst_7, %2360 : tensor<1x197x3072xf32>
-    %2362 = stablehlo.add %2361, %cst_8 : tensor<1x197x3072xf32>
-    %2363 = stablehlo.multiply %2362, %2360 : tensor<1x197x3072xf32>
-    %2364 = stablehlo.add %2363, %cst_9 : tensor<1x197x3072xf32>
-    %2365 = stablehlo.multiply %2364, %2360 : tensor<1x197x3072xf32>
-    %2366 = stablehlo.add %2365, %cst_10 : tensor<1x197x3072xf32>
-    %2367 = stablehlo.multiply %2366, %2360 : tensor<1x197x3072xf32>
-    %2368 = stablehlo.add %2367, %cst_11 : tensor<1x197x3072xf32>
-    %2369 = stablehlo.multiply %2368, %2360 : tensor<1x197x3072xf32>
-    %2370 = stablehlo.add %2369, %cst_12 : tensor<1x197x3072xf32>
-    %2371 = stablehlo.multiply %2370, %2360 : tensor<1x197x3072xf32>
-    %2372 = stablehlo.add %2371, %cst_13 : tensor<1x197x3072xf32>
-    %2373 = stablehlo.multiply %cst_14, %2360 : tensor<1x197x3072xf32>
-    %2374 = stablehlo.add %2373, %cst_15 : tensor<1x197x3072xf32>
-    %2375 = stablehlo.multiply %2374, %2360 : tensor<1x197x3072xf32>
-    %2376 = stablehlo.add %2375, %cst_16 : tensor<1x197x3072xf32>
-    %2377 = stablehlo.multiply %2376, %2360 : tensor<1x197x3072xf32>
-    %2378 = stablehlo.add %2377, %cst_17 : tensor<1x197x3072xf32>
-    %2379 = stablehlo.multiply %2378, %2360 : tensor<1x197x3072xf32>
-    %2380 = stablehlo.add %2379, %cst_18 : tensor<1x197x3072xf32>
-    %2381 = stablehlo.multiply %2359, %2372 : tensor<1x197x3072xf32>
-    %2382 = stablehlo.divide %2381, %2380 : tensor<1x197x3072xf32>
-    %2383 = stablehlo.clamp %cst_19, %2382, %cst_20 : tensor<1x197x3072xf32>
-    %2384 = stablehlo.convert %2383 : (tensor<1x197x3072xf32>) -> tensor<1x197x3072xbf16>
-    %2385 = stablehlo.add %2384, %cst_2 : tensor<1x197x3072xbf16>
-    %2386 = stablehlo.multiply %2385, %2356 : tensor<1x197x3072xbf16>
-    %2387 = stablehlo.reshape %2386 : (tensor<1x197x3072xbf16>) -> tensor<197x3072xbf16>
-    %2388 = stablehlo.convert %2387 : (tensor<197x3072xbf16>) -> tensor<197x3072xf32>
-    %2389 = stablehlo.dot_general %2388, %arg220, contracting_dims = [1] x [0] : (tensor<197x3072xf32>, tensor<3072x768xf32>) -> tensor<197x768xf32>
-    %2390 = stablehlo.broadcast_in_dim %2389, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2391 = stablehlo.multiply %2390, %60 : tensor<197x768xf32>
-    %2392 = stablehlo.broadcast_in_dim %2391, dims = [0, 1] : (tensor<197x768xf32>) -> tensor<197x768xf32>
-    %2393 = stablehlo.broadcast_in_dim %arg221, dims = [1] : (tensor<768xf32>) -> tensor<197x768xf32>
-    %2394 = stablehlo.add %2392, %2393 : tensor<197x768xf32>
-    %2395 = stablehlo.convert %2394 : (tensor<197x768xf32>) -> tensor<197x768xbf16>
-    %2396 = stablehlo.reshape %2395 : (tensor<197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2397 = stablehlo.broadcast_in_dim %arg74, dims = [2] : (tensor<768xbf16>) -> tensor<1x197x768xbf16>
-    %2398 = stablehlo.broadcast_in_dim %2396, dims = [0, 1, 2] : (tensor<1x197x768xbf16>) -> tensor<1x197x768xbf16>
-    %2399 = stablehlo.multiply %2397, %2398 : tensor<1x197x768xbf16>
-    %2400 = stablehlo.add %2399, %2308 : tensor<1x197x768xbf16>
-    %2401 = stablehlo.slice %2400 [0:1, 1:197, 0:768] : (tensor<1x197x768xbf16>) -> tensor<1x196x768xbf16>
-    %2402 = stablehlo.reduce(%2401 init: %cst_21) applies stablehlo.add across dimensions = [1] : (tensor<1x196x768xbf16>, tensor<bf16>) -> tensor<1x768xbf16>
-    %2403 = stablehlo.convert %cst_26 : (tensor<1xi64>) -> tensor<1xbf16>
-    %2404 = stablehlo.reshape %2403 : (tensor<1xbf16>) -> tensor<bf16>
-    %2405 = stablehlo.broadcast_in_dim %2402, dims = [0, 1] : (tensor<1x768xbf16>) -> tensor<1x768xbf16>
-    %2406 = stablehlo.broadcast_in_dim %2404, dims = [] : (tensor<bf16>) -> tensor<1x768xbf16>
-    %2407 = stablehlo.divide %2405, %2406 : tensor<1x768xbf16>
-    %2408 = stablehlo.convert %2407 : (tensor<1x768xbf16>) -> tensor<1x768xf32>
-    %2409 = stablehlo.convert %2408 : (tensor<1x768xf32>) -> tensor<1x768xf64>
-    %2410 = stablehlo.reduce(%2409 init: %cst) applies stablehlo.add across dimensions = [1] : (tensor<1x768xf64>, tensor<f64>) -> tensor<1xf64>
-    %2411 = stablehlo.reshape %2410 : (tensor<1xf64>) -> tensor<1x1xf64>
-    %2412 = stablehlo.broadcast_in_dim %2411, dims = [0, 1] : (tensor<1x1xf64>) -> tensor<1x1xf64>
-    %2413 = stablehlo.broadcast_in_dim %13, dims = [] : (tensor<f64>) -> tensor<1x1xf64>
-    %2414 = stablehlo.divide %2412, %2413 : tensor<1x1xf64>
-    %2415 = stablehlo.broadcast_in_dim %2409, dims = [0, 1] : (tensor<1x768xf64>) -> tensor<1x768xf64>
-    %2416 = stablehlo.broadcast_in_dim %2414, dims = [0, 1] : (tensor<1x1xf64>) -> tensor<1x768xf64>
-    %2417 = stablehlo.subtract %2415, %2416 : tensor<1x768xf64>
-    %2418 = stablehlo.multiply %2417, %2417 : tensor<1x768xf64>
-    %2419 = stablehlo.reduce(%2418 init: %cst) applies stablehlo.add across dimensions = [1] : (tensor<1x768xf64>, tensor<f64>) -> tensor<1xf64>
-    %2420 = stablehlo.reshape %2419 : (tensor<1xf64>) -> tensor<1x1xf64>
-    %2421 = stablehlo.broadcast_in_dim %2420, dims = [0, 1] : (tensor<1x1xf64>) -> tensor<1x1xf64>
-    %2422 = stablehlo.divide %2421, %2413 : tensor<1x1xf64>
-    %2423 = stablehlo.convert %2422 : (tensor<1x1xf64>) -> tensor<1x1xf32>
-    %2424 = stablehlo.reduce(%2408 init: %cst_0) applies stablehlo.add across dimensions = [1] : (tensor<1x768xf32>, tensor<f32>) -> tensor<1xf32>
-    %2425 = stablehlo.reshape %2424 : (tensor<1xf32>) -> tensor<1x1xf32>
-    %2426 = stablehlo.broadcast_in_dim %2425, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x1xf32>
-    %2427 = stablehlo.broadcast_in_dim %29, dims = [] : (tensor<f32>) -> tensor<1x1xf32>
-    %2428 = stablehlo.divide %2426, %2427 : tensor<1x1xf32>
-    %2429 = stablehlo.broadcast_in_dim %2423, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x1xf32>
-    %2430 = stablehlo.broadcast_in_dim %34, dims = [] : (tensor<f32>) -> tensor<1x1xf32>
-    %2431 = stablehlo.add %2429, %2430 : tensor<1x1xf32>
-    %2432 = stablehlo.rsqrt %2431 : tensor<1x1xf32>
-    %2433 = stablehlo.broadcast_in_dim %2408, dims = [0, 1] : (tensor<1x768xf32>) -> tensor<1x768xf32>
-    %2434 = stablehlo.broadcast_in_dim %2428, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x768xf32>
-    %2435 = stablehlo.subtract %2433, %2434 : tensor<1x768xf32>
-    %2436 = stablehlo.broadcast_in_dim %2435, dims = [0, 1] : (tensor<1x768xf32>) -> tensor<1x768xf32>
-    %2437 = stablehlo.broadcast_in_dim %2432, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x768xf32>
-    %2438 = stablehlo.multiply %2436, %2437 : tensor<1x768xf32>
-    %2439 = stablehlo.convert %arg75 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2440 = stablehlo.broadcast_in_dim %2438, dims = [0, 1] : (tensor<1x768xf32>) -> tensor<1x768xf32>
-    %2441 = stablehlo.broadcast_in_dim %2439, dims = [1] : (tensor<768xf32>) -> tensor<1x768xf32>
-    %2442 = stablehlo.multiply %2440, %2441 : tensor<1x768xf32>
-    %2443 = stablehlo.convert %arg76 : (tensor<768xbf16>) -> tensor<768xf32>
-    %2444 = stablehlo.broadcast_in_dim %2442, dims = [0, 1] : (tensor<1x768xf32>) -> tensor<1x768xf32>
-    %2445 = stablehlo.broadcast_in_dim %2443, dims = [1] : (tensor<768xf32>) -> tensor<1x768xf32>
-    %2446 = stablehlo.add %2444, %2445 : tensor<1x768xf32>
-    %2447 = stablehlo.convert %2446 : (tensor<1x768xf32>) -> tensor<1x768xbf16>
-    %2448 = stablehlo.convert %2447 : (tensor<1x768xbf16>) -> tensor<1x768xf32>
-    %2449 = stablehlo.dot_general %2448, %arg222, contracting_dims = [1] x [0] : (tensor<1x768xf32>, tensor<768x1000xf32>) -> tensor<1x1000xf32>
-    %2450 = stablehlo.broadcast_in_dim %2449, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %2451 = stablehlo.broadcast_in_dim %58, dims = [] : (tensor<f32>) -> tensor<1x1000xf32>
-    %2452 = stablehlo.multiply %2450, %2451 : tensor<1x1000xf32>
-    %2453 = stablehlo.broadcast_in_dim %2452, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %2454 = stablehlo.broadcast_in_dim %arg223, dims = [1] : (tensor<1000xf32>) -> tensor<1x1000xf32>
-    %2455 = stablehlo.add %2453, %2454 : tensor<1x1000xf32>
-    %2456 = stablehlo.convert %2455 : (tensor<1x1000xf32>) -> tensor<1x1000xbf16>
-    return %2456 : tensor<1x1000xbf16>
-  }
-}
diff --git a/mlir_tests/microsoftbeit-large-patch16-224.mlir b/mlir_tests/microsoftbeit-large-patch16-224.mlir
deleted file mode 100644
index 63af8c7b..00000000
--- a/mlir_tests/microsoftbeit-large-patch16-224.mlir
+++ /dev/null
@@ -1,4866 +0,0 @@
-module {
-  func.func @main(%arg0: tensor<1x3x224x224xbf16>, %arg1: tensor<1024x3x16x16xbf16>, %arg2: tensor<1024xbf16>, %arg3: tensor<1024xbf16>, %arg4: tensor<1024xbf16>, %arg5: tensor<1024xbf16>, %arg6: tensor<1024xbf16>, %arg7: tensor<1024xbf16>, %arg8: tensor<1024xbf16>, %arg9: tensor<1024xbf16>, %arg10: tensor<1024xbf16>, %arg11: tensor<1024xbf16>, %arg12: tensor<1024xbf16>, %arg13: tensor<1024xbf16>, %arg14: tensor<1024xbf16>, %arg15: tensor<1024xbf16>, %arg16: tensor<1024xbf16>, %arg17: tensor<1024xbf16>, %arg18: tensor<1024xbf16>, %arg19: tensor<1024xbf16>, %arg20: tensor<1024xbf16>, %arg21: tensor<1024xbf16>, %arg22: tensor<1024xbf16>, %arg23: tensor<1024xbf16>, %arg24: tensor<1024xbf16>, %arg25: tensor<1024xbf16>, %arg26: tensor<1024xbf16>, %arg27: tensor<1024xbf16>, %arg28: tensor<1024xbf16>, %arg29: tensor<1024xbf16>, %arg30: tensor<1024xbf16>, %arg31: tensor<1024xbf16>, %arg32: tensor<1024xbf16>, %arg33: tensor<1024xbf16>, %arg34: tensor<1024xbf16>, %arg35: tensor<1024xbf16>, %arg36: tensor<1024xbf16>, %arg37: tensor<1024xbf16>, %arg38: tensor<1024xbf16>, %arg39: tensor<1024xbf16>, %arg40: tensor<1024xbf16>, %arg41: tensor<1024xbf16>, %arg42: tensor<1024xbf16>, %arg43: tensor<1024xbf16>, %arg44: tensor<1024xbf16>, %arg45: tensor<1024xbf16>, %arg46: tensor<1024xbf16>, %arg47: tensor<1024xbf16>, %arg48: tensor<1024xbf16>, %arg49: tensor<1024xbf16>, %arg50: tensor<1024xbf16>, %arg51: tensor<1024xbf16>, %arg52: tensor<1024xbf16>, %arg53: tensor<1024xbf16>, %arg54: tensor<1024xbf16>, %arg55: tensor<1024xbf16>, %arg56: tensor<1024xbf16>, %arg57: tensor<1024xbf16>, %arg58: tensor<1024xbf16>, %arg59: tensor<1024xbf16>, %arg60: tensor<1024xbf16>, %arg61: tensor<1024xbf16>, %arg62: tensor<1024xbf16>, %arg63: tensor<1024xbf16>, %arg64: tensor<1024xbf16>, %arg65: tensor<1024xbf16>, %arg66: tensor<1024xbf16>, %arg67: tensor<1024xbf16>, %arg68: tensor<1024xbf16>, %arg69: tensor<1024xbf16>, %arg70: tensor<1024xbf16>, %arg71: tensor<1024xbf16>, %arg72: tensor<1024xbf16>, %arg73: tensor<1024xbf16>, %arg74: tensor<1024xbf16>, %arg75: tensor<1024xbf16>, %arg76: tensor<1024xbf16>, %arg77: tensor<1024xbf16>, %arg78: tensor<1024xbf16>, %arg79: tensor<1024xbf16>, %arg80: tensor<1024xbf16>, %arg81: tensor<1024xbf16>, %arg82: tensor<1024xbf16>, %arg83: tensor<1024xbf16>, %arg84: tensor<1024xbf16>, %arg85: tensor<1024xbf16>, %arg86: tensor<1024xbf16>, %arg87: tensor<1024xbf16>, %arg88: tensor<1024xbf16>, %arg89: tensor<1024xbf16>, %arg90: tensor<1024xbf16>, %arg91: tensor<1024xbf16>, %arg92: tensor<1024xbf16>, %arg93: tensor<1024xbf16>, %arg94: tensor<1024xbf16>, %arg95: tensor<1024xbf16>, %arg96: tensor<1024xbf16>, %arg97: tensor<1024xbf16>, %arg98: tensor<1024xbf16>, %arg99: tensor<1024xbf16>, %arg100: tensor<1024xbf16>, %arg101: tensor<1024xbf16>, %arg102: tensor<1024xbf16>, %arg103: tensor<1024xbf16>, %arg104: tensor<1024xbf16>, %arg105: tensor<1024xbf16>, %arg106: tensor<1024xbf16>, %arg107: tensor<1024xbf16>, %arg108: tensor<1024xbf16>, %arg109: tensor<1024xbf16>, %arg110: tensor<1024xbf16>, %arg111: tensor<1024xbf16>, %arg112: tensor<1024xbf16>, %arg113: tensor<1024xbf16>, %arg114: tensor<1024xbf16>, %arg115: tensor<1024xbf16>, %arg116: tensor<1024xbf16>, %arg117: tensor<1024xbf16>, %arg118: tensor<1024xbf16>, %arg119: tensor<1024xbf16>, %arg120: tensor<1024xbf16>, %arg121: tensor<1024xbf16>, %arg122: tensor<1024xbf16>, %arg123: tensor<1024xbf16>, %arg124: tensor<1024xbf16>, %arg125: tensor<1024xbf16>, %arg126: tensor<1024xbf16>, %arg127: tensor<1024xbf16>, %arg128: tensor<1024xbf16>, %arg129: tensor<1024xbf16>, %arg130: tensor<1024xbf16>, %arg131: tensor<1024xbf16>, %arg132: tensor<1024xbf16>, %arg133: tensor<1024xbf16>, %arg134: tensor<1024xbf16>, %arg135: tensor<1024xbf16>, %arg136: tensor<1024xbf16>, %arg137: tensor<1024xbf16>, %arg138: tensor<1024xbf16>, %arg139: tensor<1024xbf16>, %arg140: tensor<1024xbf16>, %arg141: tensor<1024xbf16>, %arg142: tensor<1024xbf16>, %arg143: tensor<1024xbf16>, %arg144: tensor<1024xbf16>, %arg145: tensor<1024xbf16>, %arg146: tensor<1024xbf16>, %arg147: tensor<1024xbf16>, %arg148: tensor<1024xbf16>, %arg149: tensor<1x1x1024xbf16>, %arg150: tensor<1024x1024xf32>, %arg151: tensor<1024xf32>, %arg152: tensor<1024x1024xbf16>, %arg153: tensor<1024x1024xf32>, %arg154: tensor<1024xf32>, %arg155: tensor<1x16x197x197xbf16>, %arg156: tensor<1024x1024xf32>, %arg157: tensor<1024xf32>, %arg158: tensor<1024x4096xf32>, %arg159: tensor<4096xf32>, %arg160: tensor<4096x1024xf32>, %arg161: tensor<1024xf32>, %arg162: tensor<1024x1024xf32>, %arg163: tensor<1024xf32>, %arg164: tensor<1024x1024xbf16>, %arg165: tensor<1024x1024xf32>, %arg166: tensor<1024xf32>, %arg167: tensor<1x16x197x197xbf16>, %arg168: tensor<1024x1024xf32>, %arg169: tensor<1024xf32>, %arg170: tensor<1024x4096xf32>, %arg171: tensor<4096xf32>, %arg172: tensor<4096x1024xf32>, %arg173: tensor<1024xf32>, %arg174: tensor<1024x1024xf32>, %arg175: tensor<1024xf32>, %arg176: tensor<1024x1024xbf16>, %arg177: tensor<1024x1024xf32>, %arg178: tensor<1024xf32>, %arg179: tensor<1x16x197x197xbf16>, %arg180: tensor<1024x1024xf32>, %arg181: tensor<1024xf32>, %arg182: tensor<1024x4096xf32>, %arg183: tensor<4096xf32>, %arg184: tensor<4096x1024xf32>, %arg185: tensor<1024xf32>, %arg186: tensor<1024x1024xf32>, %arg187: tensor<1024xf32>, %arg188: tensor<1024x1024xbf16>, %arg189: tensor<1024x1024xf32>, %arg190: tensor<1024xf32>, %arg191: tensor<1x16x197x197xbf16>, %arg192: tensor<1024x1024xf32>, %arg193: tensor<1024xf32>, %arg194: tensor<1024x4096xf32>, %arg195: tensor<4096xf32>, %arg196: tensor<4096x1024xf32>, %arg197: tensor<1024xf32>, %arg198: tensor<1024x1024xf32>, %arg199: tensor<1024xf32>, %arg200: tensor<1024x1024xbf16>, %arg201: tensor<1024x1024xf32>, %arg202: tensor<1024xf32>, %arg203: tensor<1x16x197x197xbf16>, %arg204: tensor<1024x1024xf32>, %arg205: tensor<1024xf32>, %arg206: tensor<1024x4096xf32>, %arg207: tensor<4096xf32>, %arg208: tensor<4096x1024xf32>, %arg209: tensor<1024xf32>, %arg210: tensor<1024x1024xf32>, %arg211: tensor<1024xf32>, %arg212: tensor<1024x1024xbf16>, %arg213: tensor<1024x1024xf32>, %arg214: tensor<1024xf32>, %arg215: tensor<1x16x197x197xbf16>, %arg216: tensor<1024x1024xf32>, %arg217: tensor<1024xf32>, %arg218: tensor<1024x4096xf32>, %arg219: tensor<4096xf32>, %arg220: tensor<4096x1024xf32>, %arg221: tensor<1024xf32>, %arg222: tensor<1024x1024xf32>, %arg223: tensor<1024xf32>, %arg224: tensor<1024x1024xbf16>, %arg225: tensor<1024x1024xf32>, %arg226: tensor<1024xf32>, %arg227: tensor<1x16x197x197xbf16>, %arg228: tensor<1024x1024xf32>, %arg229: tensor<1024xf32>, %arg230: tensor<1024x4096xf32>, %arg231: tensor<4096xf32>, %arg232: tensor<4096x1024xf32>, %arg233: tensor<1024xf32>, %arg234: tensor<1024x1024xf32>, %arg235: tensor<1024xf32>, %arg236: tensor<1024x1024xbf16>, %arg237: tensor<1024x1024xf32>, %arg238: tensor<1024xf32>, %arg239: tensor<1x16x197x197xbf16>, %arg240: tensor<1024x1024xf32>, %arg241: tensor<1024xf32>, %arg242: tensor<1024x4096xf32>, %arg243: tensor<4096xf32>, %arg244: tensor<4096x1024xf32>, %arg245: tensor<1024xf32>, %arg246: tensor<1024x1024xf32>, %arg247: tensor<1024xf32>, %arg248: tensor<1024x1024xbf16>, %arg249: tensor<1024x1024xf32>, %arg250: tensor<1024xf32>, %arg251: tensor<1x16x197x197xbf16>, %arg252: tensor<1024x1024xf32>, %arg253: tensor<1024xf32>, %arg254: tensor<1024x4096xf32>, %arg255: tensor<4096xf32>, %arg256: tensor<4096x1024xf32>, %arg257: tensor<1024xf32>, %arg258: tensor<1024x1024xf32>, %arg259: tensor<1024xf32>, %arg260: tensor<1024x1024xbf16>, %arg261: tensor<1024x1024xf32>, %arg262: tensor<1024xf32>, %arg263: tensor<1x16x197x197xbf16>, %arg264: tensor<1024x1024xf32>, %arg265: tensor<1024xf32>, %arg266: tensor<1024x4096xf32>, %arg267: tensor<4096xf32>, %arg268: tensor<4096x1024xf32>, %arg269: tensor<1024xf32>, %arg270: tensor<1024x1024xf32>, %arg271: tensor<1024xf32>, %arg272: tensor<1024x1024xbf16>, %arg273: tensor<1024x1024xf32>, %arg274: tensor<1024xf32>, %arg275: tensor<1x16x197x197xbf16>, %arg276: tensor<1024x1024xf32>, %arg277: tensor<1024xf32>, %arg278: tensor<1024x4096xf32>, %arg279: tensor<4096xf32>, %arg280: tensor<4096x1024xf32>, %arg281: tensor<1024xf32>, %arg282: tensor<1024x1024xf32>, %arg283: tensor<1024xf32>, %arg284: tensor<1024x1024xbf16>, %arg285: tensor<1024x1024xf32>, %arg286: tensor<1024xf32>, %arg287: tensor<1x16x197x197xbf16>, %arg288: tensor<1024x1024xf32>, %arg289: tensor<1024xf32>, %arg290: tensor<1024x4096xf32>, %arg291: tensor<4096xf32>, %arg292: tensor<4096x1024xf32>, %arg293: tensor<1024xf32>, %arg294: tensor<1024x1024xf32>, %arg295: tensor<1024xf32>, %arg296: tensor<1024x1024xbf16>, %arg297: tensor<1024x1024xf32>, %arg298: tensor<1024xf32>, %arg299: tensor<1x16x197x197xbf16>, %arg300: tensor<1024x1024xf32>, %arg301: tensor<1024xf32>, %arg302: tensor<1024x4096xf32>, %arg303: tensor<4096xf32>, %arg304: tensor<4096x1024xf32>, %arg305: tensor<1024xf32>, %arg306: tensor<1024x1024xf32>, %arg307: tensor<1024xf32>, %arg308: tensor<1024x1024xbf16>, %arg309: tensor<1024x1024xf32>, %arg310: tensor<1024xf32>, %arg311: tensor<1x16x197x197xbf16>, %arg312: tensor<1024x1024xf32>, %arg313: tensor<1024xf32>, %arg314: tensor<1024x4096xf32>, %arg315: tensor<4096xf32>, %arg316: tensor<4096x1024xf32>, %arg317: tensor<1024xf32>, %arg318: tensor<1024x1024xf32>, %arg319: tensor<1024xf32>, %arg320: tensor<1024x1024xbf16>, %arg321: tensor<1024x1024xf32>, %arg322: tensor<1024xf32>, %arg323: tensor<1x16x197x197xbf16>, %arg324: tensor<1024x1024xf32>, %arg325: tensor<1024xf32>, %arg326: tensor<1024x4096xf32>, %arg327: tensor<4096xf32>, %arg328: tensor<4096x1024xf32>, %arg329: tensor<1024xf32>, %arg330: tensor<1024x1024xf32>, %arg331: tensor<1024xf32>, %arg332: tensor<1024x1024xbf16>, %arg333: tensor<1024x1024xf32>, %arg334: tensor<1024xf32>, %arg335: tensor<1x16x197x197xbf16>, %arg336: tensor<1024x1024xf32>, %arg337: tensor<1024xf32>, %arg338: tensor<1024x4096xf32>, %arg339: tensor<4096xf32>, %arg340: tensor<4096x1024xf32>, %arg341: tensor<1024xf32>, %arg342: tensor<1024x1024xf32>, %arg343: tensor<1024xf32>, %arg344: tensor<1024x1024xbf16>, %arg345: tensor<1024x1024xf32>, %arg346: tensor<1024xf32>, %arg347: tensor<1x16x197x197xbf16>, %arg348: tensor<1024x1024xf32>, %arg349: tensor<1024xf32>, %arg350: tensor<1024x4096xf32>, %arg351: tensor<4096xf32>, %arg352: tensor<4096x1024xf32>, %arg353: tensor<1024xf32>, %arg354: tensor<1024x1024xf32>, %arg355: tensor<1024xf32>, %arg356: tensor<1024x1024xbf16>, %arg357: tensor<1024x1024xf32>, %arg358: tensor<1024xf32>, %arg359: tensor<1x16x197x197xbf16>, %arg360: tensor<1024x1024xf32>, %arg361: tensor<1024xf32>, %arg362: tensor<1024x4096xf32>, %arg363: tensor<4096xf32>, %arg364: tensor<4096x1024xf32>, %arg365: tensor<1024xf32>, %arg366: tensor<1024x1024xf32>, %arg367: tensor<1024xf32>, %arg368: tensor<1024x1024xbf16>, %arg369: tensor<1024x1024xf32>, %arg370: tensor<1024xf32>, %arg371: tensor<1x16x197x197xbf16>, %arg372: tensor<1024x1024xf32>, %arg373: tensor<1024xf32>, %arg374: tensor<1024x4096xf32>, %arg375: tensor<4096xf32>, %arg376: tensor<4096x1024xf32>, %arg377: tensor<1024xf32>, %arg378: tensor<1024x1024xf32>, %arg379: tensor<1024xf32>, %arg380: tensor<1024x1024xbf16>, %arg381: tensor<1024x1024xf32>, %arg382: tensor<1024xf32>, %arg383: tensor<1x16x197x197xbf16>, %arg384: tensor<1024x1024xf32>, %arg385: tensor<1024xf32>, %arg386: tensor<1024x4096xf32>, %arg387: tensor<4096xf32>, %arg388: tensor<4096x1024xf32>, %arg389: tensor<1024xf32>, %arg390: tensor<1024x1024xf32>, %arg391: tensor<1024xf32>, %arg392: tensor<1024x1024xbf16>, %arg393: tensor<1024x1024xf32>, %arg394: tensor<1024xf32>, %arg395: tensor<1x16x197x197xbf16>, %arg396: tensor<1024x1024xf32>, %arg397: tensor<1024xf32>, %arg398: tensor<1024x4096xf32>, %arg399: tensor<4096xf32>, %arg400: tensor<4096x1024xf32>, %arg401: tensor<1024xf32>, %arg402: tensor<1024x1024xf32>, %arg403: tensor<1024xf32>, %arg404: tensor<1024x1024xbf16>, %arg405: tensor<1024x1024xf32>, %arg406: tensor<1024xf32>, %arg407: tensor<1x16x197x197xbf16>, %arg408: tensor<1024x1024xf32>, %arg409: tensor<1024xf32>, %arg410: tensor<1024x4096xf32>, %arg411: tensor<4096xf32>, %arg412: tensor<4096x1024xf32>, %arg413: tensor<1024xf32>, %arg414: tensor<1024x1024xf32>, %arg415: tensor<1024xf32>, %arg416: tensor<1024x1024xbf16>, %arg417: tensor<1024x1024xf32>, %arg418: tensor<1024xf32>, %arg419: tensor<1x16x197x197xbf16>, %arg420: tensor<1024x1024xf32>, %arg421: tensor<1024xf32>, %arg422: tensor<1024x4096xf32>, %arg423: tensor<4096xf32>, %arg424: tensor<4096x1024xf32>, %arg425: tensor<1024xf32>, %arg426: tensor<1024x1024xf32>, %arg427: tensor<1024xf32>, %arg428: tensor<1024x1024xbf16>, %arg429: tensor<1024x1024xf32>, %arg430: tensor<1024xf32>, %arg431: tensor<1x16x197x197xbf16>, %arg432: tensor<1024x1024xf32>, %arg433: tensor<1024xf32>, %arg434: tensor<1024x4096xf32>, %arg435: tensor<4096xf32>, %arg436: tensor<4096x1024xf32>, %arg437: tensor<1024xf32>, %arg438: tensor<1024x1000xf32>, %arg439: tensor<1000xf32>) -> tensor<1x1000xbf16> {
-    %cst = stablehlo.constant dense<0.000000e+00> : tensor<f64>
-    %cst_0 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
-    %cst_1 = stablehlo.constant dense<0xFF800000> : tensor<f32>
-    %cst_2 = stablehlo.constant dense<1.000000e+00> : tensor<1x197x4096xbf16>
-    %cst_3 = stablehlo.constant dense<2.000000e+00> : tensor<1x197x4096xbf16>
-    %cst_4 = stablehlo.constant dense<5.000000e-01> : tensor<1x197x4096xbf16>
-    %cst_5 = stablehlo.constant dense<-4.000000e+00> : tensor<1x197x4096xf32>
-    %cst_6 = stablehlo.constant dense<4.000000e+00> : tensor<1x197x4096xf32>
-    %cst_7 = stablehlo.constant dense<-2.72614237E-10> : tensor<1x197x4096xf32>
-    %cst_8 = stablehlo.constant dense<2.77068146E-8> : tensor<1x197x4096xf32>
-    %cst_9 = stablehlo.constant dense<-2.10102394E-6> : tensor<1x197x4096xf32>
-    %cst_10 = stablehlo.constant dense<-5.69250624E-5> : tensor<1x197x4096xf32>
-    %cst_11 = stablehlo.constant dense<-7.34990637E-4> : tensor<1x197x4096xf32>
-    %cst_12 = stablehlo.constant dense<-2.954600e-03> : tensor<1x197x4096xf32>
-    %cst_13 = stablehlo.constant dense<-0.0160960332> : tensor<1x197x4096xf32>
-    %cst_14 = stablehlo.constant dense<-1.45660715E-5> : tensor<1x197x4096xf32>
-    %cst_15 = stablehlo.constant dense<-2.13374049E-4> : tensor<1x197x4096xf32>
-    %cst_16 = stablehlo.constant dense<-0.00168282702> : tensor<1x197x4096xf32>
-    %cst_17 = stablehlo.constant dense<-0.00737332925> : tensor<1x197x4096xf32>
-    %cst_18 = stablehlo.constant dense<-0.0142647391> : tensor<1x197x4096xf32>
-    %cst_19 = stablehlo.constant dense<-1.000000e+00> : tensor<1x197x4096xf32>
-    %cst_20 = stablehlo.constant dense<1.000000e+00> : tensor<1x197x4096xf32>
-    %cst_21 = stablehlo.constant dense<0.000000e+00> : tensor<bf16>
-    %cst_22 = arith.constant dense<1024> : tensor<1xi64>
-    %cst_23 = arith.constant dense<9.9999999999999998E-13> : tensor<1xf64>
-    %cst_24 = arith.constant dense<1> : tensor<1xi64>
-    %cst_25 = arith.constant dense<8.000000e+00> : tensor<1xf64>
-    %cst_26 = arith.constant dense<196> : tensor<1xi64>
-    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [16, 16], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x224x224xbf16>, tensor<1024x3x16x16xbf16>) -> tensor<1x1024x14x14xbf16>
-    %1 = stablehlo.reshape %arg2 : (tensor<1024xbf16>) -> tensor<1024x1x1xbf16>
-    %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2, 3] : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x14x14xbf16>
-    %3 = stablehlo.broadcast_in_dim %1, dims = [1, 2, 3] : (tensor<1024x1x1xbf16>) -> tensor<1x1024x14x14xbf16>
-    %4 = stablehlo.add %2, %3 : tensor<1x1024x14x14xbf16>
-    %5 = stablehlo.reshape %4 : (tensor<1x1024x14x14xbf16>) -> tensor<1x1024x196xbf16>
-    %6 = stablehlo.transpose %5, dims = [0, 2, 1] : (tensor<1x1024x196xbf16>) -> tensor<1x196x1024xbf16>
-    %7 = stablehlo.concatenate %arg149, %6, dim = 1 : (tensor<1x1x1024xbf16>, tensor<1x196x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %8 = stablehlo.convert %7 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %9 = stablehlo.convert %8 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %10 = stablehlo.reduce(%9 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %11 = stablehlo.reshape %10 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %12 = stablehlo.convert %cst_22 : (tensor<1xi64>) -> tensor<1xf64>
-    %13 = stablehlo.reshape %12 : (tensor<1xf64>) -> tensor<f64>
-    %14 = stablehlo.broadcast_in_dim %11, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %15 = stablehlo.broadcast_in_dim %13, dims = [] : (tensor<f64>) -> tensor<1x197x1xf64>
-    %16 = stablehlo.divide %14, %15 : tensor<1x197x1xf64>
-    %17 = stablehlo.broadcast_in_dim %9, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %18 = stablehlo.broadcast_in_dim %16, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %19 = stablehlo.subtract %17, %18 : tensor<1x197x1024xf64>
-    %20 = stablehlo.multiply %19, %19 : tensor<1x197x1024xf64>
-    %21 = stablehlo.reduce(%20 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %22 = stablehlo.reshape %21 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %23 = stablehlo.broadcast_in_dim %22, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %24 = stablehlo.divide %23, %15 : tensor<1x197x1xf64>
-    %25 = stablehlo.convert %24 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %26 = stablehlo.reduce(%8 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %27 = stablehlo.reshape %26 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %28 = stablehlo.convert %cst_22 : (tensor<1xi64>) -> tensor<1xf32>
-    %29 = stablehlo.reshape %28 : (tensor<1xf32>) -> tensor<f32>
-    %30 = stablehlo.broadcast_in_dim %27, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %31 = stablehlo.broadcast_in_dim %29, dims = [] : (tensor<f32>) -> tensor<1x197x1xf32>
-    %32 = stablehlo.divide %30, %31 : tensor<1x197x1xf32>
-    %33 = stablehlo.convert %cst_23 : (tensor<1xf64>) -> tensor<1xf32>
-    %34 = stablehlo.reshape %33 : (tensor<1xf32>) -> tensor<f32>
-    %35 = stablehlo.broadcast_in_dim %25, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %36 = stablehlo.broadcast_in_dim %34, dims = [] : (tensor<f32>) -> tensor<1x197x1xf32>
-    %37 = stablehlo.add %35, %36 : tensor<1x197x1xf32>
-    %38 = stablehlo.rsqrt %37 : tensor<1x197x1xf32>
-    %39 = stablehlo.broadcast_in_dim %8, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %40 = stablehlo.broadcast_in_dim %32, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %41 = stablehlo.subtract %39, %40 : tensor<1x197x1024xf32>
-    %42 = stablehlo.broadcast_in_dim %41, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %43 = stablehlo.broadcast_in_dim %38, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %44 = stablehlo.multiply %42, %43 : tensor<1x197x1024xf32>
-    %45 = stablehlo.convert %arg3 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %46 = stablehlo.broadcast_in_dim %44, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %47 = stablehlo.broadcast_in_dim %45, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %48 = stablehlo.multiply %46, %47 : tensor<1x197x1024xf32>
-    %49 = stablehlo.convert %arg4 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %50 = stablehlo.broadcast_in_dim %48, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %51 = stablehlo.broadcast_in_dim %49, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %52 = stablehlo.add %50, %51 : tensor<1x197x1024xf32>
-    %53 = stablehlo.convert %52 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %54 = stablehlo.reshape %53 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %55 = stablehlo.convert %54 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %56 = stablehlo.dot_general %55, %arg150, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %57 = stablehlo.convert %cst_24 : (tensor<1xi64>) -> tensor<1xf32>
-    %58 = stablehlo.reshape %57 : (tensor<1xf32>) -> tensor<f32>
-    %59 = stablehlo.broadcast_in_dim %56, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %60 = stablehlo.broadcast_in_dim %58, dims = [] : (tensor<f32>) -> tensor<197x1024xf32>
-    %61 = stablehlo.multiply %59, %60 : tensor<197x1024xf32>
-    %62 = stablehlo.broadcast_in_dim %61, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %63 = stablehlo.broadcast_in_dim %arg151, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %64 = stablehlo.add %62, %63 : tensor<197x1024xf32>
-    %65 = stablehlo.convert %64 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %66 = stablehlo.reshape %65 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %67 = stablehlo.dot_general %54, %arg152, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %68 = stablehlo.reshape %67 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %69 = stablehlo.reshape %68 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %70 = stablehlo.transpose %69, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %71 = stablehlo.dot_general %55, %arg153, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %72 = stablehlo.broadcast_in_dim %71, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %73 = stablehlo.multiply %72, %60 : tensor<197x1024xf32>
-    %74 = stablehlo.broadcast_in_dim %73, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %75 = stablehlo.broadcast_in_dim %arg154, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %76 = stablehlo.add %74, %75 : tensor<197x1024xf32>
-    %77 = stablehlo.convert %76 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %78 = stablehlo.reshape %77 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %79 = stablehlo.reshape %78 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %80 = stablehlo.transpose %79, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %81 = stablehlo.reshape %66 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %82 = stablehlo.transpose %81, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %83 = stablehlo.transpose %70, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %84 = stablehlo.reshape %82 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %85 = stablehlo.reshape %83 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %86 = stablehlo.broadcast_in_dim %85, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %87 = stablehlo.dot_general %84, %86, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %88 = stablehlo.reshape %87 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %89 = stablehlo.convert %cst_25 : (tensor<1xf64>) -> tensor<1xbf16>
-    %90 = stablehlo.reshape %89 : (tensor<1xbf16>) -> tensor<bf16>
-    %91 = stablehlo.broadcast_in_dim %88, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %92 = stablehlo.broadcast_in_dim %90, dims = [] : (tensor<bf16>) -> tensor<1x16x197x197xbf16>
-    %93 = stablehlo.divide %91, %92 : tensor<1x16x197x197xbf16>
-    %94 = stablehlo.add %93, %arg155 : tensor<1x16x197x197xbf16>
-    %95 = stablehlo.convert %94 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %96 = stablehlo.reduce(%95 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %97 = stablehlo.reshape %96 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %98 = stablehlo.broadcast_in_dim %95, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %99 = stablehlo.broadcast_in_dim %97, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %100 = stablehlo.subtract %98, %99 : tensor<1x16x197x197xf32>
-    %101 = stablehlo.exponential %100 : tensor<1x16x197x197xf32>
-    %102 = stablehlo.reduce(%101 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %103 = stablehlo.reshape %102 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %104 = stablehlo.broadcast_in_dim %101, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %105 = stablehlo.broadcast_in_dim %103, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %106 = stablehlo.divide %104, %105 : tensor<1x16x197x197xf32>
-    %107 = stablehlo.convert %106 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %108 = stablehlo.reshape %107 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %109 = stablehlo.reshape %80 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %110 = stablehlo.broadcast_in_dim %109, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %111 = stablehlo.dot_general %108, %110, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %112 = stablehlo.reshape %111 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %113 = stablehlo.transpose %112, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %114 = stablehlo.reshape %113 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %115 = stablehlo.reshape %114 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %116 = stablehlo.convert %115 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %117 = stablehlo.dot_general %116, %arg156, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %118 = stablehlo.broadcast_in_dim %117, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %119 = stablehlo.multiply %118, %60 : tensor<197x1024xf32>
-    %120 = stablehlo.broadcast_in_dim %119, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %121 = stablehlo.broadcast_in_dim %arg157, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %122 = stablehlo.add %120, %121 : tensor<197x1024xf32>
-    %123 = stablehlo.convert %122 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %124 = stablehlo.reshape %123 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %125 = stablehlo.broadcast_in_dim %arg5, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %126 = stablehlo.broadcast_in_dim %124, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %127 = stablehlo.multiply %125, %126 : tensor<1x197x1024xbf16>
-    %128 = stablehlo.add %127, %7 : tensor<1x197x1024xbf16>
-    %129 = stablehlo.convert %128 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %130 = stablehlo.convert %129 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %131 = stablehlo.reduce(%130 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %132 = stablehlo.reshape %131 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %133 = stablehlo.broadcast_in_dim %132, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %134 = stablehlo.divide %133, %15 : tensor<1x197x1xf64>
-    %135 = stablehlo.broadcast_in_dim %130, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %136 = stablehlo.broadcast_in_dim %134, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %137 = stablehlo.subtract %135, %136 : tensor<1x197x1024xf64>
-    %138 = stablehlo.multiply %137, %137 : tensor<1x197x1024xf64>
-    %139 = stablehlo.reduce(%138 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %140 = stablehlo.reshape %139 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %141 = stablehlo.broadcast_in_dim %140, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %142 = stablehlo.divide %141, %15 : tensor<1x197x1xf64>
-    %143 = stablehlo.convert %142 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %144 = stablehlo.reduce(%129 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %145 = stablehlo.reshape %144 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %146 = stablehlo.broadcast_in_dim %145, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %147 = stablehlo.divide %146, %31 : tensor<1x197x1xf32>
-    %148 = stablehlo.broadcast_in_dim %143, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %149 = stablehlo.add %148, %36 : tensor<1x197x1xf32>
-    %150 = stablehlo.rsqrt %149 : tensor<1x197x1xf32>
-    %151 = stablehlo.broadcast_in_dim %129, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %152 = stablehlo.broadcast_in_dim %147, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %153 = stablehlo.subtract %151, %152 : tensor<1x197x1024xf32>
-    %154 = stablehlo.broadcast_in_dim %153, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %155 = stablehlo.broadcast_in_dim %150, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %156 = stablehlo.multiply %154, %155 : tensor<1x197x1024xf32>
-    %157 = stablehlo.convert %arg6 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %158 = stablehlo.broadcast_in_dim %156, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %159 = stablehlo.broadcast_in_dim %157, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %160 = stablehlo.multiply %158, %159 : tensor<1x197x1024xf32>
-    %161 = stablehlo.convert %arg7 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %162 = stablehlo.broadcast_in_dim %160, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %163 = stablehlo.broadcast_in_dim %161, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %164 = stablehlo.add %162, %163 : tensor<1x197x1024xf32>
-    %165 = stablehlo.convert %164 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %166 = stablehlo.reshape %165 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %167 = stablehlo.convert %166 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %168 = stablehlo.dot_general %167, %arg158, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %169 = stablehlo.broadcast_in_dim %168, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %170 = stablehlo.broadcast_in_dim %58, dims = [] : (tensor<f32>) -> tensor<197x4096xf32>
-    %171 = stablehlo.multiply %169, %170 : tensor<197x4096xf32>
-    %172 = stablehlo.broadcast_in_dim %171, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %173 = stablehlo.broadcast_in_dim %arg159, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %174 = stablehlo.add %172, %173 : tensor<197x4096xf32>
-    %175 = stablehlo.convert %174 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %176 = stablehlo.reshape %175 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %177 = stablehlo.multiply %176, %cst_4 : tensor<1x197x4096xbf16>
-    %178 = stablehlo.rsqrt %cst_3 : tensor<1x197x4096xbf16>
-    %179 = stablehlo.multiply %176, %178 : tensor<1x197x4096xbf16>
-    %180 = stablehlo.convert %179 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %181 = stablehlo.clamp %cst_5, %180, %cst_6 : tensor<1x197x4096xf32>
-    %182 = stablehlo.multiply %181, %181 : tensor<1x197x4096xf32>
-    %183 = stablehlo.multiply %cst_7, %182 : tensor<1x197x4096xf32>
-    %184 = stablehlo.add %183, %cst_8 : tensor<1x197x4096xf32>
-    %185 = stablehlo.multiply %184, %182 : tensor<1x197x4096xf32>
-    %186 = stablehlo.add %185, %cst_9 : tensor<1x197x4096xf32>
-    %187 = stablehlo.multiply %186, %182 : tensor<1x197x4096xf32>
-    %188 = stablehlo.add %187, %cst_10 : tensor<1x197x4096xf32>
-    %189 = stablehlo.multiply %188, %182 : tensor<1x197x4096xf32>
-    %190 = stablehlo.add %189, %cst_11 : tensor<1x197x4096xf32>
-    %191 = stablehlo.multiply %190, %182 : tensor<1x197x4096xf32>
-    %192 = stablehlo.add %191, %cst_12 : tensor<1x197x4096xf32>
-    %193 = stablehlo.multiply %192, %182 : tensor<1x197x4096xf32>
-    %194 = stablehlo.add %193, %cst_13 : tensor<1x197x4096xf32>
-    %195 = stablehlo.multiply %cst_14, %182 : tensor<1x197x4096xf32>
-    %196 = stablehlo.add %195, %cst_15 : tensor<1x197x4096xf32>
-    %197 = stablehlo.multiply %196, %182 : tensor<1x197x4096xf32>
-    %198 = stablehlo.add %197, %cst_16 : tensor<1x197x4096xf32>
-    %199 = stablehlo.multiply %198, %182 : tensor<1x197x4096xf32>
-    %200 = stablehlo.add %199, %cst_17 : tensor<1x197x4096xf32>
-    %201 = stablehlo.multiply %200, %182 : tensor<1x197x4096xf32>
-    %202 = stablehlo.add %201, %cst_18 : tensor<1x197x4096xf32>
-    %203 = stablehlo.multiply %181, %194 : tensor<1x197x4096xf32>
-    %204 = stablehlo.divide %203, %202 : tensor<1x197x4096xf32>
-    %205 = stablehlo.clamp %cst_19, %204, %cst_20 : tensor<1x197x4096xf32>
-    %206 = stablehlo.convert %205 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %207 = stablehlo.add %206, %cst_2 : tensor<1x197x4096xbf16>
-    %208 = stablehlo.multiply %207, %177 : tensor<1x197x4096xbf16>
-    %209 = stablehlo.reshape %208 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %210 = stablehlo.convert %209 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %211 = stablehlo.dot_general %210, %arg160, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %212 = stablehlo.broadcast_in_dim %211, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %213 = stablehlo.multiply %212, %60 : tensor<197x1024xf32>
-    %214 = stablehlo.broadcast_in_dim %213, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %215 = stablehlo.broadcast_in_dim %arg161, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %216 = stablehlo.add %214, %215 : tensor<197x1024xf32>
-    %217 = stablehlo.convert %216 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %218 = stablehlo.reshape %217 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %219 = stablehlo.broadcast_in_dim %arg8, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %220 = stablehlo.broadcast_in_dim %218, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %221 = stablehlo.multiply %219, %220 : tensor<1x197x1024xbf16>
-    %222 = stablehlo.add %221, %128 : tensor<1x197x1024xbf16>
-    %223 = stablehlo.convert %222 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %224 = stablehlo.convert %223 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %225 = stablehlo.reduce(%224 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %226 = stablehlo.reshape %225 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %227 = stablehlo.broadcast_in_dim %226, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %228 = stablehlo.divide %227, %15 : tensor<1x197x1xf64>
-    %229 = stablehlo.broadcast_in_dim %224, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %230 = stablehlo.broadcast_in_dim %228, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %231 = stablehlo.subtract %229, %230 : tensor<1x197x1024xf64>
-    %232 = stablehlo.multiply %231, %231 : tensor<1x197x1024xf64>
-    %233 = stablehlo.reduce(%232 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %234 = stablehlo.reshape %233 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %235 = stablehlo.broadcast_in_dim %234, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %236 = stablehlo.divide %235, %15 : tensor<1x197x1xf64>
-    %237 = stablehlo.convert %236 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %238 = stablehlo.reduce(%223 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %239 = stablehlo.reshape %238 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %240 = stablehlo.broadcast_in_dim %239, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %241 = stablehlo.divide %240, %31 : tensor<1x197x1xf32>
-    %242 = stablehlo.broadcast_in_dim %237, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %243 = stablehlo.add %242, %36 : tensor<1x197x1xf32>
-    %244 = stablehlo.rsqrt %243 : tensor<1x197x1xf32>
-    %245 = stablehlo.broadcast_in_dim %223, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %246 = stablehlo.broadcast_in_dim %241, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %247 = stablehlo.subtract %245, %246 : tensor<1x197x1024xf32>
-    %248 = stablehlo.broadcast_in_dim %247, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %249 = stablehlo.broadcast_in_dim %244, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %250 = stablehlo.multiply %248, %249 : tensor<1x197x1024xf32>
-    %251 = stablehlo.convert %arg9 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %252 = stablehlo.broadcast_in_dim %250, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %253 = stablehlo.broadcast_in_dim %251, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %254 = stablehlo.multiply %252, %253 : tensor<1x197x1024xf32>
-    %255 = stablehlo.convert %arg10 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %256 = stablehlo.broadcast_in_dim %254, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %257 = stablehlo.broadcast_in_dim %255, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %258 = stablehlo.add %256, %257 : tensor<1x197x1024xf32>
-    %259 = stablehlo.convert %258 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %260 = stablehlo.reshape %259 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %261 = stablehlo.convert %260 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %262 = stablehlo.dot_general %261, %arg162, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %263 = stablehlo.broadcast_in_dim %262, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %264 = stablehlo.multiply %263, %60 : tensor<197x1024xf32>
-    %265 = stablehlo.broadcast_in_dim %264, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %266 = stablehlo.broadcast_in_dim %arg163, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %267 = stablehlo.add %265, %266 : tensor<197x1024xf32>
-    %268 = stablehlo.convert %267 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %269 = stablehlo.reshape %268 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %270 = stablehlo.dot_general %260, %arg164, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %271 = stablehlo.reshape %270 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %272 = stablehlo.reshape %271 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %273 = stablehlo.transpose %272, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %274 = stablehlo.dot_general %261, %arg165, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %275 = stablehlo.broadcast_in_dim %274, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %276 = stablehlo.multiply %275, %60 : tensor<197x1024xf32>
-    %277 = stablehlo.broadcast_in_dim %276, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %278 = stablehlo.broadcast_in_dim %arg166, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %279 = stablehlo.add %277, %278 : tensor<197x1024xf32>
-    %280 = stablehlo.convert %279 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %281 = stablehlo.reshape %280 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %282 = stablehlo.reshape %281 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %283 = stablehlo.transpose %282, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %284 = stablehlo.reshape %269 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %285 = stablehlo.transpose %284, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %286 = stablehlo.transpose %273, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %287 = stablehlo.reshape %285 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %288 = stablehlo.reshape %286 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %289 = stablehlo.broadcast_in_dim %288, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %290 = stablehlo.dot_general %287, %289, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %291 = stablehlo.reshape %290 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %292 = stablehlo.broadcast_in_dim %291, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %293 = stablehlo.divide %292, %92 : tensor<1x16x197x197xbf16>
-    %294 = stablehlo.add %293, %arg167 : tensor<1x16x197x197xbf16>
-    %295 = stablehlo.convert %294 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %296 = stablehlo.reduce(%295 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %297 = stablehlo.reshape %296 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %298 = stablehlo.broadcast_in_dim %295, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %299 = stablehlo.broadcast_in_dim %297, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %300 = stablehlo.subtract %298, %299 : tensor<1x16x197x197xf32>
-    %301 = stablehlo.exponential %300 : tensor<1x16x197x197xf32>
-    %302 = stablehlo.reduce(%301 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %303 = stablehlo.reshape %302 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %304 = stablehlo.broadcast_in_dim %301, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %305 = stablehlo.broadcast_in_dim %303, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %306 = stablehlo.divide %304, %305 : tensor<1x16x197x197xf32>
-    %307 = stablehlo.convert %306 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %308 = stablehlo.reshape %307 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %309 = stablehlo.reshape %283 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %310 = stablehlo.broadcast_in_dim %309, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %311 = stablehlo.dot_general %308, %310, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %312 = stablehlo.reshape %311 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %313 = stablehlo.transpose %312, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %314 = stablehlo.reshape %313 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %315 = stablehlo.reshape %314 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %316 = stablehlo.convert %315 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %317 = stablehlo.dot_general %316, %arg168, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %318 = stablehlo.broadcast_in_dim %317, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %319 = stablehlo.multiply %318, %60 : tensor<197x1024xf32>
-    %320 = stablehlo.broadcast_in_dim %319, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %321 = stablehlo.broadcast_in_dim %arg169, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %322 = stablehlo.add %320, %321 : tensor<197x1024xf32>
-    %323 = stablehlo.convert %322 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %324 = stablehlo.reshape %323 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %325 = stablehlo.broadcast_in_dim %arg11, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %326 = stablehlo.broadcast_in_dim %324, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %327 = stablehlo.multiply %325, %326 : tensor<1x197x1024xbf16>
-    %328 = stablehlo.add %327, %222 : tensor<1x197x1024xbf16>
-    %329 = stablehlo.convert %328 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %330 = stablehlo.convert %329 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %331 = stablehlo.reduce(%330 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %332 = stablehlo.reshape %331 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %333 = stablehlo.broadcast_in_dim %332, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %334 = stablehlo.divide %333, %15 : tensor<1x197x1xf64>
-    %335 = stablehlo.broadcast_in_dim %330, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %336 = stablehlo.broadcast_in_dim %334, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %337 = stablehlo.subtract %335, %336 : tensor<1x197x1024xf64>
-    %338 = stablehlo.multiply %337, %337 : tensor<1x197x1024xf64>
-    %339 = stablehlo.reduce(%338 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %340 = stablehlo.reshape %339 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %341 = stablehlo.broadcast_in_dim %340, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %342 = stablehlo.divide %341, %15 : tensor<1x197x1xf64>
-    %343 = stablehlo.convert %342 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %344 = stablehlo.reduce(%329 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %345 = stablehlo.reshape %344 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %346 = stablehlo.broadcast_in_dim %345, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %347 = stablehlo.divide %346, %31 : tensor<1x197x1xf32>
-    %348 = stablehlo.broadcast_in_dim %343, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %349 = stablehlo.add %348, %36 : tensor<1x197x1xf32>
-    %350 = stablehlo.rsqrt %349 : tensor<1x197x1xf32>
-    %351 = stablehlo.broadcast_in_dim %329, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %352 = stablehlo.broadcast_in_dim %347, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %353 = stablehlo.subtract %351, %352 : tensor<1x197x1024xf32>
-    %354 = stablehlo.broadcast_in_dim %353, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %355 = stablehlo.broadcast_in_dim %350, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %356 = stablehlo.multiply %354, %355 : tensor<1x197x1024xf32>
-    %357 = stablehlo.convert %arg12 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %358 = stablehlo.broadcast_in_dim %356, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %359 = stablehlo.broadcast_in_dim %357, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %360 = stablehlo.multiply %358, %359 : tensor<1x197x1024xf32>
-    %361 = stablehlo.convert %arg13 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %362 = stablehlo.broadcast_in_dim %360, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %363 = stablehlo.broadcast_in_dim %361, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %364 = stablehlo.add %362, %363 : tensor<1x197x1024xf32>
-    %365 = stablehlo.convert %364 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %366 = stablehlo.reshape %365 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %367 = stablehlo.convert %366 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %368 = stablehlo.dot_general %367, %arg170, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %369 = stablehlo.broadcast_in_dim %368, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %370 = stablehlo.multiply %369, %170 : tensor<197x4096xf32>
-    %371 = stablehlo.broadcast_in_dim %370, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %372 = stablehlo.broadcast_in_dim %arg171, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %373 = stablehlo.add %371, %372 : tensor<197x4096xf32>
-    %374 = stablehlo.convert %373 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %375 = stablehlo.reshape %374 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %376 = stablehlo.multiply %375, %cst_4 : tensor<1x197x4096xbf16>
-    %377 = stablehlo.multiply %375, %178 : tensor<1x197x4096xbf16>
-    %378 = stablehlo.convert %377 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %379 = stablehlo.clamp %cst_5, %378, %cst_6 : tensor<1x197x4096xf32>
-    %380 = stablehlo.multiply %379, %379 : tensor<1x197x4096xf32>
-    %381 = stablehlo.multiply %cst_7, %380 : tensor<1x197x4096xf32>
-    %382 = stablehlo.add %381, %cst_8 : tensor<1x197x4096xf32>
-    %383 = stablehlo.multiply %382, %380 : tensor<1x197x4096xf32>
-    %384 = stablehlo.add %383, %cst_9 : tensor<1x197x4096xf32>
-    %385 = stablehlo.multiply %384, %380 : tensor<1x197x4096xf32>
-    %386 = stablehlo.add %385, %cst_10 : tensor<1x197x4096xf32>
-    %387 = stablehlo.multiply %386, %380 : tensor<1x197x4096xf32>
-    %388 = stablehlo.add %387, %cst_11 : tensor<1x197x4096xf32>
-    %389 = stablehlo.multiply %388, %380 : tensor<1x197x4096xf32>
-    %390 = stablehlo.add %389, %cst_12 : tensor<1x197x4096xf32>
-    %391 = stablehlo.multiply %390, %380 : tensor<1x197x4096xf32>
-    %392 = stablehlo.add %391, %cst_13 : tensor<1x197x4096xf32>
-    %393 = stablehlo.multiply %cst_14, %380 : tensor<1x197x4096xf32>
-    %394 = stablehlo.add %393, %cst_15 : tensor<1x197x4096xf32>
-    %395 = stablehlo.multiply %394, %380 : tensor<1x197x4096xf32>
-    %396 = stablehlo.add %395, %cst_16 : tensor<1x197x4096xf32>
-    %397 = stablehlo.multiply %396, %380 : tensor<1x197x4096xf32>
-    %398 = stablehlo.add %397, %cst_17 : tensor<1x197x4096xf32>
-    %399 = stablehlo.multiply %398, %380 : tensor<1x197x4096xf32>
-    %400 = stablehlo.add %399, %cst_18 : tensor<1x197x4096xf32>
-    %401 = stablehlo.multiply %379, %392 : tensor<1x197x4096xf32>
-    %402 = stablehlo.divide %401, %400 : tensor<1x197x4096xf32>
-    %403 = stablehlo.clamp %cst_19, %402, %cst_20 : tensor<1x197x4096xf32>
-    %404 = stablehlo.convert %403 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %405 = stablehlo.add %404, %cst_2 : tensor<1x197x4096xbf16>
-    %406 = stablehlo.multiply %405, %376 : tensor<1x197x4096xbf16>
-    %407 = stablehlo.reshape %406 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %408 = stablehlo.convert %407 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %409 = stablehlo.dot_general %408, %arg172, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %410 = stablehlo.broadcast_in_dim %409, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %411 = stablehlo.multiply %410, %60 : tensor<197x1024xf32>
-    %412 = stablehlo.broadcast_in_dim %411, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %413 = stablehlo.broadcast_in_dim %arg173, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %414 = stablehlo.add %412, %413 : tensor<197x1024xf32>
-    %415 = stablehlo.convert %414 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %416 = stablehlo.reshape %415 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %417 = stablehlo.broadcast_in_dim %arg14, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %418 = stablehlo.broadcast_in_dim %416, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %419 = stablehlo.multiply %417, %418 : tensor<1x197x1024xbf16>
-    %420 = stablehlo.add %419, %328 : tensor<1x197x1024xbf16>
-    %421 = stablehlo.convert %420 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %422 = stablehlo.convert %421 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %423 = stablehlo.reduce(%422 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %424 = stablehlo.reshape %423 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %425 = stablehlo.broadcast_in_dim %424, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %426 = stablehlo.divide %425, %15 : tensor<1x197x1xf64>
-    %427 = stablehlo.broadcast_in_dim %422, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %428 = stablehlo.broadcast_in_dim %426, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %429 = stablehlo.subtract %427, %428 : tensor<1x197x1024xf64>
-    %430 = stablehlo.multiply %429, %429 : tensor<1x197x1024xf64>
-    %431 = stablehlo.reduce(%430 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %432 = stablehlo.reshape %431 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %433 = stablehlo.broadcast_in_dim %432, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %434 = stablehlo.divide %433, %15 : tensor<1x197x1xf64>
-    %435 = stablehlo.convert %434 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %436 = stablehlo.reduce(%421 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %437 = stablehlo.reshape %436 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %438 = stablehlo.broadcast_in_dim %437, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %439 = stablehlo.divide %438, %31 : tensor<1x197x1xf32>
-    %440 = stablehlo.broadcast_in_dim %435, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %441 = stablehlo.add %440, %36 : tensor<1x197x1xf32>
-    %442 = stablehlo.rsqrt %441 : tensor<1x197x1xf32>
-    %443 = stablehlo.broadcast_in_dim %421, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %444 = stablehlo.broadcast_in_dim %439, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %445 = stablehlo.subtract %443, %444 : tensor<1x197x1024xf32>
-    %446 = stablehlo.broadcast_in_dim %445, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %447 = stablehlo.broadcast_in_dim %442, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %448 = stablehlo.multiply %446, %447 : tensor<1x197x1024xf32>
-    %449 = stablehlo.convert %arg15 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %450 = stablehlo.broadcast_in_dim %448, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %451 = stablehlo.broadcast_in_dim %449, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %452 = stablehlo.multiply %450, %451 : tensor<1x197x1024xf32>
-    %453 = stablehlo.convert %arg16 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %454 = stablehlo.broadcast_in_dim %452, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %455 = stablehlo.broadcast_in_dim %453, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %456 = stablehlo.add %454, %455 : tensor<1x197x1024xf32>
-    %457 = stablehlo.convert %456 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %458 = stablehlo.reshape %457 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %459 = stablehlo.convert %458 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %460 = stablehlo.dot_general %459, %arg174, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %461 = stablehlo.broadcast_in_dim %460, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %462 = stablehlo.multiply %461, %60 : tensor<197x1024xf32>
-    %463 = stablehlo.broadcast_in_dim %462, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %464 = stablehlo.broadcast_in_dim %arg175, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %465 = stablehlo.add %463, %464 : tensor<197x1024xf32>
-    %466 = stablehlo.convert %465 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %467 = stablehlo.reshape %466 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %468 = stablehlo.dot_general %458, %arg176, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %469 = stablehlo.reshape %468 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %470 = stablehlo.reshape %469 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %471 = stablehlo.transpose %470, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %472 = stablehlo.dot_general %459, %arg177, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %473 = stablehlo.broadcast_in_dim %472, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %474 = stablehlo.multiply %473, %60 : tensor<197x1024xf32>
-    %475 = stablehlo.broadcast_in_dim %474, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %476 = stablehlo.broadcast_in_dim %arg178, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %477 = stablehlo.add %475, %476 : tensor<197x1024xf32>
-    %478 = stablehlo.convert %477 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %479 = stablehlo.reshape %478 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %480 = stablehlo.reshape %479 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %481 = stablehlo.transpose %480, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %482 = stablehlo.reshape %467 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %483 = stablehlo.transpose %482, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %484 = stablehlo.transpose %471, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %485 = stablehlo.reshape %483 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %486 = stablehlo.reshape %484 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %487 = stablehlo.broadcast_in_dim %486, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %488 = stablehlo.dot_general %485, %487, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %489 = stablehlo.reshape %488 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %490 = stablehlo.broadcast_in_dim %489, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %491 = stablehlo.divide %490, %92 : tensor<1x16x197x197xbf16>
-    %492 = stablehlo.add %491, %arg179 : tensor<1x16x197x197xbf16>
-    %493 = stablehlo.convert %492 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %494 = stablehlo.reduce(%493 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %495 = stablehlo.reshape %494 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %496 = stablehlo.broadcast_in_dim %493, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %497 = stablehlo.broadcast_in_dim %495, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %498 = stablehlo.subtract %496, %497 : tensor<1x16x197x197xf32>
-    %499 = stablehlo.exponential %498 : tensor<1x16x197x197xf32>
-    %500 = stablehlo.reduce(%499 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %501 = stablehlo.reshape %500 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %502 = stablehlo.broadcast_in_dim %499, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %503 = stablehlo.broadcast_in_dim %501, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %504 = stablehlo.divide %502, %503 : tensor<1x16x197x197xf32>
-    %505 = stablehlo.convert %504 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %506 = stablehlo.reshape %505 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %507 = stablehlo.reshape %481 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %508 = stablehlo.broadcast_in_dim %507, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %509 = stablehlo.dot_general %506, %508, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %510 = stablehlo.reshape %509 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %511 = stablehlo.transpose %510, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %512 = stablehlo.reshape %511 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %513 = stablehlo.reshape %512 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %514 = stablehlo.convert %513 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %515 = stablehlo.dot_general %514, %arg180, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %516 = stablehlo.broadcast_in_dim %515, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %517 = stablehlo.multiply %516, %60 : tensor<197x1024xf32>
-    %518 = stablehlo.broadcast_in_dim %517, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %519 = stablehlo.broadcast_in_dim %arg181, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %520 = stablehlo.add %518, %519 : tensor<197x1024xf32>
-    %521 = stablehlo.convert %520 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %522 = stablehlo.reshape %521 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %523 = stablehlo.broadcast_in_dim %arg17, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %524 = stablehlo.broadcast_in_dim %522, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %525 = stablehlo.multiply %523, %524 : tensor<1x197x1024xbf16>
-    %526 = stablehlo.add %525, %420 : tensor<1x197x1024xbf16>
-    %527 = stablehlo.convert %526 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %528 = stablehlo.convert %527 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %529 = stablehlo.reduce(%528 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %530 = stablehlo.reshape %529 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %531 = stablehlo.broadcast_in_dim %530, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %532 = stablehlo.divide %531, %15 : tensor<1x197x1xf64>
-    %533 = stablehlo.broadcast_in_dim %528, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %534 = stablehlo.broadcast_in_dim %532, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %535 = stablehlo.subtract %533, %534 : tensor<1x197x1024xf64>
-    %536 = stablehlo.multiply %535, %535 : tensor<1x197x1024xf64>
-    %537 = stablehlo.reduce(%536 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %538 = stablehlo.reshape %537 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %539 = stablehlo.broadcast_in_dim %538, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %540 = stablehlo.divide %539, %15 : tensor<1x197x1xf64>
-    %541 = stablehlo.convert %540 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %542 = stablehlo.reduce(%527 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %543 = stablehlo.reshape %542 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %544 = stablehlo.broadcast_in_dim %543, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %545 = stablehlo.divide %544, %31 : tensor<1x197x1xf32>
-    %546 = stablehlo.broadcast_in_dim %541, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %547 = stablehlo.add %546, %36 : tensor<1x197x1xf32>
-    %548 = stablehlo.rsqrt %547 : tensor<1x197x1xf32>
-    %549 = stablehlo.broadcast_in_dim %527, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %550 = stablehlo.broadcast_in_dim %545, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %551 = stablehlo.subtract %549, %550 : tensor<1x197x1024xf32>
-    %552 = stablehlo.broadcast_in_dim %551, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %553 = stablehlo.broadcast_in_dim %548, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %554 = stablehlo.multiply %552, %553 : tensor<1x197x1024xf32>
-    %555 = stablehlo.convert %arg18 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %556 = stablehlo.broadcast_in_dim %554, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %557 = stablehlo.broadcast_in_dim %555, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %558 = stablehlo.multiply %556, %557 : tensor<1x197x1024xf32>
-    %559 = stablehlo.convert %arg19 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %560 = stablehlo.broadcast_in_dim %558, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %561 = stablehlo.broadcast_in_dim %559, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %562 = stablehlo.add %560, %561 : tensor<1x197x1024xf32>
-    %563 = stablehlo.convert %562 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %564 = stablehlo.reshape %563 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %565 = stablehlo.convert %564 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %566 = stablehlo.dot_general %565, %arg182, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %567 = stablehlo.broadcast_in_dim %566, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %568 = stablehlo.multiply %567, %170 : tensor<197x4096xf32>
-    %569 = stablehlo.broadcast_in_dim %568, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %570 = stablehlo.broadcast_in_dim %arg183, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %571 = stablehlo.add %569, %570 : tensor<197x4096xf32>
-    %572 = stablehlo.convert %571 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %573 = stablehlo.reshape %572 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %574 = stablehlo.multiply %573, %cst_4 : tensor<1x197x4096xbf16>
-    %575 = stablehlo.multiply %573, %178 : tensor<1x197x4096xbf16>
-    %576 = stablehlo.convert %575 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %577 = stablehlo.clamp %cst_5, %576, %cst_6 : tensor<1x197x4096xf32>
-    %578 = stablehlo.multiply %577, %577 : tensor<1x197x4096xf32>
-    %579 = stablehlo.multiply %cst_7, %578 : tensor<1x197x4096xf32>
-    %580 = stablehlo.add %579, %cst_8 : tensor<1x197x4096xf32>
-    %581 = stablehlo.multiply %580, %578 : tensor<1x197x4096xf32>
-    %582 = stablehlo.add %581, %cst_9 : tensor<1x197x4096xf32>
-    %583 = stablehlo.multiply %582, %578 : tensor<1x197x4096xf32>
-    %584 = stablehlo.add %583, %cst_10 : tensor<1x197x4096xf32>
-    %585 = stablehlo.multiply %584, %578 : tensor<1x197x4096xf32>
-    %586 = stablehlo.add %585, %cst_11 : tensor<1x197x4096xf32>
-    %587 = stablehlo.multiply %586, %578 : tensor<1x197x4096xf32>
-    %588 = stablehlo.add %587, %cst_12 : tensor<1x197x4096xf32>
-    %589 = stablehlo.multiply %588, %578 : tensor<1x197x4096xf32>
-    %590 = stablehlo.add %589, %cst_13 : tensor<1x197x4096xf32>
-    %591 = stablehlo.multiply %cst_14, %578 : tensor<1x197x4096xf32>
-    %592 = stablehlo.add %591, %cst_15 : tensor<1x197x4096xf32>
-    %593 = stablehlo.multiply %592, %578 : tensor<1x197x4096xf32>
-    %594 = stablehlo.add %593, %cst_16 : tensor<1x197x4096xf32>
-    %595 = stablehlo.multiply %594, %578 : tensor<1x197x4096xf32>
-    %596 = stablehlo.add %595, %cst_17 : tensor<1x197x4096xf32>
-    %597 = stablehlo.multiply %596, %578 : tensor<1x197x4096xf32>
-    %598 = stablehlo.add %597, %cst_18 : tensor<1x197x4096xf32>
-    %599 = stablehlo.multiply %577, %590 : tensor<1x197x4096xf32>
-    %600 = stablehlo.divide %599, %598 : tensor<1x197x4096xf32>
-    %601 = stablehlo.clamp %cst_19, %600, %cst_20 : tensor<1x197x4096xf32>
-    %602 = stablehlo.convert %601 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %603 = stablehlo.add %602, %cst_2 : tensor<1x197x4096xbf16>
-    %604 = stablehlo.multiply %603, %574 : tensor<1x197x4096xbf16>
-    %605 = stablehlo.reshape %604 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %606 = stablehlo.convert %605 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %607 = stablehlo.dot_general %606, %arg184, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %608 = stablehlo.broadcast_in_dim %607, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %609 = stablehlo.multiply %608, %60 : tensor<197x1024xf32>
-    %610 = stablehlo.broadcast_in_dim %609, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %611 = stablehlo.broadcast_in_dim %arg185, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %612 = stablehlo.add %610, %611 : tensor<197x1024xf32>
-    %613 = stablehlo.convert %612 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %614 = stablehlo.reshape %613 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %615 = stablehlo.broadcast_in_dim %arg20, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %616 = stablehlo.broadcast_in_dim %614, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %617 = stablehlo.multiply %615, %616 : tensor<1x197x1024xbf16>
-    %618 = stablehlo.add %617, %526 : tensor<1x197x1024xbf16>
-    %619 = stablehlo.convert %618 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %620 = stablehlo.convert %619 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %621 = stablehlo.reduce(%620 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %622 = stablehlo.reshape %621 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %623 = stablehlo.broadcast_in_dim %622, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %624 = stablehlo.divide %623, %15 : tensor<1x197x1xf64>
-    %625 = stablehlo.broadcast_in_dim %620, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %626 = stablehlo.broadcast_in_dim %624, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %627 = stablehlo.subtract %625, %626 : tensor<1x197x1024xf64>
-    %628 = stablehlo.multiply %627, %627 : tensor<1x197x1024xf64>
-    %629 = stablehlo.reduce(%628 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %630 = stablehlo.reshape %629 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %631 = stablehlo.broadcast_in_dim %630, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %632 = stablehlo.divide %631, %15 : tensor<1x197x1xf64>
-    %633 = stablehlo.convert %632 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %634 = stablehlo.reduce(%619 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %635 = stablehlo.reshape %634 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %636 = stablehlo.broadcast_in_dim %635, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %637 = stablehlo.divide %636, %31 : tensor<1x197x1xf32>
-    %638 = stablehlo.broadcast_in_dim %633, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %639 = stablehlo.add %638, %36 : tensor<1x197x1xf32>
-    %640 = stablehlo.rsqrt %639 : tensor<1x197x1xf32>
-    %641 = stablehlo.broadcast_in_dim %619, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %642 = stablehlo.broadcast_in_dim %637, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %643 = stablehlo.subtract %641, %642 : tensor<1x197x1024xf32>
-    %644 = stablehlo.broadcast_in_dim %643, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %645 = stablehlo.broadcast_in_dim %640, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %646 = stablehlo.multiply %644, %645 : tensor<1x197x1024xf32>
-    %647 = stablehlo.convert %arg21 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %648 = stablehlo.broadcast_in_dim %646, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %649 = stablehlo.broadcast_in_dim %647, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %650 = stablehlo.multiply %648, %649 : tensor<1x197x1024xf32>
-    %651 = stablehlo.convert %arg22 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %652 = stablehlo.broadcast_in_dim %650, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %653 = stablehlo.broadcast_in_dim %651, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %654 = stablehlo.add %652, %653 : tensor<1x197x1024xf32>
-    %655 = stablehlo.convert %654 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %656 = stablehlo.reshape %655 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %657 = stablehlo.convert %656 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %658 = stablehlo.dot_general %657, %arg186, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %659 = stablehlo.broadcast_in_dim %658, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %660 = stablehlo.multiply %659, %60 : tensor<197x1024xf32>
-    %661 = stablehlo.broadcast_in_dim %660, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %662 = stablehlo.broadcast_in_dim %arg187, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %663 = stablehlo.add %661, %662 : tensor<197x1024xf32>
-    %664 = stablehlo.convert %663 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %665 = stablehlo.reshape %664 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %666 = stablehlo.dot_general %656, %arg188, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %667 = stablehlo.reshape %666 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %668 = stablehlo.reshape %667 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %669 = stablehlo.transpose %668, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %670 = stablehlo.dot_general %657, %arg189, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %671 = stablehlo.broadcast_in_dim %670, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %672 = stablehlo.multiply %671, %60 : tensor<197x1024xf32>
-    %673 = stablehlo.broadcast_in_dim %672, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %674 = stablehlo.broadcast_in_dim %arg190, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %675 = stablehlo.add %673, %674 : tensor<197x1024xf32>
-    %676 = stablehlo.convert %675 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %677 = stablehlo.reshape %676 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %678 = stablehlo.reshape %677 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %679 = stablehlo.transpose %678, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %680 = stablehlo.reshape %665 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %681 = stablehlo.transpose %680, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %682 = stablehlo.transpose %669, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %683 = stablehlo.reshape %681 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %684 = stablehlo.reshape %682 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %685 = stablehlo.broadcast_in_dim %684, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %686 = stablehlo.dot_general %683, %685, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %687 = stablehlo.reshape %686 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %688 = stablehlo.broadcast_in_dim %687, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %689 = stablehlo.divide %688, %92 : tensor<1x16x197x197xbf16>
-    %690 = stablehlo.add %689, %arg191 : tensor<1x16x197x197xbf16>
-    %691 = stablehlo.convert %690 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %692 = stablehlo.reduce(%691 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %693 = stablehlo.reshape %692 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %694 = stablehlo.broadcast_in_dim %691, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %695 = stablehlo.broadcast_in_dim %693, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %696 = stablehlo.subtract %694, %695 : tensor<1x16x197x197xf32>
-    %697 = stablehlo.exponential %696 : tensor<1x16x197x197xf32>
-    %698 = stablehlo.reduce(%697 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %699 = stablehlo.reshape %698 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %700 = stablehlo.broadcast_in_dim %697, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %701 = stablehlo.broadcast_in_dim %699, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %702 = stablehlo.divide %700, %701 : tensor<1x16x197x197xf32>
-    %703 = stablehlo.convert %702 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %704 = stablehlo.reshape %703 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %705 = stablehlo.reshape %679 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %706 = stablehlo.broadcast_in_dim %705, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %707 = stablehlo.dot_general %704, %706, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %708 = stablehlo.reshape %707 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %709 = stablehlo.transpose %708, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %710 = stablehlo.reshape %709 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %711 = stablehlo.reshape %710 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %712 = stablehlo.convert %711 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %713 = stablehlo.dot_general %712, %arg192, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %714 = stablehlo.broadcast_in_dim %713, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %715 = stablehlo.multiply %714, %60 : tensor<197x1024xf32>
-    %716 = stablehlo.broadcast_in_dim %715, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %717 = stablehlo.broadcast_in_dim %arg193, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %718 = stablehlo.add %716, %717 : tensor<197x1024xf32>
-    %719 = stablehlo.convert %718 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %720 = stablehlo.reshape %719 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %721 = stablehlo.broadcast_in_dim %arg23, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %722 = stablehlo.broadcast_in_dim %720, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %723 = stablehlo.multiply %721, %722 : tensor<1x197x1024xbf16>
-    %724 = stablehlo.add %723, %618 : tensor<1x197x1024xbf16>
-    %725 = stablehlo.convert %724 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %726 = stablehlo.convert %725 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %727 = stablehlo.reduce(%726 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %728 = stablehlo.reshape %727 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %729 = stablehlo.broadcast_in_dim %728, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %730 = stablehlo.divide %729, %15 : tensor<1x197x1xf64>
-    %731 = stablehlo.broadcast_in_dim %726, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %732 = stablehlo.broadcast_in_dim %730, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %733 = stablehlo.subtract %731, %732 : tensor<1x197x1024xf64>
-    %734 = stablehlo.multiply %733, %733 : tensor<1x197x1024xf64>
-    %735 = stablehlo.reduce(%734 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %736 = stablehlo.reshape %735 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %737 = stablehlo.broadcast_in_dim %736, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %738 = stablehlo.divide %737, %15 : tensor<1x197x1xf64>
-    %739 = stablehlo.convert %738 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %740 = stablehlo.reduce(%725 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %741 = stablehlo.reshape %740 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %742 = stablehlo.broadcast_in_dim %741, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %743 = stablehlo.divide %742, %31 : tensor<1x197x1xf32>
-    %744 = stablehlo.broadcast_in_dim %739, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %745 = stablehlo.add %744, %36 : tensor<1x197x1xf32>
-    %746 = stablehlo.rsqrt %745 : tensor<1x197x1xf32>
-    %747 = stablehlo.broadcast_in_dim %725, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %748 = stablehlo.broadcast_in_dim %743, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %749 = stablehlo.subtract %747, %748 : tensor<1x197x1024xf32>
-    %750 = stablehlo.broadcast_in_dim %749, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %751 = stablehlo.broadcast_in_dim %746, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %752 = stablehlo.multiply %750, %751 : tensor<1x197x1024xf32>
-    %753 = stablehlo.convert %arg24 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %754 = stablehlo.broadcast_in_dim %752, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %755 = stablehlo.broadcast_in_dim %753, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %756 = stablehlo.multiply %754, %755 : tensor<1x197x1024xf32>
-    %757 = stablehlo.convert %arg25 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %758 = stablehlo.broadcast_in_dim %756, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %759 = stablehlo.broadcast_in_dim %757, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %760 = stablehlo.add %758, %759 : tensor<1x197x1024xf32>
-    %761 = stablehlo.convert %760 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %762 = stablehlo.reshape %761 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %763 = stablehlo.convert %762 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %764 = stablehlo.dot_general %763, %arg194, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %765 = stablehlo.broadcast_in_dim %764, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %766 = stablehlo.multiply %765, %170 : tensor<197x4096xf32>
-    %767 = stablehlo.broadcast_in_dim %766, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %768 = stablehlo.broadcast_in_dim %arg195, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %769 = stablehlo.add %767, %768 : tensor<197x4096xf32>
-    %770 = stablehlo.convert %769 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %771 = stablehlo.reshape %770 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %772 = stablehlo.multiply %771, %cst_4 : tensor<1x197x4096xbf16>
-    %773 = stablehlo.multiply %771, %178 : tensor<1x197x4096xbf16>
-    %774 = stablehlo.convert %773 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %775 = stablehlo.clamp %cst_5, %774, %cst_6 : tensor<1x197x4096xf32>
-    %776 = stablehlo.multiply %775, %775 : tensor<1x197x4096xf32>
-    %777 = stablehlo.multiply %cst_7, %776 : tensor<1x197x4096xf32>
-    %778 = stablehlo.add %777, %cst_8 : tensor<1x197x4096xf32>
-    %779 = stablehlo.multiply %778, %776 : tensor<1x197x4096xf32>
-    %780 = stablehlo.add %779, %cst_9 : tensor<1x197x4096xf32>
-    %781 = stablehlo.multiply %780, %776 : tensor<1x197x4096xf32>
-    %782 = stablehlo.add %781, %cst_10 : tensor<1x197x4096xf32>
-    %783 = stablehlo.multiply %782, %776 : tensor<1x197x4096xf32>
-    %784 = stablehlo.add %783, %cst_11 : tensor<1x197x4096xf32>
-    %785 = stablehlo.multiply %784, %776 : tensor<1x197x4096xf32>
-    %786 = stablehlo.add %785, %cst_12 : tensor<1x197x4096xf32>
-    %787 = stablehlo.multiply %786, %776 : tensor<1x197x4096xf32>
-    %788 = stablehlo.add %787, %cst_13 : tensor<1x197x4096xf32>
-    %789 = stablehlo.multiply %cst_14, %776 : tensor<1x197x4096xf32>
-    %790 = stablehlo.add %789, %cst_15 : tensor<1x197x4096xf32>
-    %791 = stablehlo.multiply %790, %776 : tensor<1x197x4096xf32>
-    %792 = stablehlo.add %791, %cst_16 : tensor<1x197x4096xf32>
-    %793 = stablehlo.multiply %792, %776 : tensor<1x197x4096xf32>
-    %794 = stablehlo.add %793, %cst_17 : tensor<1x197x4096xf32>
-    %795 = stablehlo.multiply %794, %776 : tensor<1x197x4096xf32>
-    %796 = stablehlo.add %795, %cst_18 : tensor<1x197x4096xf32>
-    %797 = stablehlo.multiply %775, %788 : tensor<1x197x4096xf32>
-    %798 = stablehlo.divide %797, %796 : tensor<1x197x4096xf32>
-    %799 = stablehlo.clamp %cst_19, %798, %cst_20 : tensor<1x197x4096xf32>
-    %800 = stablehlo.convert %799 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %801 = stablehlo.add %800, %cst_2 : tensor<1x197x4096xbf16>
-    %802 = stablehlo.multiply %801, %772 : tensor<1x197x4096xbf16>
-    %803 = stablehlo.reshape %802 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %804 = stablehlo.convert %803 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %805 = stablehlo.dot_general %804, %arg196, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %806 = stablehlo.broadcast_in_dim %805, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %807 = stablehlo.multiply %806, %60 : tensor<197x1024xf32>
-    %808 = stablehlo.broadcast_in_dim %807, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %809 = stablehlo.broadcast_in_dim %arg197, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %810 = stablehlo.add %808, %809 : tensor<197x1024xf32>
-    %811 = stablehlo.convert %810 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %812 = stablehlo.reshape %811 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %813 = stablehlo.broadcast_in_dim %arg26, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %814 = stablehlo.broadcast_in_dim %812, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %815 = stablehlo.multiply %813, %814 : tensor<1x197x1024xbf16>
-    %816 = stablehlo.add %815, %724 : tensor<1x197x1024xbf16>
-    %817 = stablehlo.convert %816 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %818 = stablehlo.convert %817 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %819 = stablehlo.reduce(%818 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %820 = stablehlo.reshape %819 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %821 = stablehlo.broadcast_in_dim %820, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %822 = stablehlo.divide %821, %15 : tensor<1x197x1xf64>
-    %823 = stablehlo.broadcast_in_dim %818, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %824 = stablehlo.broadcast_in_dim %822, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %825 = stablehlo.subtract %823, %824 : tensor<1x197x1024xf64>
-    %826 = stablehlo.multiply %825, %825 : tensor<1x197x1024xf64>
-    %827 = stablehlo.reduce(%826 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %828 = stablehlo.reshape %827 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %829 = stablehlo.broadcast_in_dim %828, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %830 = stablehlo.divide %829, %15 : tensor<1x197x1xf64>
-    %831 = stablehlo.convert %830 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %832 = stablehlo.reduce(%817 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %833 = stablehlo.reshape %832 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %834 = stablehlo.broadcast_in_dim %833, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %835 = stablehlo.divide %834, %31 : tensor<1x197x1xf32>
-    %836 = stablehlo.broadcast_in_dim %831, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %837 = stablehlo.add %836, %36 : tensor<1x197x1xf32>
-    %838 = stablehlo.rsqrt %837 : tensor<1x197x1xf32>
-    %839 = stablehlo.broadcast_in_dim %817, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %840 = stablehlo.broadcast_in_dim %835, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %841 = stablehlo.subtract %839, %840 : tensor<1x197x1024xf32>
-    %842 = stablehlo.broadcast_in_dim %841, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %843 = stablehlo.broadcast_in_dim %838, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %844 = stablehlo.multiply %842, %843 : tensor<1x197x1024xf32>
-    %845 = stablehlo.convert %arg27 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %846 = stablehlo.broadcast_in_dim %844, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %847 = stablehlo.broadcast_in_dim %845, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %848 = stablehlo.multiply %846, %847 : tensor<1x197x1024xf32>
-    %849 = stablehlo.convert %arg28 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %850 = stablehlo.broadcast_in_dim %848, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %851 = stablehlo.broadcast_in_dim %849, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %852 = stablehlo.add %850, %851 : tensor<1x197x1024xf32>
-    %853 = stablehlo.convert %852 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %854 = stablehlo.reshape %853 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %855 = stablehlo.convert %854 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %856 = stablehlo.dot_general %855, %arg198, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %857 = stablehlo.broadcast_in_dim %856, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %858 = stablehlo.multiply %857, %60 : tensor<197x1024xf32>
-    %859 = stablehlo.broadcast_in_dim %858, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %860 = stablehlo.broadcast_in_dim %arg199, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %861 = stablehlo.add %859, %860 : tensor<197x1024xf32>
-    %862 = stablehlo.convert %861 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %863 = stablehlo.reshape %862 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %864 = stablehlo.dot_general %854, %arg200, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %865 = stablehlo.reshape %864 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %866 = stablehlo.reshape %865 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %867 = stablehlo.transpose %866, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %868 = stablehlo.dot_general %855, %arg201, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %869 = stablehlo.broadcast_in_dim %868, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %870 = stablehlo.multiply %869, %60 : tensor<197x1024xf32>
-    %871 = stablehlo.broadcast_in_dim %870, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %872 = stablehlo.broadcast_in_dim %arg202, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %873 = stablehlo.add %871, %872 : tensor<197x1024xf32>
-    %874 = stablehlo.convert %873 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %875 = stablehlo.reshape %874 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %876 = stablehlo.reshape %875 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %877 = stablehlo.transpose %876, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %878 = stablehlo.reshape %863 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %879 = stablehlo.transpose %878, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %880 = stablehlo.transpose %867, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %881 = stablehlo.reshape %879 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %882 = stablehlo.reshape %880 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %883 = stablehlo.broadcast_in_dim %882, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %884 = stablehlo.dot_general %881, %883, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %885 = stablehlo.reshape %884 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %886 = stablehlo.broadcast_in_dim %885, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %887 = stablehlo.divide %886, %92 : tensor<1x16x197x197xbf16>
-    %888 = stablehlo.add %887, %arg203 : tensor<1x16x197x197xbf16>
-    %889 = stablehlo.convert %888 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %890 = stablehlo.reduce(%889 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %891 = stablehlo.reshape %890 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %892 = stablehlo.broadcast_in_dim %889, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %893 = stablehlo.broadcast_in_dim %891, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %894 = stablehlo.subtract %892, %893 : tensor<1x16x197x197xf32>
-    %895 = stablehlo.exponential %894 : tensor<1x16x197x197xf32>
-    %896 = stablehlo.reduce(%895 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %897 = stablehlo.reshape %896 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %898 = stablehlo.broadcast_in_dim %895, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %899 = stablehlo.broadcast_in_dim %897, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %900 = stablehlo.divide %898, %899 : tensor<1x16x197x197xf32>
-    %901 = stablehlo.convert %900 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %902 = stablehlo.reshape %901 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %903 = stablehlo.reshape %877 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %904 = stablehlo.broadcast_in_dim %903, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %905 = stablehlo.dot_general %902, %904, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %906 = stablehlo.reshape %905 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %907 = stablehlo.transpose %906, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %908 = stablehlo.reshape %907 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %909 = stablehlo.reshape %908 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %910 = stablehlo.convert %909 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %911 = stablehlo.dot_general %910, %arg204, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %912 = stablehlo.broadcast_in_dim %911, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %913 = stablehlo.multiply %912, %60 : tensor<197x1024xf32>
-    %914 = stablehlo.broadcast_in_dim %913, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %915 = stablehlo.broadcast_in_dim %arg205, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %916 = stablehlo.add %914, %915 : tensor<197x1024xf32>
-    %917 = stablehlo.convert %916 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %918 = stablehlo.reshape %917 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %919 = stablehlo.broadcast_in_dim %arg29, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %920 = stablehlo.broadcast_in_dim %918, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %921 = stablehlo.multiply %919, %920 : tensor<1x197x1024xbf16>
-    %922 = stablehlo.add %921, %816 : tensor<1x197x1024xbf16>
-    %923 = stablehlo.convert %922 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %924 = stablehlo.convert %923 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %925 = stablehlo.reduce(%924 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %926 = stablehlo.reshape %925 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %927 = stablehlo.broadcast_in_dim %926, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %928 = stablehlo.divide %927, %15 : tensor<1x197x1xf64>
-    %929 = stablehlo.broadcast_in_dim %924, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %930 = stablehlo.broadcast_in_dim %928, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %931 = stablehlo.subtract %929, %930 : tensor<1x197x1024xf64>
-    %932 = stablehlo.multiply %931, %931 : tensor<1x197x1024xf64>
-    %933 = stablehlo.reduce(%932 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %934 = stablehlo.reshape %933 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %935 = stablehlo.broadcast_in_dim %934, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %936 = stablehlo.divide %935, %15 : tensor<1x197x1xf64>
-    %937 = stablehlo.convert %936 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %938 = stablehlo.reduce(%923 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %939 = stablehlo.reshape %938 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %940 = stablehlo.broadcast_in_dim %939, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %941 = stablehlo.divide %940, %31 : tensor<1x197x1xf32>
-    %942 = stablehlo.broadcast_in_dim %937, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %943 = stablehlo.add %942, %36 : tensor<1x197x1xf32>
-    %944 = stablehlo.rsqrt %943 : tensor<1x197x1xf32>
-    %945 = stablehlo.broadcast_in_dim %923, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %946 = stablehlo.broadcast_in_dim %941, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %947 = stablehlo.subtract %945, %946 : tensor<1x197x1024xf32>
-    %948 = stablehlo.broadcast_in_dim %947, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %949 = stablehlo.broadcast_in_dim %944, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %950 = stablehlo.multiply %948, %949 : tensor<1x197x1024xf32>
-    %951 = stablehlo.convert %arg30 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %952 = stablehlo.broadcast_in_dim %950, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %953 = stablehlo.broadcast_in_dim %951, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %954 = stablehlo.multiply %952, %953 : tensor<1x197x1024xf32>
-    %955 = stablehlo.convert %arg31 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %956 = stablehlo.broadcast_in_dim %954, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %957 = stablehlo.broadcast_in_dim %955, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %958 = stablehlo.add %956, %957 : tensor<1x197x1024xf32>
-    %959 = stablehlo.convert %958 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %960 = stablehlo.reshape %959 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %961 = stablehlo.convert %960 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %962 = stablehlo.dot_general %961, %arg206, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %963 = stablehlo.broadcast_in_dim %962, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %964 = stablehlo.multiply %963, %170 : tensor<197x4096xf32>
-    %965 = stablehlo.broadcast_in_dim %964, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %966 = stablehlo.broadcast_in_dim %arg207, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %967 = stablehlo.add %965, %966 : tensor<197x4096xf32>
-    %968 = stablehlo.convert %967 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %969 = stablehlo.reshape %968 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %970 = stablehlo.multiply %969, %cst_4 : tensor<1x197x4096xbf16>
-    %971 = stablehlo.multiply %969, %178 : tensor<1x197x4096xbf16>
-    %972 = stablehlo.convert %971 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %973 = stablehlo.clamp %cst_5, %972, %cst_6 : tensor<1x197x4096xf32>
-    %974 = stablehlo.multiply %973, %973 : tensor<1x197x4096xf32>
-    %975 = stablehlo.multiply %cst_7, %974 : tensor<1x197x4096xf32>
-    %976 = stablehlo.add %975, %cst_8 : tensor<1x197x4096xf32>
-    %977 = stablehlo.multiply %976, %974 : tensor<1x197x4096xf32>
-    %978 = stablehlo.add %977, %cst_9 : tensor<1x197x4096xf32>
-    %979 = stablehlo.multiply %978, %974 : tensor<1x197x4096xf32>
-    %980 = stablehlo.add %979, %cst_10 : tensor<1x197x4096xf32>
-    %981 = stablehlo.multiply %980, %974 : tensor<1x197x4096xf32>
-    %982 = stablehlo.add %981, %cst_11 : tensor<1x197x4096xf32>
-    %983 = stablehlo.multiply %982, %974 : tensor<1x197x4096xf32>
-    %984 = stablehlo.add %983, %cst_12 : tensor<1x197x4096xf32>
-    %985 = stablehlo.multiply %984, %974 : tensor<1x197x4096xf32>
-    %986 = stablehlo.add %985, %cst_13 : tensor<1x197x4096xf32>
-    %987 = stablehlo.multiply %cst_14, %974 : tensor<1x197x4096xf32>
-    %988 = stablehlo.add %987, %cst_15 : tensor<1x197x4096xf32>
-    %989 = stablehlo.multiply %988, %974 : tensor<1x197x4096xf32>
-    %990 = stablehlo.add %989, %cst_16 : tensor<1x197x4096xf32>
-    %991 = stablehlo.multiply %990, %974 : tensor<1x197x4096xf32>
-    %992 = stablehlo.add %991, %cst_17 : tensor<1x197x4096xf32>
-    %993 = stablehlo.multiply %992, %974 : tensor<1x197x4096xf32>
-    %994 = stablehlo.add %993, %cst_18 : tensor<1x197x4096xf32>
-    %995 = stablehlo.multiply %973, %986 : tensor<1x197x4096xf32>
-    %996 = stablehlo.divide %995, %994 : tensor<1x197x4096xf32>
-    %997 = stablehlo.clamp %cst_19, %996, %cst_20 : tensor<1x197x4096xf32>
-    %998 = stablehlo.convert %997 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %999 = stablehlo.add %998, %cst_2 : tensor<1x197x4096xbf16>
-    %1000 = stablehlo.multiply %999, %970 : tensor<1x197x4096xbf16>
-    %1001 = stablehlo.reshape %1000 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %1002 = stablehlo.convert %1001 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %1003 = stablehlo.dot_general %1002, %arg208, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %1004 = stablehlo.broadcast_in_dim %1003, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1005 = stablehlo.multiply %1004, %60 : tensor<197x1024xf32>
-    %1006 = stablehlo.broadcast_in_dim %1005, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1007 = stablehlo.broadcast_in_dim %arg209, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1008 = stablehlo.add %1006, %1007 : tensor<197x1024xf32>
-    %1009 = stablehlo.convert %1008 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1010 = stablehlo.reshape %1009 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1011 = stablehlo.broadcast_in_dim %arg32, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1012 = stablehlo.broadcast_in_dim %1010, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1013 = stablehlo.multiply %1011, %1012 : tensor<1x197x1024xbf16>
-    %1014 = stablehlo.add %1013, %922 : tensor<1x197x1024xbf16>
-    %1015 = stablehlo.convert %1014 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1016 = stablehlo.convert %1015 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1017 = stablehlo.reduce(%1016 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1018 = stablehlo.reshape %1017 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1019 = stablehlo.broadcast_in_dim %1018, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1020 = stablehlo.divide %1019, %15 : tensor<1x197x1xf64>
-    %1021 = stablehlo.broadcast_in_dim %1016, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1022 = stablehlo.broadcast_in_dim %1020, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1023 = stablehlo.subtract %1021, %1022 : tensor<1x197x1024xf64>
-    %1024 = stablehlo.multiply %1023, %1023 : tensor<1x197x1024xf64>
-    %1025 = stablehlo.reduce(%1024 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1026 = stablehlo.reshape %1025 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1027 = stablehlo.broadcast_in_dim %1026, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1028 = stablehlo.divide %1027, %15 : tensor<1x197x1xf64>
-    %1029 = stablehlo.convert %1028 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1030 = stablehlo.reduce(%1015 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1031 = stablehlo.reshape %1030 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1032 = stablehlo.broadcast_in_dim %1031, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1033 = stablehlo.divide %1032, %31 : tensor<1x197x1xf32>
-    %1034 = stablehlo.broadcast_in_dim %1029, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1035 = stablehlo.add %1034, %36 : tensor<1x197x1xf32>
-    %1036 = stablehlo.rsqrt %1035 : tensor<1x197x1xf32>
-    %1037 = stablehlo.broadcast_in_dim %1015, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1038 = stablehlo.broadcast_in_dim %1033, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1039 = stablehlo.subtract %1037, %1038 : tensor<1x197x1024xf32>
-    %1040 = stablehlo.broadcast_in_dim %1039, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1041 = stablehlo.broadcast_in_dim %1036, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1042 = stablehlo.multiply %1040, %1041 : tensor<1x197x1024xf32>
-    %1043 = stablehlo.convert %arg33 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1044 = stablehlo.broadcast_in_dim %1042, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1045 = stablehlo.broadcast_in_dim %1043, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1046 = stablehlo.multiply %1044, %1045 : tensor<1x197x1024xf32>
-    %1047 = stablehlo.convert %arg34 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1048 = stablehlo.broadcast_in_dim %1046, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1049 = stablehlo.broadcast_in_dim %1047, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1050 = stablehlo.add %1048, %1049 : tensor<1x197x1024xf32>
-    %1051 = stablehlo.convert %1050 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1052 = stablehlo.reshape %1051 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1053 = stablehlo.convert %1052 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1054 = stablehlo.dot_general %1053, %arg210, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1055 = stablehlo.broadcast_in_dim %1054, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1056 = stablehlo.multiply %1055, %60 : tensor<197x1024xf32>
-    %1057 = stablehlo.broadcast_in_dim %1056, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1058 = stablehlo.broadcast_in_dim %arg211, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1059 = stablehlo.add %1057, %1058 : tensor<197x1024xf32>
-    %1060 = stablehlo.convert %1059 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1061 = stablehlo.reshape %1060 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1062 = stablehlo.dot_general %1052, %arg212, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %1063 = stablehlo.reshape %1062 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1064 = stablehlo.reshape %1063 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1065 = stablehlo.transpose %1064, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1066 = stablehlo.dot_general %1053, %arg213, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1067 = stablehlo.broadcast_in_dim %1066, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1068 = stablehlo.multiply %1067, %60 : tensor<197x1024xf32>
-    %1069 = stablehlo.broadcast_in_dim %1068, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1070 = stablehlo.broadcast_in_dim %arg214, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1071 = stablehlo.add %1069, %1070 : tensor<197x1024xf32>
-    %1072 = stablehlo.convert %1071 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1073 = stablehlo.reshape %1072 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1074 = stablehlo.reshape %1073 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1075 = stablehlo.transpose %1074, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1076 = stablehlo.reshape %1061 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1077 = stablehlo.transpose %1076, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1078 = stablehlo.transpose %1065, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %1079 = stablehlo.reshape %1077 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1080 = stablehlo.reshape %1078 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1081 = stablehlo.broadcast_in_dim %1080, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1082 = stablehlo.dot_general %1079, %1081, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %1083 = stablehlo.reshape %1082 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1084 = stablehlo.broadcast_in_dim %1083, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1085 = stablehlo.divide %1084, %92 : tensor<1x16x197x197xbf16>
-    %1086 = stablehlo.add %1085, %arg215 : tensor<1x16x197x197xbf16>
-    %1087 = stablehlo.convert %1086 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %1088 = stablehlo.reduce(%1087 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1089 = stablehlo.reshape %1088 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1090 = stablehlo.broadcast_in_dim %1087, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1091 = stablehlo.broadcast_in_dim %1089, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1092 = stablehlo.subtract %1090, %1091 : tensor<1x16x197x197xf32>
-    %1093 = stablehlo.exponential %1092 : tensor<1x16x197x197xf32>
-    %1094 = stablehlo.reduce(%1093 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1095 = stablehlo.reshape %1094 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1096 = stablehlo.broadcast_in_dim %1093, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1097 = stablehlo.broadcast_in_dim %1095, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1098 = stablehlo.divide %1096, %1097 : tensor<1x16x197x197xf32>
-    %1099 = stablehlo.convert %1098 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %1100 = stablehlo.reshape %1099 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %1101 = stablehlo.reshape %1075 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1102 = stablehlo.broadcast_in_dim %1101, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1103 = stablehlo.dot_general %1100, %1102, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1104 = stablehlo.reshape %1103 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1105 = stablehlo.transpose %1104, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %1106 = stablehlo.reshape %1105 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %1107 = stablehlo.reshape %1106 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1108 = stablehlo.convert %1107 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1109 = stablehlo.dot_general %1108, %arg216, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1110 = stablehlo.broadcast_in_dim %1109, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1111 = stablehlo.multiply %1110, %60 : tensor<197x1024xf32>
-    %1112 = stablehlo.broadcast_in_dim %1111, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1113 = stablehlo.broadcast_in_dim %arg217, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1114 = stablehlo.add %1112, %1113 : tensor<197x1024xf32>
-    %1115 = stablehlo.convert %1114 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1116 = stablehlo.reshape %1115 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1117 = stablehlo.broadcast_in_dim %arg35, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1118 = stablehlo.broadcast_in_dim %1116, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1119 = stablehlo.multiply %1117, %1118 : tensor<1x197x1024xbf16>
-    %1120 = stablehlo.add %1119, %1014 : tensor<1x197x1024xbf16>
-    %1121 = stablehlo.convert %1120 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1122 = stablehlo.convert %1121 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1123 = stablehlo.reduce(%1122 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1124 = stablehlo.reshape %1123 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1125 = stablehlo.broadcast_in_dim %1124, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1126 = stablehlo.divide %1125, %15 : tensor<1x197x1xf64>
-    %1127 = stablehlo.broadcast_in_dim %1122, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1128 = stablehlo.broadcast_in_dim %1126, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1129 = stablehlo.subtract %1127, %1128 : tensor<1x197x1024xf64>
-    %1130 = stablehlo.multiply %1129, %1129 : tensor<1x197x1024xf64>
-    %1131 = stablehlo.reduce(%1130 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1132 = stablehlo.reshape %1131 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1133 = stablehlo.broadcast_in_dim %1132, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1134 = stablehlo.divide %1133, %15 : tensor<1x197x1xf64>
-    %1135 = stablehlo.convert %1134 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1136 = stablehlo.reduce(%1121 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1137 = stablehlo.reshape %1136 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1138 = stablehlo.broadcast_in_dim %1137, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1139 = stablehlo.divide %1138, %31 : tensor<1x197x1xf32>
-    %1140 = stablehlo.broadcast_in_dim %1135, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1141 = stablehlo.add %1140, %36 : tensor<1x197x1xf32>
-    %1142 = stablehlo.rsqrt %1141 : tensor<1x197x1xf32>
-    %1143 = stablehlo.broadcast_in_dim %1121, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1144 = stablehlo.broadcast_in_dim %1139, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1145 = stablehlo.subtract %1143, %1144 : tensor<1x197x1024xf32>
-    %1146 = stablehlo.broadcast_in_dim %1145, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1147 = stablehlo.broadcast_in_dim %1142, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1148 = stablehlo.multiply %1146, %1147 : tensor<1x197x1024xf32>
-    %1149 = stablehlo.convert %arg36 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1150 = stablehlo.broadcast_in_dim %1148, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1151 = stablehlo.broadcast_in_dim %1149, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1152 = stablehlo.multiply %1150, %1151 : tensor<1x197x1024xf32>
-    %1153 = stablehlo.convert %arg37 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1154 = stablehlo.broadcast_in_dim %1152, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1155 = stablehlo.broadcast_in_dim %1153, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1156 = stablehlo.add %1154, %1155 : tensor<1x197x1024xf32>
-    %1157 = stablehlo.convert %1156 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1158 = stablehlo.reshape %1157 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1159 = stablehlo.convert %1158 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1160 = stablehlo.dot_general %1159, %arg218, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %1161 = stablehlo.broadcast_in_dim %1160, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1162 = stablehlo.multiply %1161, %170 : tensor<197x4096xf32>
-    %1163 = stablehlo.broadcast_in_dim %1162, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1164 = stablehlo.broadcast_in_dim %arg219, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %1165 = stablehlo.add %1163, %1164 : tensor<197x4096xf32>
-    %1166 = stablehlo.convert %1165 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %1167 = stablehlo.reshape %1166 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %1168 = stablehlo.multiply %1167, %cst_4 : tensor<1x197x4096xbf16>
-    %1169 = stablehlo.multiply %1167, %178 : tensor<1x197x4096xbf16>
-    %1170 = stablehlo.convert %1169 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %1171 = stablehlo.clamp %cst_5, %1170, %cst_6 : tensor<1x197x4096xf32>
-    %1172 = stablehlo.multiply %1171, %1171 : tensor<1x197x4096xf32>
-    %1173 = stablehlo.multiply %cst_7, %1172 : tensor<1x197x4096xf32>
-    %1174 = stablehlo.add %1173, %cst_8 : tensor<1x197x4096xf32>
-    %1175 = stablehlo.multiply %1174, %1172 : tensor<1x197x4096xf32>
-    %1176 = stablehlo.add %1175, %cst_9 : tensor<1x197x4096xf32>
-    %1177 = stablehlo.multiply %1176, %1172 : tensor<1x197x4096xf32>
-    %1178 = stablehlo.add %1177, %cst_10 : tensor<1x197x4096xf32>
-    %1179 = stablehlo.multiply %1178, %1172 : tensor<1x197x4096xf32>
-    %1180 = stablehlo.add %1179, %cst_11 : tensor<1x197x4096xf32>
-    %1181 = stablehlo.multiply %1180, %1172 : tensor<1x197x4096xf32>
-    %1182 = stablehlo.add %1181, %cst_12 : tensor<1x197x4096xf32>
-    %1183 = stablehlo.multiply %1182, %1172 : tensor<1x197x4096xf32>
-    %1184 = stablehlo.add %1183, %cst_13 : tensor<1x197x4096xf32>
-    %1185 = stablehlo.multiply %cst_14, %1172 : tensor<1x197x4096xf32>
-    %1186 = stablehlo.add %1185, %cst_15 : tensor<1x197x4096xf32>
-    %1187 = stablehlo.multiply %1186, %1172 : tensor<1x197x4096xf32>
-    %1188 = stablehlo.add %1187, %cst_16 : tensor<1x197x4096xf32>
-    %1189 = stablehlo.multiply %1188, %1172 : tensor<1x197x4096xf32>
-    %1190 = stablehlo.add %1189, %cst_17 : tensor<1x197x4096xf32>
-    %1191 = stablehlo.multiply %1190, %1172 : tensor<1x197x4096xf32>
-    %1192 = stablehlo.add %1191, %cst_18 : tensor<1x197x4096xf32>
-    %1193 = stablehlo.multiply %1171, %1184 : tensor<1x197x4096xf32>
-    %1194 = stablehlo.divide %1193, %1192 : tensor<1x197x4096xf32>
-    %1195 = stablehlo.clamp %cst_19, %1194, %cst_20 : tensor<1x197x4096xf32>
-    %1196 = stablehlo.convert %1195 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %1197 = stablehlo.add %1196, %cst_2 : tensor<1x197x4096xbf16>
-    %1198 = stablehlo.multiply %1197, %1168 : tensor<1x197x4096xbf16>
-    %1199 = stablehlo.reshape %1198 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %1200 = stablehlo.convert %1199 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %1201 = stablehlo.dot_general %1200, %arg220, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %1202 = stablehlo.broadcast_in_dim %1201, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1203 = stablehlo.multiply %1202, %60 : tensor<197x1024xf32>
-    %1204 = stablehlo.broadcast_in_dim %1203, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1205 = stablehlo.broadcast_in_dim %arg221, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1206 = stablehlo.add %1204, %1205 : tensor<197x1024xf32>
-    %1207 = stablehlo.convert %1206 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1208 = stablehlo.reshape %1207 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1209 = stablehlo.broadcast_in_dim %arg38, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1210 = stablehlo.broadcast_in_dim %1208, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1211 = stablehlo.multiply %1209, %1210 : tensor<1x197x1024xbf16>
-    %1212 = stablehlo.add %1211, %1120 : tensor<1x197x1024xbf16>
-    %1213 = stablehlo.convert %1212 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1214 = stablehlo.convert %1213 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1215 = stablehlo.reduce(%1214 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1216 = stablehlo.reshape %1215 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1217 = stablehlo.broadcast_in_dim %1216, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1218 = stablehlo.divide %1217, %15 : tensor<1x197x1xf64>
-    %1219 = stablehlo.broadcast_in_dim %1214, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1220 = stablehlo.broadcast_in_dim %1218, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1221 = stablehlo.subtract %1219, %1220 : tensor<1x197x1024xf64>
-    %1222 = stablehlo.multiply %1221, %1221 : tensor<1x197x1024xf64>
-    %1223 = stablehlo.reduce(%1222 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1224 = stablehlo.reshape %1223 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1225 = stablehlo.broadcast_in_dim %1224, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1226 = stablehlo.divide %1225, %15 : tensor<1x197x1xf64>
-    %1227 = stablehlo.convert %1226 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1228 = stablehlo.reduce(%1213 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1229 = stablehlo.reshape %1228 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1230 = stablehlo.broadcast_in_dim %1229, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1231 = stablehlo.divide %1230, %31 : tensor<1x197x1xf32>
-    %1232 = stablehlo.broadcast_in_dim %1227, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1233 = stablehlo.add %1232, %36 : tensor<1x197x1xf32>
-    %1234 = stablehlo.rsqrt %1233 : tensor<1x197x1xf32>
-    %1235 = stablehlo.broadcast_in_dim %1213, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1236 = stablehlo.broadcast_in_dim %1231, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1237 = stablehlo.subtract %1235, %1236 : tensor<1x197x1024xf32>
-    %1238 = stablehlo.broadcast_in_dim %1237, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1239 = stablehlo.broadcast_in_dim %1234, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1240 = stablehlo.multiply %1238, %1239 : tensor<1x197x1024xf32>
-    %1241 = stablehlo.convert %arg39 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1242 = stablehlo.broadcast_in_dim %1240, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1243 = stablehlo.broadcast_in_dim %1241, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1244 = stablehlo.multiply %1242, %1243 : tensor<1x197x1024xf32>
-    %1245 = stablehlo.convert %arg40 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1246 = stablehlo.broadcast_in_dim %1244, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1247 = stablehlo.broadcast_in_dim %1245, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1248 = stablehlo.add %1246, %1247 : tensor<1x197x1024xf32>
-    %1249 = stablehlo.convert %1248 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1250 = stablehlo.reshape %1249 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1251 = stablehlo.convert %1250 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1252 = stablehlo.dot_general %1251, %arg222, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1253 = stablehlo.broadcast_in_dim %1252, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1254 = stablehlo.multiply %1253, %60 : tensor<197x1024xf32>
-    %1255 = stablehlo.broadcast_in_dim %1254, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1256 = stablehlo.broadcast_in_dim %arg223, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1257 = stablehlo.add %1255, %1256 : tensor<197x1024xf32>
-    %1258 = stablehlo.convert %1257 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1259 = stablehlo.reshape %1258 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1260 = stablehlo.dot_general %1250, %arg224, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %1261 = stablehlo.reshape %1260 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1262 = stablehlo.reshape %1261 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1263 = stablehlo.transpose %1262, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1264 = stablehlo.dot_general %1251, %arg225, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1265 = stablehlo.broadcast_in_dim %1264, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1266 = stablehlo.multiply %1265, %60 : tensor<197x1024xf32>
-    %1267 = stablehlo.broadcast_in_dim %1266, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1268 = stablehlo.broadcast_in_dim %arg226, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1269 = stablehlo.add %1267, %1268 : tensor<197x1024xf32>
-    %1270 = stablehlo.convert %1269 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1271 = stablehlo.reshape %1270 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1272 = stablehlo.reshape %1271 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1273 = stablehlo.transpose %1272, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1274 = stablehlo.reshape %1259 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1275 = stablehlo.transpose %1274, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1276 = stablehlo.transpose %1263, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %1277 = stablehlo.reshape %1275 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1278 = stablehlo.reshape %1276 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1279 = stablehlo.broadcast_in_dim %1278, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1280 = stablehlo.dot_general %1277, %1279, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %1281 = stablehlo.reshape %1280 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1282 = stablehlo.broadcast_in_dim %1281, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1283 = stablehlo.divide %1282, %92 : tensor<1x16x197x197xbf16>
-    %1284 = stablehlo.add %1283, %arg227 : tensor<1x16x197x197xbf16>
-    %1285 = stablehlo.convert %1284 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %1286 = stablehlo.reduce(%1285 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1287 = stablehlo.reshape %1286 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1288 = stablehlo.broadcast_in_dim %1285, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1289 = stablehlo.broadcast_in_dim %1287, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1290 = stablehlo.subtract %1288, %1289 : tensor<1x16x197x197xf32>
-    %1291 = stablehlo.exponential %1290 : tensor<1x16x197x197xf32>
-    %1292 = stablehlo.reduce(%1291 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1293 = stablehlo.reshape %1292 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1294 = stablehlo.broadcast_in_dim %1291, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1295 = stablehlo.broadcast_in_dim %1293, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1296 = stablehlo.divide %1294, %1295 : tensor<1x16x197x197xf32>
-    %1297 = stablehlo.convert %1296 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %1298 = stablehlo.reshape %1297 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %1299 = stablehlo.reshape %1273 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1300 = stablehlo.broadcast_in_dim %1299, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1301 = stablehlo.dot_general %1298, %1300, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1302 = stablehlo.reshape %1301 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1303 = stablehlo.transpose %1302, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %1304 = stablehlo.reshape %1303 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %1305 = stablehlo.reshape %1304 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1306 = stablehlo.convert %1305 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1307 = stablehlo.dot_general %1306, %arg228, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1308 = stablehlo.broadcast_in_dim %1307, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1309 = stablehlo.multiply %1308, %60 : tensor<197x1024xf32>
-    %1310 = stablehlo.broadcast_in_dim %1309, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1311 = stablehlo.broadcast_in_dim %arg229, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1312 = stablehlo.add %1310, %1311 : tensor<197x1024xf32>
-    %1313 = stablehlo.convert %1312 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1314 = stablehlo.reshape %1313 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1315 = stablehlo.broadcast_in_dim %arg41, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1316 = stablehlo.broadcast_in_dim %1314, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1317 = stablehlo.multiply %1315, %1316 : tensor<1x197x1024xbf16>
-    %1318 = stablehlo.add %1317, %1212 : tensor<1x197x1024xbf16>
-    %1319 = stablehlo.convert %1318 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1320 = stablehlo.convert %1319 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1321 = stablehlo.reduce(%1320 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1322 = stablehlo.reshape %1321 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1323 = stablehlo.broadcast_in_dim %1322, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1324 = stablehlo.divide %1323, %15 : tensor<1x197x1xf64>
-    %1325 = stablehlo.broadcast_in_dim %1320, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1326 = stablehlo.broadcast_in_dim %1324, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1327 = stablehlo.subtract %1325, %1326 : tensor<1x197x1024xf64>
-    %1328 = stablehlo.multiply %1327, %1327 : tensor<1x197x1024xf64>
-    %1329 = stablehlo.reduce(%1328 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1330 = stablehlo.reshape %1329 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1331 = stablehlo.broadcast_in_dim %1330, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1332 = stablehlo.divide %1331, %15 : tensor<1x197x1xf64>
-    %1333 = stablehlo.convert %1332 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1334 = stablehlo.reduce(%1319 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1335 = stablehlo.reshape %1334 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1336 = stablehlo.broadcast_in_dim %1335, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1337 = stablehlo.divide %1336, %31 : tensor<1x197x1xf32>
-    %1338 = stablehlo.broadcast_in_dim %1333, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1339 = stablehlo.add %1338, %36 : tensor<1x197x1xf32>
-    %1340 = stablehlo.rsqrt %1339 : tensor<1x197x1xf32>
-    %1341 = stablehlo.broadcast_in_dim %1319, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1342 = stablehlo.broadcast_in_dim %1337, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1343 = stablehlo.subtract %1341, %1342 : tensor<1x197x1024xf32>
-    %1344 = stablehlo.broadcast_in_dim %1343, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1345 = stablehlo.broadcast_in_dim %1340, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1346 = stablehlo.multiply %1344, %1345 : tensor<1x197x1024xf32>
-    %1347 = stablehlo.convert %arg42 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1348 = stablehlo.broadcast_in_dim %1346, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1349 = stablehlo.broadcast_in_dim %1347, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1350 = stablehlo.multiply %1348, %1349 : tensor<1x197x1024xf32>
-    %1351 = stablehlo.convert %arg43 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1352 = stablehlo.broadcast_in_dim %1350, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1353 = stablehlo.broadcast_in_dim %1351, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1354 = stablehlo.add %1352, %1353 : tensor<1x197x1024xf32>
-    %1355 = stablehlo.convert %1354 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1356 = stablehlo.reshape %1355 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1357 = stablehlo.convert %1356 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1358 = stablehlo.dot_general %1357, %arg230, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %1359 = stablehlo.broadcast_in_dim %1358, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1360 = stablehlo.multiply %1359, %170 : tensor<197x4096xf32>
-    %1361 = stablehlo.broadcast_in_dim %1360, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1362 = stablehlo.broadcast_in_dim %arg231, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %1363 = stablehlo.add %1361, %1362 : tensor<197x4096xf32>
-    %1364 = stablehlo.convert %1363 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %1365 = stablehlo.reshape %1364 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %1366 = stablehlo.multiply %1365, %cst_4 : tensor<1x197x4096xbf16>
-    %1367 = stablehlo.multiply %1365, %178 : tensor<1x197x4096xbf16>
-    %1368 = stablehlo.convert %1367 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %1369 = stablehlo.clamp %cst_5, %1368, %cst_6 : tensor<1x197x4096xf32>
-    %1370 = stablehlo.multiply %1369, %1369 : tensor<1x197x4096xf32>
-    %1371 = stablehlo.multiply %cst_7, %1370 : tensor<1x197x4096xf32>
-    %1372 = stablehlo.add %1371, %cst_8 : tensor<1x197x4096xf32>
-    %1373 = stablehlo.multiply %1372, %1370 : tensor<1x197x4096xf32>
-    %1374 = stablehlo.add %1373, %cst_9 : tensor<1x197x4096xf32>
-    %1375 = stablehlo.multiply %1374, %1370 : tensor<1x197x4096xf32>
-    %1376 = stablehlo.add %1375, %cst_10 : tensor<1x197x4096xf32>
-    %1377 = stablehlo.multiply %1376, %1370 : tensor<1x197x4096xf32>
-    %1378 = stablehlo.add %1377, %cst_11 : tensor<1x197x4096xf32>
-    %1379 = stablehlo.multiply %1378, %1370 : tensor<1x197x4096xf32>
-    %1380 = stablehlo.add %1379, %cst_12 : tensor<1x197x4096xf32>
-    %1381 = stablehlo.multiply %1380, %1370 : tensor<1x197x4096xf32>
-    %1382 = stablehlo.add %1381, %cst_13 : tensor<1x197x4096xf32>
-    %1383 = stablehlo.multiply %cst_14, %1370 : tensor<1x197x4096xf32>
-    %1384 = stablehlo.add %1383, %cst_15 : tensor<1x197x4096xf32>
-    %1385 = stablehlo.multiply %1384, %1370 : tensor<1x197x4096xf32>
-    %1386 = stablehlo.add %1385, %cst_16 : tensor<1x197x4096xf32>
-    %1387 = stablehlo.multiply %1386, %1370 : tensor<1x197x4096xf32>
-    %1388 = stablehlo.add %1387, %cst_17 : tensor<1x197x4096xf32>
-    %1389 = stablehlo.multiply %1388, %1370 : tensor<1x197x4096xf32>
-    %1390 = stablehlo.add %1389, %cst_18 : tensor<1x197x4096xf32>
-    %1391 = stablehlo.multiply %1369, %1382 : tensor<1x197x4096xf32>
-    %1392 = stablehlo.divide %1391, %1390 : tensor<1x197x4096xf32>
-    %1393 = stablehlo.clamp %cst_19, %1392, %cst_20 : tensor<1x197x4096xf32>
-    %1394 = stablehlo.convert %1393 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %1395 = stablehlo.add %1394, %cst_2 : tensor<1x197x4096xbf16>
-    %1396 = stablehlo.multiply %1395, %1366 : tensor<1x197x4096xbf16>
-    %1397 = stablehlo.reshape %1396 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %1398 = stablehlo.convert %1397 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %1399 = stablehlo.dot_general %1398, %arg232, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %1400 = stablehlo.broadcast_in_dim %1399, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1401 = stablehlo.multiply %1400, %60 : tensor<197x1024xf32>
-    %1402 = stablehlo.broadcast_in_dim %1401, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1403 = stablehlo.broadcast_in_dim %arg233, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1404 = stablehlo.add %1402, %1403 : tensor<197x1024xf32>
-    %1405 = stablehlo.convert %1404 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1406 = stablehlo.reshape %1405 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1407 = stablehlo.broadcast_in_dim %arg44, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1408 = stablehlo.broadcast_in_dim %1406, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1409 = stablehlo.multiply %1407, %1408 : tensor<1x197x1024xbf16>
-    %1410 = stablehlo.add %1409, %1318 : tensor<1x197x1024xbf16>
-    %1411 = stablehlo.convert %1410 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1412 = stablehlo.convert %1411 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1413 = stablehlo.reduce(%1412 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1414 = stablehlo.reshape %1413 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1415 = stablehlo.broadcast_in_dim %1414, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1416 = stablehlo.divide %1415, %15 : tensor<1x197x1xf64>
-    %1417 = stablehlo.broadcast_in_dim %1412, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1418 = stablehlo.broadcast_in_dim %1416, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1419 = stablehlo.subtract %1417, %1418 : tensor<1x197x1024xf64>
-    %1420 = stablehlo.multiply %1419, %1419 : tensor<1x197x1024xf64>
-    %1421 = stablehlo.reduce(%1420 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1422 = stablehlo.reshape %1421 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1423 = stablehlo.broadcast_in_dim %1422, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1424 = stablehlo.divide %1423, %15 : tensor<1x197x1xf64>
-    %1425 = stablehlo.convert %1424 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1426 = stablehlo.reduce(%1411 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1427 = stablehlo.reshape %1426 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1428 = stablehlo.broadcast_in_dim %1427, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1429 = stablehlo.divide %1428, %31 : tensor<1x197x1xf32>
-    %1430 = stablehlo.broadcast_in_dim %1425, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1431 = stablehlo.add %1430, %36 : tensor<1x197x1xf32>
-    %1432 = stablehlo.rsqrt %1431 : tensor<1x197x1xf32>
-    %1433 = stablehlo.broadcast_in_dim %1411, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1434 = stablehlo.broadcast_in_dim %1429, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1435 = stablehlo.subtract %1433, %1434 : tensor<1x197x1024xf32>
-    %1436 = stablehlo.broadcast_in_dim %1435, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1437 = stablehlo.broadcast_in_dim %1432, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1438 = stablehlo.multiply %1436, %1437 : tensor<1x197x1024xf32>
-    %1439 = stablehlo.convert %arg45 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1440 = stablehlo.broadcast_in_dim %1438, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1441 = stablehlo.broadcast_in_dim %1439, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1442 = stablehlo.multiply %1440, %1441 : tensor<1x197x1024xf32>
-    %1443 = stablehlo.convert %arg46 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1444 = stablehlo.broadcast_in_dim %1442, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1445 = stablehlo.broadcast_in_dim %1443, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1446 = stablehlo.add %1444, %1445 : tensor<1x197x1024xf32>
-    %1447 = stablehlo.convert %1446 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1448 = stablehlo.reshape %1447 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1449 = stablehlo.convert %1448 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1450 = stablehlo.dot_general %1449, %arg234, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1451 = stablehlo.broadcast_in_dim %1450, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1452 = stablehlo.multiply %1451, %60 : tensor<197x1024xf32>
-    %1453 = stablehlo.broadcast_in_dim %1452, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1454 = stablehlo.broadcast_in_dim %arg235, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1455 = stablehlo.add %1453, %1454 : tensor<197x1024xf32>
-    %1456 = stablehlo.convert %1455 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1457 = stablehlo.reshape %1456 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1458 = stablehlo.dot_general %1448, %arg236, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %1459 = stablehlo.reshape %1458 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1460 = stablehlo.reshape %1459 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1461 = stablehlo.transpose %1460, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1462 = stablehlo.dot_general %1449, %arg237, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1463 = stablehlo.broadcast_in_dim %1462, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1464 = stablehlo.multiply %1463, %60 : tensor<197x1024xf32>
-    %1465 = stablehlo.broadcast_in_dim %1464, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1466 = stablehlo.broadcast_in_dim %arg238, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1467 = stablehlo.add %1465, %1466 : tensor<197x1024xf32>
-    %1468 = stablehlo.convert %1467 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1469 = stablehlo.reshape %1468 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1470 = stablehlo.reshape %1469 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1471 = stablehlo.transpose %1470, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1472 = stablehlo.reshape %1457 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1473 = stablehlo.transpose %1472, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1474 = stablehlo.transpose %1461, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %1475 = stablehlo.reshape %1473 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1476 = stablehlo.reshape %1474 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1477 = stablehlo.broadcast_in_dim %1476, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1478 = stablehlo.dot_general %1475, %1477, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %1479 = stablehlo.reshape %1478 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1480 = stablehlo.broadcast_in_dim %1479, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1481 = stablehlo.divide %1480, %92 : tensor<1x16x197x197xbf16>
-    %1482 = stablehlo.add %1481, %arg239 : tensor<1x16x197x197xbf16>
-    %1483 = stablehlo.convert %1482 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %1484 = stablehlo.reduce(%1483 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1485 = stablehlo.reshape %1484 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1486 = stablehlo.broadcast_in_dim %1483, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1487 = stablehlo.broadcast_in_dim %1485, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1488 = stablehlo.subtract %1486, %1487 : tensor<1x16x197x197xf32>
-    %1489 = stablehlo.exponential %1488 : tensor<1x16x197x197xf32>
-    %1490 = stablehlo.reduce(%1489 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1491 = stablehlo.reshape %1490 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1492 = stablehlo.broadcast_in_dim %1489, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1493 = stablehlo.broadcast_in_dim %1491, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1494 = stablehlo.divide %1492, %1493 : tensor<1x16x197x197xf32>
-    %1495 = stablehlo.convert %1494 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %1496 = stablehlo.reshape %1495 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %1497 = stablehlo.reshape %1471 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1498 = stablehlo.broadcast_in_dim %1497, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1499 = stablehlo.dot_general %1496, %1498, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1500 = stablehlo.reshape %1499 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1501 = stablehlo.transpose %1500, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %1502 = stablehlo.reshape %1501 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %1503 = stablehlo.reshape %1502 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1504 = stablehlo.convert %1503 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1505 = stablehlo.dot_general %1504, %arg240, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1506 = stablehlo.broadcast_in_dim %1505, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1507 = stablehlo.multiply %1506, %60 : tensor<197x1024xf32>
-    %1508 = stablehlo.broadcast_in_dim %1507, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1509 = stablehlo.broadcast_in_dim %arg241, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1510 = stablehlo.add %1508, %1509 : tensor<197x1024xf32>
-    %1511 = stablehlo.convert %1510 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1512 = stablehlo.reshape %1511 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1513 = stablehlo.broadcast_in_dim %arg47, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1514 = stablehlo.broadcast_in_dim %1512, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1515 = stablehlo.multiply %1513, %1514 : tensor<1x197x1024xbf16>
-    %1516 = stablehlo.add %1515, %1410 : tensor<1x197x1024xbf16>
-    %1517 = stablehlo.convert %1516 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1518 = stablehlo.convert %1517 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1519 = stablehlo.reduce(%1518 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1520 = stablehlo.reshape %1519 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1521 = stablehlo.broadcast_in_dim %1520, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1522 = stablehlo.divide %1521, %15 : tensor<1x197x1xf64>
-    %1523 = stablehlo.broadcast_in_dim %1518, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1524 = stablehlo.broadcast_in_dim %1522, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1525 = stablehlo.subtract %1523, %1524 : tensor<1x197x1024xf64>
-    %1526 = stablehlo.multiply %1525, %1525 : tensor<1x197x1024xf64>
-    %1527 = stablehlo.reduce(%1526 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1528 = stablehlo.reshape %1527 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1529 = stablehlo.broadcast_in_dim %1528, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1530 = stablehlo.divide %1529, %15 : tensor<1x197x1xf64>
-    %1531 = stablehlo.convert %1530 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1532 = stablehlo.reduce(%1517 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1533 = stablehlo.reshape %1532 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1534 = stablehlo.broadcast_in_dim %1533, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1535 = stablehlo.divide %1534, %31 : tensor<1x197x1xf32>
-    %1536 = stablehlo.broadcast_in_dim %1531, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1537 = stablehlo.add %1536, %36 : tensor<1x197x1xf32>
-    %1538 = stablehlo.rsqrt %1537 : tensor<1x197x1xf32>
-    %1539 = stablehlo.broadcast_in_dim %1517, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1540 = stablehlo.broadcast_in_dim %1535, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1541 = stablehlo.subtract %1539, %1540 : tensor<1x197x1024xf32>
-    %1542 = stablehlo.broadcast_in_dim %1541, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1543 = stablehlo.broadcast_in_dim %1538, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1544 = stablehlo.multiply %1542, %1543 : tensor<1x197x1024xf32>
-    %1545 = stablehlo.convert %arg48 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1546 = stablehlo.broadcast_in_dim %1544, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1547 = stablehlo.broadcast_in_dim %1545, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1548 = stablehlo.multiply %1546, %1547 : tensor<1x197x1024xf32>
-    %1549 = stablehlo.convert %arg49 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1550 = stablehlo.broadcast_in_dim %1548, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1551 = stablehlo.broadcast_in_dim %1549, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1552 = stablehlo.add %1550, %1551 : tensor<1x197x1024xf32>
-    %1553 = stablehlo.convert %1552 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1554 = stablehlo.reshape %1553 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1555 = stablehlo.convert %1554 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1556 = stablehlo.dot_general %1555, %arg242, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %1557 = stablehlo.broadcast_in_dim %1556, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1558 = stablehlo.multiply %1557, %170 : tensor<197x4096xf32>
-    %1559 = stablehlo.broadcast_in_dim %1558, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1560 = stablehlo.broadcast_in_dim %arg243, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %1561 = stablehlo.add %1559, %1560 : tensor<197x4096xf32>
-    %1562 = stablehlo.convert %1561 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %1563 = stablehlo.reshape %1562 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %1564 = stablehlo.multiply %1563, %cst_4 : tensor<1x197x4096xbf16>
-    %1565 = stablehlo.multiply %1563, %178 : tensor<1x197x4096xbf16>
-    %1566 = stablehlo.convert %1565 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %1567 = stablehlo.clamp %cst_5, %1566, %cst_6 : tensor<1x197x4096xf32>
-    %1568 = stablehlo.multiply %1567, %1567 : tensor<1x197x4096xf32>
-    %1569 = stablehlo.multiply %cst_7, %1568 : tensor<1x197x4096xf32>
-    %1570 = stablehlo.add %1569, %cst_8 : tensor<1x197x4096xf32>
-    %1571 = stablehlo.multiply %1570, %1568 : tensor<1x197x4096xf32>
-    %1572 = stablehlo.add %1571, %cst_9 : tensor<1x197x4096xf32>
-    %1573 = stablehlo.multiply %1572, %1568 : tensor<1x197x4096xf32>
-    %1574 = stablehlo.add %1573, %cst_10 : tensor<1x197x4096xf32>
-    %1575 = stablehlo.multiply %1574, %1568 : tensor<1x197x4096xf32>
-    %1576 = stablehlo.add %1575, %cst_11 : tensor<1x197x4096xf32>
-    %1577 = stablehlo.multiply %1576, %1568 : tensor<1x197x4096xf32>
-    %1578 = stablehlo.add %1577, %cst_12 : tensor<1x197x4096xf32>
-    %1579 = stablehlo.multiply %1578, %1568 : tensor<1x197x4096xf32>
-    %1580 = stablehlo.add %1579, %cst_13 : tensor<1x197x4096xf32>
-    %1581 = stablehlo.multiply %cst_14, %1568 : tensor<1x197x4096xf32>
-    %1582 = stablehlo.add %1581, %cst_15 : tensor<1x197x4096xf32>
-    %1583 = stablehlo.multiply %1582, %1568 : tensor<1x197x4096xf32>
-    %1584 = stablehlo.add %1583, %cst_16 : tensor<1x197x4096xf32>
-    %1585 = stablehlo.multiply %1584, %1568 : tensor<1x197x4096xf32>
-    %1586 = stablehlo.add %1585, %cst_17 : tensor<1x197x4096xf32>
-    %1587 = stablehlo.multiply %1586, %1568 : tensor<1x197x4096xf32>
-    %1588 = stablehlo.add %1587, %cst_18 : tensor<1x197x4096xf32>
-    %1589 = stablehlo.multiply %1567, %1580 : tensor<1x197x4096xf32>
-    %1590 = stablehlo.divide %1589, %1588 : tensor<1x197x4096xf32>
-    %1591 = stablehlo.clamp %cst_19, %1590, %cst_20 : tensor<1x197x4096xf32>
-    %1592 = stablehlo.convert %1591 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %1593 = stablehlo.add %1592, %cst_2 : tensor<1x197x4096xbf16>
-    %1594 = stablehlo.multiply %1593, %1564 : tensor<1x197x4096xbf16>
-    %1595 = stablehlo.reshape %1594 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %1596 = stablehlo.convert %1595 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %1597 = stablehlo.dot_general %1596, %arg244, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %1598 = stablehlo.broadcast_in_dim %1597, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1599 = stablehlo.multiply %1598, %60 : tensor<197x1024xf32>
-    %1600 = stablehlo.broadcast_in_dim %1599, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1601 = stablehlo.broadcast_in_dim %arg245, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1602 = stablehlo.add %1600, %1601 : tensor<197x1024xf32>
-    %1603 = stablehlo.convert %1602 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1604 = stablehlo.reshape %1603 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1605 = stablehlo.broadcast_in_dim %arg50, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1606 = stablehlo.broadcast_in_dim %1604, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1607 = stablehlo.multiply %1605, %1606 : tensor<1x197x1024xbf16>
-    %1608 = stablehlo.add %1607, %1516 : tensor<1x197x1024xbf16>
-    %1609 = stablehlo.convert %1608 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1610 = stablehlo.convert %1609 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1611 = stablehlo.reduce(%1610 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1612 = stablehlo.reshape %1611 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1613 = stablehlo.broadcast_in_dim %1612, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1614 = stablehlo.divide %1613, %15 : tensor<1x197x1xf64>
-    %1615 = stablehlo.broadcast_in_dim %1610, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1616 = stablehlo.broadcast_in_dim %1614, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1617 = stablehlo.subtract %1615, %1616 : tensor<1x197x1024xf64>
-    %1618 = stablehlo.multiply %1617, %1617 : tensor<1x197x1024xf64>
-    %1619 = stablehlo.reduce(%1618 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1620 = stablehlo.reshape %1619 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1621 = stablehlo.broadcast_in_dim %1620, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1622 = stablehlo.divide %1621, %15 : tensor<1x197x1xf64>
-    %1623 = stablehlo.convert %1622 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1624 = stablehlo.reduce(%1609 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1625 = stablehlo.reshape %1624 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1626 = stablehlo.broadcast_in_dim %1625, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1627 = stablehlo.divide %1626, %31 : tensor<1x197x1xf32>
-    %1628 = stablehlo.broadcast_in_dim %1623, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1629 = stablehlo.add %1628, %36 : tensor<1x197x1xf32>
-    %1630 = stablehlo.rsqrt %1629 : tensor<1x197x1xf32>
-    %1631 = stablehlo.broadcast_in_dim %1609, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1632 = stablehlo.broadcast_in_dim %1627, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1633 = stablehlo.subtract %1631, %1632 : tensor<1x197x1024xf32>
-    %1634 = stablehlo.broadcast_in_dim %1633, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1635 = stablehlo.broadcast_in_dim %1630, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1636 = stablehlo.multiply %1634, %1635 : tensor<1x197x1024xf32>
-    %1637 = stablehlo.convert %arg51 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1638 = stablehlo.broadcast_in_dim %1636, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1639 = stablehlo.broadcast_in_dim %1637, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1640 = stablehlo.multiply %1638, %1639 : tensor<1x197x1024xf32>
-    %1641 = stablehlo.convert %arg52 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1642 = stablehlo.broadcast_in_dim %1640, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1643 = stablehlo.broadcast_in_dim %1641, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1644 = stablehlo.add %1642, %1643 : tensor<1x197x1024xf32>
-    %1645 = stablehlo.convert %1644 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1646 = stablehlo.reshape %1645 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1647 = stablehlo.convert %1646 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1648 = stablehlo.dot_general %1647, %arg246, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1649 = stablehlo.broadcast_in_dim %1648, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1650 = stablehlo.multiply %1649, %60 : tensor<197x1024xf32>
-    %1651 = stablehlo.broadcast_in_dim %1650, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1652 = stablehlo.broadcast_in_dim %arg247, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1653 = stablehlo.add %1651, %1652 : tensor<197x1024xf32>
-    %1654 = stablehlo.convert %1653 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1655 = stablehlo.reshape %1654 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1656 = stablehlo.dot_general %1646, %arg248, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %1657 = stablehlo.reshape %1656 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1658 = stablehlo.reshape %1657 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1659 = stablehlo.transpose %1658, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1660 = stablehlo.dot_general %1647, %arg249, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1661 = stablehlo.broadcast_in_dim %1660, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1662 = stablehlo.multiply %1661, %60 : tensor<197x1024xf32>
-    %1663 = stablehlo.broadcast_in_dim %1662, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1664 = stablehlo.broadcast_in_dim %arg250, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1665 = stablehlo.add %1663, %1664 : tensor<197x1024xf32>
-    %1666 = stablehlo.convert %1665 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1667 = stablehlo.reshape %1666 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1668 = stablehlo.reshape %1667 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1669 = stablehlo.transpose %1668, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1670 = stablehlo.reshape %1655 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1671 = stablehlo.transpose %1670, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1672 = stablehlo.transpose %1659, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %1673 = stablehlo.reshape %1671 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1674 = stablehlo.reshape %1672 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1675 = stablehlo.broadcast_in_dim %1674, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1676 = stablehlo.dot_general %1673, %1675, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %1677 = stablehlo.reshape %1676 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1678 = stablehlo.broadcast_in_dim %1677, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1679 = stablehlo.divide %1678, %92 : tensor<1x16x197x197xbf16>
-    %1680 = stablehlo.add %1679, %arg251 : tensor<1x16x197x197xbf16>
-    %1681 = stablehlo.convert %1680 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %1682 = stablehlo.reduce(%1681 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1683 = stablehlo.reshape %1682 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1684 = stablehlo.broadcast_in_dim %1681, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1685 = stablehlo.broadcast_in_dim %1683, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1686 = stablehlo.subtract %1684, %1685 : tensor<1x16x197x197xf32>
-    %1687 = stablehlo.exponential %1686 : tensor<1x16x197x197xf32>
-    %1688 = stablehlo.reduce(%1687 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1689 = stablehlo.reshape %1688 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1690 = stablehlo.broadcast_in_dim %1687, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1691 = stablehlo.broadcast_in_dim %1689, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1692 = stablehlo.divide %1690, %1691 : tensor<1x16x197x197xf32>
-    %1693 = stablehlo.convert %1692 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %1694 = stablehlo.reshape %1693 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %1695 = stablehlo.reshape %1669 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1696 = stablehlo.broadcast_in_dim %1695, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1697 = stablehlo.dot_general %1694, %1696, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1698 = stablehlo.reshape %1697 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1699 = stablehlo.transpose %1698, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %1700 = stablehlo.reshape %1699 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %1701 = stablehlo.reshape %1700 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1702 = stablehlo.convert %1701 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1703 = stablehlo.dot_general %1702, %arg252, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1704 = stablehlo.broadcast_in_dim %1703, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1705 = stablehlo.multiply %1704, %60 : tensor<197x1024xf32>
-    %1706 = stablehlo.broadcast_in_dim %1705, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1707 = stablehlo.broadcast_in_dim %arg253, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1708 = stablehlo.add %1706, %1707 : tensor<197x1024xf32>
-    %1709 = stablehlo.convert %1708 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1710 = stablehlo.reshape %1709 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1711 = stablehlo.broadcast_in_dim %arg53, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1712 = stablehlo.broadcast_in_dim %1710, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1713 = stablehlo.multiply %1711, %1712 : tensor<1x197x1024xbf16>
-    %1714 = stablehlo.add %1713, %1608 : tensor<1x197x1024xbf16>
-    %1715 = stablehlo.convert %1714 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1716 = stablehlo.convert %1715 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1717 = stablehlo.reduce(%1716 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1718 = stablehlo.reshape %1717 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1719 = stablehlo.broadcast_in_dim %1718, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1720 = stablehlo.divide %1719, %15 : tensor<1x197x1xf64>
-    %1721 = stablehlo.broadcast_in_dim %1716, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1722 = stablehlo.broadcast_in_dim %1720, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1723 = stablehlo.subtract %1721, %1722 : tensor<1x197x1024xf64>
-    %1724 = stablehlo.multiply %1723, %1723 : tensor<1x197x1024xf64>
-    %1725 = stablehlo.reduce(%1724 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1726 = stablehlo.reshape %1725 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1727 = stablehlo.broadcast_in_dim %1726, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1728 = stablehlo.divide %1727, %15 : tensor<1x197x1xf64>
-    %1729 = stablehlo.convert %1728 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1730 = stablehlo.reduce(%1715 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1731 = stablehlo.reshape %1730 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1732 = stablehlo.broadcast_in_dim %1731, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1733 = stablehlo.divide %1732, %31 : tensor<1x197x1xf32>
-    %1734 = stablehlo.broadcast_in_dim %1729, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1735 = stablehlo.add %1734, %36 : tensor<1x197x1xf32>
-    %1736 = stablehlo.rsqrt %1735 : tensor<1x197x1xf32>
-    %1737 = stablehlo.broadcast_in_dim %1715, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1738 = stablehlo.broadcast_in_dim %1733, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1739 = stablehlo.subtract %1737, %1738 : tensor<1x197x1024xf32>
-    %1740 = stablehlo.broadcast_in_dim %1739, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1741 = stablehlo.broadcast_in_dim %1736, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1742 = stablehlo.multiply %1740, %1741 : tensor<1x197x1024xf32>
-    %1743 = stablehlo.convert %arg54 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1744 = stablehlo.broadcast_in_dim %1742, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1745 = stablehlo.broadcast_in_dim %1743, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1746 = stablehlo.multiply %1744, %1745 : tensor<1x197x1024xf32>
-    %1747 = stablehlo.convert %arg55 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1748 = stablehlo.broadcast_in_dim %1746, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1749 = stablehlo.broadcast_in_dim %1747, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1750 = stablehlo.add %1748, %1749 : tensor<1x197x1024xf32>
-    %1751 = stablehlo.convert %1750 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1752 = stablehlo.reshape %1751 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1753 = stablehlo.convert %1752 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1754 = stablehlo.dot_general %1753, %arg254, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %1755 = stablehlo.broadcast_in_dim %1754, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1756 = stablehlo.multiply %1755, %170 : tensor<197x4096xf32>
-    %1757 = stablehlo.broadcast_in_dim %1756, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1758 = stablehlo.broadcast_in_dim %arg255, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %1759 = stablehlo.add %1757, %1758 : tensor<197x4096xf32>
-    %1760 = stablehlo.convert %1759 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %1761 = stablehlo.reshape %1760 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %1762 = stablehlo.multiply %1761, %cst_4 : tensor<1x197x4096xbf16>
-    %1763 = stablehlo.multiply %1761, %178 : tensor<1x197x4096xbf16>
-    %1764 = stablehlo.convert %1763 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %1765 = stablehlo.clamp %cst_5, %1764, %cst_6 : tensor<1x197x4096xf32>
-    %1766 = stablehlo.multiply %1765, %1765 : tensor<1x197x4096xf32>
-    %1767 = stablehlo.multiply %cst_7, %1766 : tensor<1x197x4096xf32>
-    %1768 = stablehlo.add %1767, %cst_8 : tensor<1x197x4096xf32>
-    %1769 = stablehlo.multiply %1768, %1766 : tensor<1x197x4096xf32>
-    %1770 = stablehlo.add %1769, %cst_9 : tensor<1x197x4096xf32>
-    %1771 = stablehlo.multiply %1770, %1766 : tensor<1x197x4096xf32>
-    %1772 = stablehlo.add %1771, %cst_10 : tensor<1x197x4096xf32>
-    %1773 = stablehlo.multiply %1772, %1766 : tensor<1x197x4096xf32>
-    %1774 = stablehlo.add %1773, %cst_11 : tensor<1x197x4096xf32>
-    %1775 = stablehlo.multiply %1774, %1766 : tensor<1x197x4096xf32>
-    %1776 = stablehlo.add %1775, %cst_12 : tensor<1x197x4096xf32>
-    %1777 = stablehlo.multiply %1776, %1766 : tensor<1x197x4096xf32>
-    %1778 = stablehlo.add %1777, %cst_13 : tensor<1x197x4096xf32>
-    %1779 = stablehlo.multiply %cst_14, %1766 : tensor<1x197x4096xf32>
-    %1780 = stablehlo.add %1779, %cst_15 : tensor<1x197x4096xf32>
-    %1781 = stablehlo.multiply %1780, %1766 : tensor<1x197x4096xf32>
-    %1782 = stablehlo.add %1781, %cst_16 : tensor<1x197x4096xf32>
-    %1783 = stablehlo.multiply %1782, %1766 : tensor<1x197x4096xf32>
-    %1784 = stablehlo.add %1783, %cst_17 : tensor<1x197x4096xf32>
-    %1785 = stablehlo.multiply %1784, %1766 : tensor<1x197x4096xf32>
-    %1786 = stablehlo.add %1785, %cst_18 : tensor<1x197x4096xf32>
-    %1787 = stablehlo.multiply %1765, %1778 : tensor<1x197x4096xf32>
-    %1788 = stablehlo.divide %1787, %1786 : tensor<1x197x4096xf32>
-    %1789 = stablehlo.clamp %cst_19, %1788, %cst_20 : tensor<1x197x4096xf32>
-    %1790 = stablehlo.convert %1789 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %1791 = stablehlo.add %1790, %cst_2 : tensor<1x197x4096xbf16>
-    %1792 = stablehlo.multiply %1791, %1762 : tensor<1x197x4096xbf16>
-    %1793 = stablehlo.reshape %1792 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %1794 = stablehlo.convert %1793 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %1795 = stablehlo.dot_general %1794, %arg256, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %1796 = stablehlo.broadcast_in_dim %1795, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1797 = stablehlo.multiply %1796, %60 : tensor<197x1024xf32>
-    %1798 = stablehlo.broadcast_in_dim %1797, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1799 = stablehlo.broadcast_in_dim %arg257, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1800 = stablehlo.add %1798, %1799 : tensor<197x1024xf32>
-    %1801 = stablehlo.convert %1800 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1802 = stablehlo.reshape %1801 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1803 = stablehlo.broadcast_in_dim %arg56, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1804 = stablehlo.broadcast_in_dim %1802, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1805 = stablehlo.multiply %1803, %1804 : tensor<1x197x1024xbf16>
-    %1806 = stablehlo.add %1805, %1714 : tensor<1x197x1024xbf16>
-    %1807 = stablehlo.convert %1806 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1808 = stablehlo.convert %1807 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1809 = stablehlo.reduce(%1808 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1810 = stablehlo.reshape %1809 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1811 = stablehlo.broadcast_in_dim %1810, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1812 = stablehlo.divide %1811, %15 : tensor<1x197x1xf64>
-    %1813 = stablehlo.broadcast_in_dim %1808, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1814 = stablehlo.broadcast_in_dim %1812, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1815 = stablehlo.subtract %1813, %1814 : tensor<1x197x1024xf64>
-    %1816 = stablehlo.multiply %1815, %1815 : tensor<1x197x1024xf64>
-    %1817 = stablehlo.reduce(%1816 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1818 = stablehlo.reshape %1817 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1819 = stablehlo.broadcast_in_dim %1818, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1820 = stablehlo.divide %1819, %15 : tensor<1x197x1xf64>
-    %1821 = stablehlo.convert %1820 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1822 = stablehlo.reduce(%1807 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1823 = stablehlo.reshape %1822 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1824 = stablehlo.broadcast_in_dim %1823, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1825 = stablehlo.divide %1824, %31 : tensor<1x197x1xf32>
-    %1826 = stablehlo.broadcast_in_dim %1821, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1827 = stablehlo.add %1826, %36 : tensor<1x197x1xf32>
-    %1828 = stablehlo.rsqrt %1827 : tensor<1x197x1xf32>
-    %1829 = stablehlo.broadcast_in_dim %1807, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1830 = stablehlo.broadcast_in_dim %1825, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1831 = stablehlo.subtract %1829, %1830 : tensor<1x197x1024xf32>
-    %1832 = stablehlo.broadcast_in_dim %1831, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1833 = stablehlo.broadcast_in_dim %1828, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1834 = stablehlo.multiply %1832, %1833 : tensor<1x197x1024xf32>
-    %1835 = stablehlo.convert %arg57 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1836 = stablehlo.broadcast_in_dim %1834, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1837 = stablehlo.broadcast_in_dim %1835, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1838 = stablehlo.multiply %1836, %1837 : tensor<1x197x1024xf32>
-    %1839 = stablehlo.convert %arg58 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1840 = stablehlo.broadcast_in_dim %1838, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1841 = stablehlo.broadcast_in_dim %1839, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1842 = stablehlo.add %1840, %1841 : tensor<1x197x1024xf32>
-    %1843 = stablehlo.convert %1842 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1844 = stablehlo.reshape %1843 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1845 = stablehlo.convert %1844 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1846 = stablehlo.dot_general %1845, %arg258, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1847 = stablehlo.broadcast_in_dim %1846, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1848 = stablehlo.multiply %1847, %60 : tensor<197x1024xf32>
-    %1849 = stablehlo.broadcast_in_dim %1848, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1850 = stablehlo.broadcast_in_dim %arg259, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1851 = stablehlo.add %1849, %1850 : tensor<197x1024xf32>
-    %1852 = stablehlo.convert %1851 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1853 = stablehlo.reshape %1852 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1854 = stablehlo.dot_general %1844, %arg260, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %1855 = stablehlo.reshape %1854 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1856 = stablehlo.reshape %1855 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1857 = stablehlo.transpose %1856, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1858 = stablehlo.dot_general %1845, %arg261, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1859 = stablehlo.broadcast_in_dim %1858, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1860 = stablehlo.multiply %1859, %60 : tensor<197x1024xf32>
-    %1861 = stablehlo.broadcast_in_dim %1860, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1862 = stablehlo.broadcast_in_dim %arg262, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1863 = stablehlo.add %1861, %1862 : tensor<197x1024xf32>
-    %1864 = stablehlo.convert %1863 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1865 = stablehlo.reshape %1864 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1866 = stablehlo.reshape %1865 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1867 = stablehlo.transpose %1866, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1868 = stablehlo.reshape %1853 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %1869 = stablehlo.transpose %1868, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1870 = stablehlo.transpose %1857, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %1871 = stablehlo.reshape %1869 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1872 = stablehlo.reshape %1870 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1873 = stablehlo.broadcast_in_dim %1872, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %1874 = stablehlo.dot_general %1871, %1873, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %1875 = stablehlo.reshape %1874 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1876 = stablehlo.broadcast_in_dim %1875, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %1877 = stablehlo.divide %1876, %92 : tensor<1x16x197x197xbf16>
-    %1878 = stablehlo.add %1877, %arg263 : tensor<1x16x197x197xbf16>
-    %1879 = stablehlo.convert %1878 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %1880 = stablehlo.reduce(%1879 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1881 = stablehlo.reshape %1880 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1882 = stablehlo.broadcast_in_dim %1879, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1883 = stablehlo.broadcast_in_dim %1881, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1884 = stablehlo.subtract %1882, %1883 : tensor<1x16x197x197xf32>
-    %1885 = stablehlo.exponential %1884 : tensor<1x16x197x197xf32>
-    %1886 = stablehlo.reduce(%1885 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %1887 = stablehlo.reshape %1886 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %1888 = stablehlo.broadcast_in_dim %1885, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %1889 = stablehlo.broadcast_in_dim %1887, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %1890 = stablehlo.divide %1888, %1889 : tensor<1x16x197x197xf32>
-    %1891 = stablehlo.convert %1890 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %1892 = stablehlo.reshape %1891 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %1893 = stablehlo.reshape %1867 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1894 = stablehlo.broadcast_in_dim %1893, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1895 = stablehlo.dot_general %1892, %1894, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %1896 = stablehlo.reshape %1895 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %1897 = stablehlo.transpose %1896, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %1898 = stablehlo.reshape %1897 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %1899 = stablehlo.reshape %1898 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1900 = stablehlo.convert %1899 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1901 = stablehlo.dot_general %1900, %arg264, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %1902 = stablehlo.broadcast_in_dim %1901, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1903 = stablehlo.multiply %1902, %60 : tensor<197x1024xf32>
-    %1904 = stablehlo.broadcast_in_dim %1903, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1905 = stablehlo.broadcast_in_dim %arg265, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1906 = stablehlo.add %1904, %1905 : tensor<197x1024xf32>
-    %1907 = stablehlo.convert %1906 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %1908 = stablehlo.reshape %1907 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1909 = stablehlo.broadcast_in_dim %arg59, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1910 = stablehlo.broadcast_in_dim %1908, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %1911 = stablehlo.multiply %1909, %1910 : tensor<1x197x1024xbf16>
-    %1912 = stablehlo.add %1911, %1806 : tensor<1x197x1024xbf16>
-    %1913 = stablehlo.convert %1912 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %1914 = stablehlo.convert %1913 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %1915 = stablehlo.reduce(%1914 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1916 = stablehlo.reshape %1915 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1917 = stablehlo.broadcast_in_dim %1916, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1918 = stablehlo.divide %1917, %15 : tensor<1x197x1xf64>
-    %1919 = stablehlo.broadcast_in_dim %1914, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %1920 = stablehlo.broadcast_in_dim %1918, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %1921 = stablehlo.subtract %1919, %1920 : tensor<1x197x1024xf64>
-    %1922 = stablehlo.multiply %1921, %1921 : tensor<1x197x1024xf64>
-    %1923 = stablehlo.reduce(%1922 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %1924 = stablehlo.reshape %1923 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %1925 = stablehlo.broadcast_in_dim %1924, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %1926 = stablehlo.divide %1925, %15 : tensor<1x197x1xf64>
-    %1927 = stablehlo.convert %1926 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %1928 = stablehlo.reduce(%1913 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %1929 = stablehlo.reshape %1928 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %1930 = stablehlo.broadcast_in_dim %1929, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1931 = stablehlo.divide %1930, %31 : tensor<1x197x1xf32>
-    %1932 = stablehlo.broadcast_in_dim %1927, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %1933 = stablehlo.add %1932, %36 : tensor<1x197x1xf32>
-    %1934 = stablehlo.rsqrt %1933 : tensor<1x197x1xf32>
-    %1935 = stablehlo.broadcast_in_dim %1913, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1936 = stablehlo.broadcast_in_dim %1931, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1937 = stablehlo.subtract %1935, %1936 : tensor<1x197x1024xf32>
-    %1938 = stablehlo.broadcast_in_dim %1937, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1939 = stablehlo.broadcast_in_dim %1934, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %1940 = stablehlo.multiply %1938, %1939 : tensor<1x197x1024xf32>
-    %1941 = stablehlo.convert %arg60 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1942 = stablehlo.broadcast_in_dim %1940, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1943 = stablehlo.broadcast_in_dim %1941, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1944 = stablehlo.multiply %1942, %1943 : tensor<1x197x1024xf32>
-    %1945 = stablehlo.convert %arg61 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %1946 = stablehlo.broadcast_in_dim %1944, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %1947 = stablehlo.broadcast_in_dim %1945, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %1948 = stablehlo.add %1946, %1947 : tensor<1x197x1024xf32>
-    %1949 = stablehlo.convert %1948 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %1950 = stablehlo.reshape %1949 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %1951 = stablehlo.convert %1950 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %1952 = stablehlo.dot_general %1951, %arg266, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %1953 = stablehlo.broadcast_in_dim %1952, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1954 = stablehlo.multiply %1953, %170 : tensor<197x4096xf32>
-    %1955 = stablehlo.broadcast_in_dim %1954, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %1956 = stablehlo.broadcast_in_dim %arg267, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %1957 = stablehlo.add %1955, %1956 : tensor<197x4096xf32>
-    %1958 = stablehlo.convert %1957 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %1959 = stablehlo.reshape %1958 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %1960 = stablehlo.multiply %1959, %cst_4 : tensor<1x197x4096xbf16>
-    %1961 = stablehlo.multiply %1959, %178 : tensor<1x197x4096xbf16>
-    %1962 = stablehlo.convert %1961 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %1963 = stablehlo.clamp %cst_5, %1962, %cst_6 : tensor<1x197x4096xf32>
-    %1964 = stablehlo.multiply %1963, %1963 : tensor<1x197x4096xf32>
-    %1965 = stablehlo.multiply %cst_7, %1964 : tensor<1x197x4096xf32>
-    %1966 = stablehlo.add %1965, %cst_8 : tensor<1x197x4096xf32>
-    %1967 = stablehlo.multiply %1966, %1964 : tensor<1x197x4096xf32>
-    %1968 = stablehlo.add %1967, %cst_9 : tensor<1x197x4096xf32>
-    %1969 = stablehlo.multiply %1968, %1964 : tensor<1x197x4096xf32>
-    %1970 = stablehlo.add %1969, %cst_10 : tensor<1x197x4096xf32>
-    %1971 = stablehlo.multiply %1970, %1964 : tensor<1x197x4096xf32>
-    %1972 = stablehlo.add %1971, %cst_11 : tensor<1x197x4096xf32>
-    %1973 = stablehlo.multiply %1972, %1964 : tensor<1x197x4096xf32>
-    %1974 = stablehlo.add %1973, %cst_12 : tensor<1x197x4096xf32>
-    %1975 = stablehlo.multiply %1974, %1964 : tensor<1x197x4096xf32>
-    %1976 = stablehlo.add %1975, %cst_13 : tensor<1x197x4096xf32>
-    %1977 = stablehlo.multiply %cst_14, %1964 : tensor<1x197x4096xf32>
-    %1978 = stablehlo.add %1977, %cst_15 : tensor<1x197x4096xf32>
-    %1979 = stablehlo.multiply %1978, %1964 : tensor<1x197x4096xf32>
-    %1980 = stablehlo.add %1979, %cst_16 : tensor<1x197x4096xf32>
-    %1981 = stablehlo.multiply %1980, %1964 : tensor<1x197x4096xf32>
-    %1982 = stablehlo.add %1981, %cst_17 : tensor<1x197x4096xf32>
-    %1983 = stablehlo.multiply %1982, %1964 : tensor<1x197x4096xf32>
-    %1984 = stablehlo.add %1983, %cst_18 : tensor<1x197x4096xf32>
-    %1985 = stablehlo.multiply %1963, %1976 : tensor<1x197x4096xf32>
-    %1986 = stablehlo.divide %1985, %1984 : tensor<1x197x4096xf32>
-    %1987 = stablehlo.clamp %cst_19, %1986, %cst_20 : tensor<1x197x4096xf32>
-    %1988 = stablehlo.convert %1987 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %1989 = stablehlo.add %1988, %cst_2 : tensor<1x197x4096xbf16>
-    %1990 = stablehlo.multiply %1989, %1960 : tensor<1x197x4096xbf16>
-    %1991 = stablehlo.reshape %1990 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %1992 = stablehlo.convert %1991 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %1993 = stablehlo.dot_general %1992, %arg268, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %1994 = stablehlo.broadcast_in_dim %1993, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1995 = stablehlo.multiply %1994, %60 : tensor<197x1024xf32>
-    %1996 = stablehlo.broadcast_in_dim %1995, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %1997 = stablehlo.broadcast_in_dim %arg269, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %1998 = stablehlo.add %1996, %1997 : tensor<197x1024xf32>
-    %1999 = stablehlo.convert %1998 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2000 = stablehlo.reshape %1999 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2001 = stablehlo.broadcast_in_dim %arg62, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2002 = stablehlo.broadcast_in_dim %2000, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2003 = stablehlo.multiply %2001, %2002 : tensor<1x197x1024xbf16>
-    %2004 = stablehlo.add %2003, %1912 : tensor<1x197x1024xbf16>
-    %2005 = stablehlo.convert %2004 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2006 = stablehlo.convert %2005 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2007 = stablehlo.reduce(%2006 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2008 = stablehlo.reshape %2007 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2009 = stablehlo.broadcast_in_dim %2008, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2010 = stablehlo.divide %2009, %15 : tensor<1x197x1xf64>
-    %2011 = stablehlo.broadcast_in_dim %2006, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2012 = stablehlo.broadcast_in_dim %2010, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2013 = stablehlo.subtract %2011, %2012 : tensor<1x197x1024xf64>
-    %2014 = stablehlo.multiply %2013, %2013 : tensor<1x197x1024xf64>
-    %2015 = stablehlo.reduce(%2014 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2016 = stablehlo.reshape %2015 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2017 = stablehlo.broadcast_in_dim %2016, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2018 = stablehlo.divide %2017, %15 : tensor<1x197x1xf64>
-    %2019 = stablehlo.convert %2018 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2020 = stablehlo.reduce(%2005 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2021 = stablehlo.reshape %2020 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2022 = stablehlo.broadcast_in_dim %2021, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2023 = stablehlo.divide %2022, %31 : tensor<1x197x1xf32>
-    %2024 = stablehlo.broadcast_in_dim %2019, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2025 = stablehlo.add %2024, %36 : tensor<1x197x1xf32>
-    %2026 = stablehlo.rsqrt %2025 : tensor<1x197x1xf32>
-    %2027 = stablehlo.broadcast_in_dim %2005, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2028 = stablehlo.broadcast_in_dim %2023, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2029 = stablehlo.subtract %2027, %2028 : tensor<1x197x1024xf32>
-    %2030 = stablehlo.broadcast_in_dim %2029, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2031 = stablehlo.broadcast_in_dim %2026, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2032 = stablehlo.multiply %2030, %2031 : tensor<1x197x1024xf32>
-    %2033 = stablehlo.convert %arg63 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2034 = stablehlo.broadcast_in_dim %2032, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2035 = stablehlo.broadcast_in_dim %2033, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2036 = stablehlo.multiply %2034, %2035 : tensor<1x197x1024xf32>
-    %2037 = stablehlo.convert %arg64 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2038 = stablehlo.broadcast_in_dim %2036, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2039 = stablehlo.broadcast_in_dim %2037, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2040 = stablehlo.add %2038, %2039 : tensor<1x197x1024xf32>
-    %2041 = stablehlo.convert %2040 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2042 = stablehlo.reshape %2041 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2043 = stablehlo.convert %2042 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2044 = stablehlo.dot_general %2043, %arg270, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2045 = stablehlo.broadcast_in_dim %2044, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2046 = stablehlo.multiply %2045, %60 : tensor<197x1024xf32>
-    %2047 = stablehlo.broadcast_in_dim %2046, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2048 = stablehlo.broadcast_in_dim %arg271, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2049 = stablehlo.add %2047, %2048 : tensor<197x1024xf32>
-    %2050 = stablehlo.convert %2049 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2051 = stablehlo.reshape %2050 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2052 = stablehlo.dot_general %2042, %arg272, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %2053 = stablehlo.reshape %2052 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2054 = stablehlo.reshape %2053 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2055 = stablehlo.transpose %2054, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2056 = stablehlo.dot_general %2043, %arg273, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2057 = stablehlo.broadcast_in_dim %2056, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2058 = stablehlo.multiply %2057, %60 : tensor<197x1024xf32>
-    %2059 = stablehlo.broadcast_in_dim %2058, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2060 = stablehlo.broadcast_in_dim %arg274, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2061 = stablehlo.add %2059, %2060 : tensor<197x1024xf32>
-    %2062 = stablehlo.convert %2061 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2063 = stablehlo.reshape %2062 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2064 = stablehlo.reshape %2063 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2065 = stablehlo.transpose %2064, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2066 = stablehlo.reshape %2051 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2067 = stablehlo.transpose %2066, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2068 = stablehlo.transpose %2055, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %2069 = stablehlo.reshape %2067 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2070 = stablehlo.reshape %2068 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2071 = stablehlo.broadcast_in_dim %2070, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2072 = stablehlo.dot_general %2069, %2071, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %2073 = stablehlo.reshape %2072 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2074 = stablehlo.broadcast_in_dim %2073, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2075 = stablehlo.divide %2074, %92 : tensor<1x16x197x197xbf16>
-    %2076 = stablehlo.add %2075, %arg275 : tensor<1x16x197x197xbf16>
-    %2077 = stablehlo.convert %2076 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %2078 = stablehlo.reduce(%2077 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2079 = stablehlo.reshape %2078 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2080 = stablehlo.broadcast_in_dim %2077, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2081 = stablehlo.broadcast_in_dim %2079, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2082 = stablehlo.subtract %2080, %2081 : tensor<1x16x197x197xf32>
-    %2083 = stablehlo.exponential %2082 : tensor<1x16x197x197xf32>
-    %2084 = stablehlo.reduce(%2083 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2085 = stablehlo.reshape %2084 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2086 = stablehlo.broadcast_in_dim %2083, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2087 = stablehlo.broadcast_in_dim %2085, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2088 = stablehlo.divide %2086, %2087 : tensor<1x16x197x197xf32>
-    %2089 = stablehlo.convert %2088 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %2090 = stablehlo.reshape %2089 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %2091 = stablehlo.reshape %2065 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2092 = stablehlo.broadcast_in_dim %2091, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2093 = stablehlo.dot_general %2090, %2092, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2094 = stablehlo.reshape %2093 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2095 = stablehlo.transpose %2094, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %2096 = stablehlo.reshape %2095 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %2097 = stablehlo.reshape %2096 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2098 = stablehlo.convert %2097 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2099 = stablehlo.dot_general %2098, %arg276, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2100 = stablehlo.broadcast_in_dim %2099, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2101 = stablehlo.multiply %2100, %60 : tensor<197x1024xf32>
-    %2102 = stablehlo.broadcast_in_dim %2101, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2103 = stablehlo.broadcast_in_dim %arg277, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2104 = stablehlo.add %2102, %2103 : tensor<197x1024xf32>
-    %2105 = stablehlo.convert %2104 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2106 = stablehlo.reshape %2105 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2107 = stablehlo.broadcast_in_dim %arg65, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2108 = stablehlo.broadcast_in_dim %2106, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2109 = stablehlo.multiply %2107, %2108 : tensor<1x197x1024xbf16>
-    %2110 = stablehlo.add %2109, %2004 : tensor<1x197x1024xbf16>
-    %2111 = stablehlo.convert %2110 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2112 = stablehlo.convert %2111 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2113 = stablehlo.reduce(%2112 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2114 = stablehlo.reshape %2113 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2115 = stablehlo.broadcast_in_dim %2114, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2116 = stablehlo.divide %2115, %15 : tensor<1x197x1xf64>
-    %2117 = stablehlo.broadcast_in_dim %2112, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2118 = stablehlo.broadcast_in_dim %2116, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2119 = stablehlo.subtract %2117, %2118 : tensor<1x197x1024xf64>
-    %2120 = stablehlo.multiply %2119, %2119 : tensor<1x197x1024xf64>
-    %2121 = stablehlo.reduce(%2120 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2122 = stablehlo.reshape %2121 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2123 = stablehlo.broadcast_in_dim %2122, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2124 = stablehlo.divide %2123, %15 : tensor<1x197x1xf64>
-    %2125 = stablehlo.convert %2124 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2126 = stablehlo.reduce(%2111 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2127 = stablehlo.reshape %2126 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2128 = stablehlo.broadcast_in_dim %2127, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2129 = stablehlo.divide %2128, %31 : tensor<1x197x1xf32>
-    %2130 = stablehlo.broadcast_in_dim %2125, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2131 = stablehlo.add %2130, %36 : tensor<1x197x1xf32>
-    %2132 = stablehlo.rsqrt %2131 : tensor<1x197x1xf32>
-    %2133 = stablehlo.broadcast_in_dim %2111, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2134 = stablehlo.broadcast_in_dim %2129, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2135 = stablehlo.subtract %2133, %2134 : tensor<1x197x1024xf32>
-    %2136 = stablehlo.broadcast_in_dim %2135, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2137 = stablehlo.broadcast_in_dim %2132, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2138 = stablehlo.multiply %2136, %2137 : tensor<1x197x1024xf32>
-    %2139 = stablehlo.convert %arg66 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2140 = stablehlo.broadcast_in_dim %2138, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2141 = stablehlo.broadcast_in_dim %2139, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2142 = stablehlo.multiply %2140, %2141 : tensor<1x197x1024xf32>
-    %2143 = stablehlo.convert %arg67 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2144 = stablehlo.broadcast_in_dim %2142, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2145 = stablehlo.broadcast_in_dim %2143, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2146 = stablehlo.add %2144, %2145 : tensor<1x197x1024xf32>
-    %2147 = stablehlo.convert %2146 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2148 = stablehlo.reshape %2147 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2149 = stablehlo.convert %2148 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2150 = stablehlo.dot_general %2149, %arg278, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %2151 = stablehlo.broadcast_in_dim %2150, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2152 = stablehlo.multiply %2151, %170 : tensor<197x4096xf32>
-    %2153 = stablehlo.broadcast_in_dim %2152, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2154 = stablehlo.broadcast_in_dim %arg279, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %2155 = stablehlo.add %2153, %2154 : tensor<197x4096xf32>
-    %2156 = stablehlo.convert %2155 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %2157 = stablehlo.reshape %2156 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %2158 = stablehlo.multiply %2157, %cst_4 : tensor<1x197x4096xbf16>
-    %2159 = stablehlo.multiply %2157, %178 : tensor<1x197x4096xbf16>
-    %2160 = stablehlo.convert %2159 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %2161 = stablehlo.clamp %cst_5, %2160, %cst_6 : tensor<1x197x4096xf32>
-    %2162 = stablehlo.multiply %2161, %2161 : tensor<1x197x4096xf32>
-    %2163 = stablehlo.multiply %cst_7, %2162 : tensor<1x197x4096xf32>
-    %2164 = stablehlo.add %2163, %cst_8 : tensor<1x197x4096xf32>
-    %2165 = stablehlo.multiply %2164, %2162 : tensor<1x197x4096xf32>
-    %2166 = stablehlo.add %2165, %cst_9 : tensor<1x197x4096xf32>
-    %2167 = stablehlo.multiply %2166, %2162 : tensor<1x197x4096xf32>
-    %2168 = stablehlo.add %2167, %cst_10 : tensor<1x197x4096xf32>
-    %2169 = stablehlo.multiply %2168, %2162 : tensor<1x197x4096xf32>
-    %2170 = stablehlo.add %2169, %cst_11 : tensor<1x197x4096xf32>
-    %2171 = stablehlo.multiply %2170, %2162 : tensor<1x197x4096xf32>
-    %2172 = stablehlo.add %2171, %cst_12 : tensor<1x197x4096xf32>
-    %2173 = stablehlo.multiply %2172, %2162 : tensor<1x197x4096xf32>
-    %2174 = stablehlo.add %2173, %cst_13 : tensor<1x197x4096xf32>
-    %2175 = stablehlo.multiply %cst_14, %2162 : tensor<1x197x4096xf32>
-    %2176 = stablehlo.add %2175, %cst_15 : tensor<1x197x4096xf32>
-    %2177 = stablehlo.multiply %2176, %2162 : tensor<1x197x4096xf32>
-    %2178 = stablehlo.add %2177, %cst_16 : tensor<1x197x4096xf32>
-    %2179 = stablehlo.multiply %2178, %2162 : tensor<1x197x4096xf32>
-    %2180 = stablehlo.add %2179, %cst_17 : tensor<1x197x4096xf32>
-    %2181 = stablehlo.multiply %2180, %2162 : tensor<1x197x4096xf32>
-    %2182 = stablehlo.add %2181, %cst_18 : tensor<1x197x4096xf32>
-    %2183 = stablehlo.multiply %2161, %2174 : tensor<1x197x4096xf32>
-    %2184 = stablehlo.divide %2183, %2182 : tensor<1x197x4096xf32>
-    %2185 = stablehlo.clamp %cst_19, %2184, %cst_20 : tensor<1x197x4096xf32>
-    %2186 = stablehlo.convert %2185 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %2187 = stablehlo.add %2186, %cst_2 : tensor<1x197x4096xbf16>
-    %2188 = stablehlo.multiply %2187, %2158 : tensor<1x197x4096xbf16>
-    %2189 = stablehlo.reshape %2188 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %2190 = stablehlo.convert %2189 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %2191 = stablehlo.dot_general %2190, %arg280, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %2192 = stablehlo.broadcast_in_dim %2191, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2193 = stablehlo.multiply %2192, %60 : tensor<197x1024xf32>
-    %2194 = stablehlo.broadcast_in_dim %2193, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2195 = stablehlo.broadcast_in_dim %arg281, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2196 = stablehlo.add %2194, %2195 : tensor<197x1024xf32>
-    %2197 = stablehlo.convert %2196 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2198 = stablehlo.reshape %2197 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2199 = stablehlo.broadcast_in_dim %arg68, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2200 = stablehlo.broadcast_in_dim %2198, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2201 = stablehlo.multiply %2199, %2200 : tensor<1x197x1024xbf16>
-    %2202 = stablehlo.add %2201, %2110 : tensor<1x197x1024xbf16>
-    %2203 = stablehlo.convert %2202 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2204 = stablehlo.convert %2203 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2205 = stablehlo.reduce(%2204 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2206 = stablehlo.reshape %2205 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2207 = stablehlo.broadcast_in_dim %2206, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2208 = stablehlo.divide %2207, %15 : tensor<1x197x1xf64>
-    %2209 = stablehlo.broadcast_in_dim %2204, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2210 = stablehlo.broadcast_in_dim %2208, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2211 = stablehlo.subtract %2209, %2210 : tensor<1x197x1024xf64>
-    %2212 = stablehlo.multiply %2211, %2211 : tensor<1x197x1024xf64>
-    %2213 = stablehlo.reduce(%2212 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2214 = stablehlo.reshape %2213 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2215 = stablehlo.broadcast_in_dim %2214, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2216 = stablehlo.divide %2215, %15 : tensor<1x197x1xf64>
-    %2217 = stablehlo.convert %2216 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2218 = stablehlo.reduce(%2203 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2219 = stablehlo.reshape %2218 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2220 = stablehlo.broadcast_in_dim %2219, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2221 = stablehlo.divide %2220, %31 : tensor<1x197x1xf32>
-    %2222 = stablehlo.broadcast_in_dim %2217, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2223 = stablehlo.add %2222, %36 : tensor<1x197x1xf32>
-    %2224 = stablehlo.rsqrt %2223 : tensor<1x197x1xf32>
-    %2225 = stablehlo.broadcast_in_dim %2203, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2226 = stablehlo.broadcast_in_dim %2221, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2227 = stablehlo.subtract %2225, %2226 : tensor<1x197x1024xf32>
-    %2228 = stablehlo.broadcast_in_dim %2227, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2229 = stablehlo.broadcast_in_dim %2224, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2230 = stablehlo.multiply %2228, %2229 : tensor<1x197x1024xf32>
-    %2231 = stablehlo.convert %arg69 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2232 = stablehlo.broadcast_in_dim %2230, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2233 = stablehlo.broadcast_in_dim %2231, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2234 = stablehlo.multiply %2232, %2233 : tensor<1x197x1024xf32>
-    %2235 = stablehlo.convert %arg70 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2236 = stablehlo.broadcast_in_dim %2234, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2237 = stablehlo.broadcast_in_dim %2235, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2238 = stablehlo.add %2236, %2237 : tensor<1x197x1024xf32>
-    %2239 = stablehlo.convert %2238 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2240 = stablehlo.reshape %2239 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2241 = stablehlo.convert %2240 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2242 = stablehlo.dot_general %2241, %arg282, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2243 = stablehlo.broadcast_in_dim %2242, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2244 = stablehlo.multiply %2243, %60 : tensor<197x1024xf32>
-    %2245 = stablehlo.broadcast_in_dim %2244, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2246 = stablehlo.broadcast_in_dim %arg283, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2247 = stablehlo.add %2245, %2246 : tensor<197x1024xf32>
-    %2248 = stablehlo.convert %2247 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2249 = stablehlo.reshape %2248 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2250 = stablehlo.dot_general %2240, %arg284, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %2251 = stablehlo.reshape %2250 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2252 = stablehlo.reshape %2251 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2253 = stablehlo.transpose %2252, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2254 = stablehlo.dot_general %2241, %arg285, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2255 = stablehlo.broadcast_in_dim %2254, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2256 = stablehlo.multiply %2255, %60 : tensor<197x1024xf32>
-    %2257 = stablehlo.broadcast_in_dim %2256, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2258 = stablehlo.broadcast_in_dim %arg286, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2259 = stablehlo.add %2257, %2258 : tensor<197x1024xf32>
-    %2260 = stablehlo.convert %2259 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2261 = stablehlo.reshape %2260 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2262 = stablehlo.reshape %2261 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2263 = stablehlo.transpose %2262, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2264 = stablehlo.reshape %2249 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2265 = stablehlo.transpose %2264, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2266 = stablehlo.transpose %2253, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %2267 = stablehlo.reshape %2265 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2268 = stablehlo.reshape %2266 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2269 = stablehlo.broadcast_in_dim %2268, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2270 = stablehlo.dot_general %2267, %2269, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %2271 = stablehlo.reshape %2270 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2272 = stablehlo.broadcast_in_dim %2271, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2273 = stablehlo.divide %2272, %92 : tensor<1x16x197x197xbf16>
-    %2274 = stablehlo.add %2273, %arg287 : tensor<1x16x197x197xbf16>
-    %2275 = stablehlo.convert %2274 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %2276 = stablehlo.reduce(%2275 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2277 = stablehlo.reshape %2276 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2278 = stablehlo.broadcast_in_dim %2275, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2279 = stablehlo.broadcast_in_dim %2277, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2280 = stablehlo.subtract %2278, %2279 : tensor<1x16x197x197xf32>
-    %2281 = stablehlo.exponential %2280 : tensor<1x16x197x197xf32>
-    %2282 = stablehlo.reduce(%2281 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2283 = stablehlo.reshape %2282 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2284 = stablehlo.broadcast_in_dim %2281, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2285 = stablehlo.broadcast_in_dim %2283, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2286 = stablehlo.divide %2284, %2285 : tensor<1x16x197x197xf32>
-    %2287 = stablehlo.convert %2286 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %2288 = stablehlo.reshape %2287 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %2289 = stablehlo.reshape %2263 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2290 = stablehlo.broadcast_in_dim %2289, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2291 = stablehlo.dot_general %2288, %2290, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2292 = stablehlo.reshape %2291 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2293 = stablehlo.transpose %2292, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %2294 = stablehlo.reshape %2293 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %2295 = stablehlo.reshape %2294 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2296 = stablehlo.convert %2295 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2297 = stablehlo.dot_general %2296, %arg288, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2298 = stablehlo.broadcast_in_dim %2297, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2299 = stablehlo.multiply %2298, %60 : tensor<197x1024xf32>
-    %2300 = stablehlo.broadcast_in_dim %2299, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2301 = stablehlo.broadcast_in_dim %arg289, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2302 = stablehlo.add %2300, %2301 : tensor<197x1024xf32>
-    %2303 = stablehlo.convert %2302 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2304 = stablehlo.reshape %2303 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2305 = stablehlo.broadcast_in_dim %arg71, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2306 = stablehlo.broadcast_in_dim %2304, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2307 = stablehlo.multiply %2305, %2306 : tensor<1x197x1024xbf16>
-    %2308 = stablehlo.add %2307, %2202 : tensor<1x197x1024xbf16>
-    %2309 = stablehlo.convert %2308 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2310 = stablehlo.convert %2309 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2311 = stablehlo.reduce(%2310 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2312 = stablehlo.reshape %2311 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2313 = stablehlo.broadcast_in_dim %2312, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2314 = stablehlo.divide %2313, %15 : tensor<1x197x1xf64>
-    %2315 = stablehlo.broadcast_in_dim %2310, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2316 = stablehlo.broadcast_in_dim %2314, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2317 = stablehlo.subtract %2315, %2316 : tensor<1x197x1024xf64>
-    %2318 = stablehlo.multiply %2317, %2317 : tensor<1x197x1024xf64>
-    %2319 = stablehlo.reduce(%2318 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2320 = stablehlo.reshape %2319 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2321 = stablehlo.broadcast_in_dim %2320, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2322 = stablehlo.divide %2321, %15 : tensor<1x197x1xf64>
-    %2323 = stablehlo.convert %2322 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2324 = stablehlo.reduce(%2309 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2325 = stablehlo.reshape %2324 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2326 = stablehlo.broadcast_in_dim %2325, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2327 = stablehlo.divide %2326, %31 : tensor<1x197x1xf32>
-    %2328 = stablehlo.broadcast_in_dim %2323, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2329 = stablehlo.add %2328, %36 : tensor<1x197x1xf32>
-    %2330 = stablehlo.rsqrt %2329 : tensor<1x197x1xf32>
-    %2331 = stablehlo.broadcast_in_dim %2309, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2332 = stablehlo.broadcast_in_dim %2327, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2333 = stablehlo.subtract %2331, %2332 : tensor<1x197x1024xf32>
-    %2334 = stablehlo.broadcast_in_dim %2333, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2335 = stablehlo.broadcast_in_dim %2330, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2336 = stablehlo.multiply %2334, %2335 : tensor<1x197x1024xf32>
-    %2337 = stablehlo.convert %arg72 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2338 = stablehlo.broadcast_in_dim %2336, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2339 = stablehlo.broadcast_in_dim %2337, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2340 = stablehlo.multiply %2338, %2339 : tensor<1x197x1024xf32>
-    %2341 = stablehlo.convert %arg73 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2342 = stablehlo.broadcast_in_dim %2340, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2343 = stablehlo.broadcast_in_dim %2341, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2344 = stablehlo.add %2342, %2343 : tensor<1x197x1024xf32>
-    %2345 = stablehlo.convert %2344 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2346 = stablehlo.reshape %2345 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2347 = stablehlo.convert %2346 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2348 = stablehlo.dot_general %2347, %arg290, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %2349 = stablehlo.broadcast_in_dim %2348, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2350 = stablehlo.multiply %2349, %170 : tensor<197x4096xf32>
-    %2351 = stablehlo.broadcast_in_dim %2350, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2352 = stablehlo.broadcast_in_dim %arg291, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %2353 = stablehlo.add %2351, %2352 : tensor<197x4096xf32>
-    %2354 = stablehlo.convert %2353 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %2355 = stablehlo.reshape %2354 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %2356 = stablehlo.multiply %2355, %cst_4 : tensor<1x197x4096xbf16>
-    %2357 = stablehlo.multiply %2355, %178 : tensor<1x197x4096xbf16>
-    %2358 = stablehlo.convert %2357 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %2359 = stablehlo.clamp %cst_5, %2358, %cst_6 : tensor<1x197x4096xf32>
-    %2360 = stablehlo.multiply %2359, %2359 : tensor<1x197x4096xf32>
-    %2361 = stablehlo.multiply %cst_7, %2360 : tensor<1x197x4096xf32>
-    %2362 = stablehlo.add %2361, %cst_8 : tensor<1x197x4096xf32>
-    %2363 = stablehlo.multiply %2362, %2360 : tensor<1x197x4096xf32>
-    %2364 = stablehlo.add %2363, %cst_9 : tensor<1x197x4096xf32>
-    %2365 = stablehlo.multiply %2364, %2360 : tensor<1x197x4096xf32>
-    %2366 = stablehlo.add %2365, %cst_10 : tensor<1x197x4096xf32>
-    %2367 = stablehlo.multiply %2366, %2360 : tensor<1x197x4096xf32>
-    %2368 = stablehlo.add %2367, %cst_11 : tensor<1x197x4096xf32>
-    %2369 = stablehlo.multiply %2368, %2360 : tensor<1x197x4096xf32>
-    %2370 = stablehlo.add %2369, %cst_12 : tensor<1x197x4096xf32>
-    %2371 = stablehlo.multiply %2370, %2360 : tensor<1x197x4096xf32>
-    %2372 = stablehlo.add %2371, %cst_13 : tensor<1x197x4096xf32>
-    %2373 = stablehlo.multiply %cst_14, %2360 : tensor<1x197x4096xf32>
-    %2374 = stablehlo.add %2373, %cst_15 : tensor<1x197x4096xf32>
-    %2375 = stablehlo.multiply %2374, %2360 : tensor<1x197x4096xf32>
-    %2376 = stablehlo.add %2375, %cst_16 : tensor<1x197x4096xf32>
-    %2377 = stablehlo.multiply %2376, %2360 : tensor<1x197x4096xf32>
-    %2378 = stablehlo.add %2377, %cst_17 : tensor<1x197x4096xf32>
-    %2379 = stablehlo.multiply %2378, %2360 : tensor<1x197x4096xf32>
-    %2380 = stablehlo.add %2379, %cst_18 : tensor<1x197x4096xf32>
-    %2381 = stablehlo.multiply %2359, %2372 : tensor<1x197x4096xf32>
-    %2382 = stablehlo.divide %2381, %2380 : tensor<1x197x4096xf32>
-    %2383 = stablehlo.clamp %cst_19, %2382, %cst_20 : tensor<1x197x4096xf32>
-    %2384 = stablehlo.convert %2383 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %2385 = stablehlo.add %2384, %cst_2 : tensor<1x197x4096xbf16>
-    %2386 = stablehlo.multiply %2385, %2356 : tensor<1x197x4096xbf16>
-    %2387 = stablehlo.reshape %2386 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %2388 = stablehlo.convert %2387 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %2389 = stablehlo.dot_general %2388, %arg292, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %2390 = stablehlo.broadcast_in_dim %2389, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2391 = stablehlo.multiply %2390, %60 : tensor<197x1024xf32>
-    %2392 = stablehlo.broadcast_in_dim %2391, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2393 = stablehlo.broadcast_in_dim %arg293, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2394 = stablehlo.add %2392, %2393 : tensor<197x1024xf32>
-    %2395 = stablehlo.convert %2394 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2396 = stablehlo.reshape %2395 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2397 = stablehlo.broadcast_in_dim %arg74, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2398 = stablehlo.broadcast_in_dim %2396, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2399 = stablehlo.multiply %2397, %2398 : tensor<1x197x1024xbf16>
-    %2400 = stablehlo.add %2399, %2308 : tensor<1x197x1024xbf16>
-    %2401 = stablehlo.convert %2400 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2402 = stablehlo.convert %2401 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2403 = stablehlo.reduce(%2402 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2404 = stablehlo.reshape %2403 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2405 = stablehlo.broadcast_in_dim %2404, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2406 = stablehlo.divide %2405, %15 : tensor<1x197x1xf64>
-    %2407 = stablehlo.broadcast_in_dim %2402, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2408 = stablehlo.broadcast_in_dim %2406, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2409 = stablehlo.subtract %2407, %2408 : tensor<1x197x1024xf64>
-    %2410 = stablehlo.multiply %2409, %2409 : tensor<1x197x1024xf64>
-    %2411 = stablehlo.reduce(%2410 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2412 = stablehlo.reshape %2411 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2413 = stablehlo.broadcast_in_dim %2412, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2414 = stablehlo.divide %2413, %15 : tensor<1x197x1xf64>
-    %2415 = stablehlo.convert %2414 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2416 = stablehlo.reduce(%2401 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2417 = stablehlo.reshape %2416 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2418 = stablehlo.broadcast_in_dim %2417, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2419 = stablehlo.divide %2418, %31 : tensor<1x197x1xf32>
-    %2420 = stablehlo.broadcast_in_dim %2415, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2421 = stablehlo.add %2420, %36 : tensor<1x197x1xf32>
-    %2422 = stablehlo.rsqrt %2421 : tensor<1x197x1xf32>
-    %2423 = stablehlo.broadcast_in_dim %2401, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2424 = stablehlo.broadcast_in_dim %2419, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2425 = stablehlo.subtract %2423, %2424 : tensor<1x197x1024xf32>
-    %2426 = stablehlo.broadcast_in_dim %2425, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2427 = stablehlo.broadcast_in_dim %2422, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2428 = stablehlo.multiply %2426, %2427 : tensor<1x197x1024xf32>
-    %2429 = stablehlo.convert %arg75 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2430 = stablehlo.broadcast_in_dim %2428, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2431 = stablehlo.broadcast_in_dim %2429, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2432 = stablehlo.multiply %2430, %2431 : tensor<1x197x1024xf32>
-    %2433 = stablehlo.convert %arg76 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2434 = stablehlo.broadcast_in_dim %2432, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2435 = stablehlo.broadcast_in_dim %2433, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2436 = stablehlo.add %2434, %2435 : tensor<1x197x1024xf32>
-    %2437 = stablehlo.convert %2436 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2438 = stablehlo.reshape %2437 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2439 = stablehlo.convert %2438 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2440 = stablehlo.dot_general %2439, %arg294, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2441 = stablehlo.broadcast_in_dim %2440, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2442 = stablehlo.multiply %2441, %60 : tensor<197x1024xf32>
-    %2443 = stablehlo.broadcast_in_dim %2442, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2444 = stablehlo.broadcast_in_dim %arg295, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2445 = stablehlo.add %2443, %2444 : tensor<197x1024xf32>
-    %2446 = stablehlo.convert %2445 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2447 = stablehlo.reshape %2446 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2448 = stablehlo.dot_general %2438, %arg296, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %2449 = stablehlo.reshape %2448 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2450 = stablehlo.reshape %2449 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2451 = stablehlo.transpose %2450, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2452 = stablehlo.dot_general %2439, %arg297, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2453 = stablehlo.broadcast_in_dim %2452, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2454 = stablehlo.multiply %2453, %60 : tensor<197x1024xf32>
-    %2455 = stablehlo.broadcast_in_dim %2454, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2456 = stablehlo.broadcast_in_dim %arg298, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2457 = stablehlo.add %2455, %2456 : tensor<197x1024xf32>
-    %2458 = stablehlo.convert %2457 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2459 = stablehlo.reshape %2458 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2460 = stablehlo.reshape %2459 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2461 = stablehlo.transpose %2460, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2462 = stablehlo.reshape %2447 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2463 = stablehlo.transpose %2462, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2464 = stablehlo.transpose %2451, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %2465 = stablehlo.reshape %2463 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2466 = stablehlo.reshape %2464 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2467 = stablehlo.broadcast_in_dim %2466, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2468 = stablehlo.dot_general %2465, %2467, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %2469 = stablehlo.reshape %2468 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2470 = stablehlo.broadcast_in_dim %2469, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2471 = stablehlo.divide %2470, %92 : tensor<1x16x197x197xbf16>
-    %2472 = stablehlo.add %2471, %arg299 : tensor<1x16x197x197xbf16>
-    %2473 = stablehlo.convert %2472 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %2474 = stablehlo.reduce(%2473 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2475 = stablehlo.reshape %2474 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2476 = stablehlo.broadcast_in_dim %2473, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2477 = stablehlo.broadcast_in_dim %2475, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2478 = stablehlo.subtract %2476, %2477 : tensor<1x16x197x197xf32>
-    %2479 = stablehlo.exponential %2478 : tensor<1x16x197x197xf32>
-    %2480 = stablehlo.reduce(%2479 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2481 = stablehlo.reshape %2480 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2482 = stablehlo.broadcast_in_dim %2479, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2483 = stablehlo.broadcast_in_dim %2481, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2484 = stablehlo.divide %2482, %2483 : tensor<1x16x197x197xf32>
-    %2485 = stablehlo.convert %2484 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %2486 = stablehlo.reshape %2485 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %2487 = stablehlo.reshape %2461 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2488 = stablehlo.broadcast_in_dim %2487, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2489 = stablehlo.dot_general %2486, %2488, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2490 = stablehlo.reshape %2489 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2491 = stablehlo.transpose %2490, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %2492 = stablehlo.reshape %2491 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %2493 = stablehlo.reshape %2492 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2494 = stablehlo.convert %2493 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2495 = stablehlo.dot_general %2494, %arg300, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2496 = stablehlo.broadcast_in_dim %2495, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2497 = stablehlo.multiply %2496, %60 : tensor<197x1024xf32>
-    %2498 = stablehlo.broadcast_in_dim %2497, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2499 = stablehlo.broadcast_in_dim %arg301, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2500 = stablehlo.add %2498, %2499 : tensor<197x1024xf32>
-    %2501 = stablehlo.convert %2500 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2502 = stablehlo.reshape %2501 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2503 = stablehlo.broadcast_in_dim %arg77, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2504 = stablehlo.broadcast_in_dim %2502, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2505 = stablehlo.multiply %2503, %2504 : tensor<1x197x1024xbf16>
-    %2506 = stablehlo.add %2505, %2400 : tensor<1x197x1024xbf16>
-    %2507 = stablehlo.convert %2506 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2508 = stablehlo.convert %2507 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2509 = stablehlo.reduce(%2508 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2510 = stablehlo.reshape %2509 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2511 = stablehlo.broadcast_in_dim %2510, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2512 = stablehlo.divide %2511, %15 : tensor<1x197x1xf64>
-    %2513 = stablehlo.broadcast_in_dim %2508, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2514 = stablehlo.broadcast_in_dim %2512, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2515 = stablehlo.subtract %2513, %2514 : tensor<1x197x1024xf64>
-    %2516 = stablehlo.multiply %2515, %2515 : tensor<1x197x1024xf64>
-    %2517 = stablehlo.reduce(%2516 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2518 = stablehlo.reshape %2517 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2519 = stablehlo.broadcast_in_dim %2518, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2520 = stablehlo.divide %2519, %15 : tensor<1x197x1xf64>
-    %2521 = stablehlo.convert %2520 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2522 = stablehlo.reduce(%2507 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2523 = stablehlo.reshape %2522 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2524 = stablehlo.broadcast_in_dim %2523, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2525 = stablehlo.divide %2524, %31 : tensor<1x197x1xf32>
-    %2526 = stablehlo.broadcast_in_dim %2521, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2527 = stablehlo.add %2526, %36 : tensor<1x197x1xf32>
-    %2528 = stablehlo.rsqrt %2527 : tensor<1x197x1xf32>
-    %2529 = stablehlo.broadcast_in_dim %2507, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2530 = stablehlo.broadcast_in_dim %2525, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2531 = stablehlo.subtract %2529, %2530 : tensor<1x197x1024xf32>
-    %2532 = stablehlo.broadcast_in_dim %2531, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2533 = stablehlo.broadcast_in_dim %2528, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2534 = stablehlo.multiply %2532, %2533 : tensor<1x197x1024xf32>
-    %2535 = stablehlo.convert %arg78 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2536 = stablehlo.broadcast_in_dim %2534, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2537 = stablehlo.broadcast_in_dim %2535, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2538 = stablehlo.multiply %2536, %2537 : tensor<1x197x1024xf32>
-    %2539 = stablehlo.convert %arg79 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2540 = stablehlo.broadcast_in_dim %2538, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2541 = stablehlo.broadcast_in_dim %2539, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2542 = stablehlo.add %2540, %2541 : tensor<1x197x1024xf32>
-    %2543 = stablehlo.convert %2542 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2544 = stablehlo.reshape %2543 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2545 = stablehlo.convert %2544 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2546 = stablehlo.dot_general %2545, %arg302, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %2547 = stablehlo.broadcast_in_dim %2546, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2548 = stablehlo.multiply %2547, %170 : tensor<197x4096xf32>
-    %2549 = stablehlo.broadcast_in_dim %2548, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2550 = stablehlo.broadcast_in_dim %arg303, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %2551 = stablehlo.add %2549, %2550 : tensor<197x4096xf32>
-    %2552 = stablehlo.convert %2551 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %2553 = stablehlo.reshape %2552 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %2554 = stablehlo.multiply %2553, %cst_4 : tensor<1x197x4096xbf16>
-    %2555 = stablehlo.multiply %2553, %178 : tensor<1x197x4096xbf16>
-    %2556 = stablehlo.convert %2555 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %2557 = stablehlo.clamp %cst_5, %2556, %cst_6 : tensor<1x197x4096xf32>
-    %2558 = stablehlo.multiply %2557, %2557 : tensor<1x197x4096xf32>
-    %2559 = stablehlo.multiply %cst_7, %2558 : tensor<1x197x4096xf32>
-    %2560 = stablehlo.add %2559, %cst_8 : tensor<1x197x4096xf32>
-    %2561 = stablehlo.multiply %2560, %2558 : tensor<1x197x4096xf32>
-    %2562 = stablehlo.add %2561, %cst_9 : tensor<1x197x4096xf32>
-    %2563 = stablehlo.multiply %2562, %2558 : tensor<1x197x4096xf32>
-    %2564 = stablehlo.add %2563, %cst_10 : tensor<1x197x4096xf32>
-    %2565 = stablehlo.multiply %2564, %2558 : tensor<1x197x4096xf32>
-    %2566 = stablehlo.add %2565, %cst_11 : tensor<1x197x4096xf32>
-    %2567 = stablehlo.multiply %2566, %2558 : tensor<1x197x4096xf32>
-    %2568 = stablehlo.add %2567, %cst_12 : tensor<1x197x4096xf32>
-    %2569 = stablehlo.multiply %2568, %2558 : tensor<1x197x4096xf32>
-    %2570 = stablehlo.add %2569, %cst_13 : tensor<1x197x4096xf32>
-    %2571 = stablehlo.multiply %cst_14, %2558 : tensor<1x197x4096xf32>
-    %2572 = stablehlo.add %2571, %cst_15 : tensor<1x197x4096xf32>
-    %2573 = stablehlo.multiply %2572, %2558 : tensor<1x197x4096xf32>
-    %2574 = stablehlo.add %2573, %cst_16 : tensor<1x197x4096xf32>
-    %2575 = stablehlo.multiply %2574, %2558 : tensor<1x197x4096xf32>
-    %2576 = stablehlo.add %2575, %cst_17 : tensor<1x197x4096xf32>
-    %2577 = stablehlo.multiply %2576, %2558 : tensor<1x197x4096xf32>
-    %2578 = stablehlo.add %2577, %cst_18 : tensor<1x197x4096xf32>
-    %2579 = stablehlo.multiply %2557, %2570 : tensor<1x197x4096xf32>
-    %2580 = stablehlo.divide %2579, %2578 : tensor<1x197x4096xf32>
-    %2581 = stablehlo.clamp %cst_19, %2580, %cst_20 : tensor<1x197x4096xf32>
-    %2582 = stablehlo.convert %2581 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %2583 = stablehlo.add %2582, %cst_2 : tensor<1x197x4096xbf16>
-    %2584 = stablehlo.multiply %2583, %2554 : tensor<1x197x4096xbf16>
-    %2585 = stablehlo.reshape %2584 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %2586 = stablehlo.convert %2585 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %2587 = stablehlo.dot_general %2586, %arg304, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %2588 = stablehlo.broadcast_in_dim %2587, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2589 = stablehlo.multiply %2588, %60 : tensor<197x1024xf32>
-    %2590 = stablehlo.broadcast_in_dim %2589, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2591 = stablehlo.broadcast_in_dim %arg305, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2592 = stablehlo.add %2590, %2591 : tensor<197x1024xf32>
-    %2593 = stablehlo.convert %2592 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2594 = stablehlo.reshape %2593 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2595 = stablehlo.broadcast_in_dim %arg80, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2596 = stablehlo.broadcast_in_dim %2594, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2597 = stablehlo.multiply %2595, %2596 : tensor<1x197x1024xbf16>
-    %2598 = stablehlo.add %2597, %2506 : tensor<1x197x1024xbf16>
-    %2599 = stablehlo.convert %2598 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2600 = stablehlo.convert %2599 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2601 = stablehlo.reduce(%2600 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2602 = stablehlo.reshape %2601 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2603 = stablehlo.broadcast_in_dim %2602, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2604 = stablehlo.divide %2603, %15 : tensor<1x197x1xf64>
-    %2605 = stablehlo.broadcast_in_dim %2600, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2606 = stablehlo.broadcast_in_dim %2604, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2607 = stablehlo.subtract %2605, %2606 : tensor<1x197x1024xf64>
-    %2608 = stablehlo.multiply %2607, %2607 : tensor<1x197x1024xf64>
-    %2609 = stablehlo.reduce(%2608 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2610 = stablehlo.reshape %2609 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2611 = stablehlo.broadcast_in_dim %2610, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2612 = stablehlo.divide %2611, %15 : tensor<1x197x1xf64>
-    %2613 = stablehlo.convert %2612 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2614 = stablehlo.reduce(%2599 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2615 = stablehlo.reshape %2614 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2616 = stablehlo.broadcast_in_dim %2615, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2617 = stablehlo.divide %2616, %31 : tensor<1x197x1xf32>
-    %2618 = stablehlo.broadcast_in_dim %2613, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2619 = stablehlo.add %2618, %36 : tensor<1x197x1xf32>
-    %2620 = stablehlo.rsqrt %2619 : tensor<1x197x1xf32>
-    %2621 = stablehlo.broadcast_in_dim %2599, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2622 = stablehlo.broadcast_in_dim %2617, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2623 = stablehlo.subtract %2621, %2622 : tensor<1x197x1024xf32>
-    %2624 = stablehlo.broadcast_in_dim %2623, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2625 = stablehlo.broadcast_in_dim %2620, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2626 = stablehlo.multiply %2624, %2625 : tensor<1x197x1024xf32>
-    %2627 = stablehlo.convert %arg81 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2628 = stablehlo.broadcast_in_dim %2626, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2629 = stablehlo.broadcast_in_dim %2627, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2630 = stablehlo.multiply %2628, %2629 : tensor<1x197x1024xf32>
-    %2631 = stablehlo.convert %arg82 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2632 = stablehlo.broadcast_in_dim %2630, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2633 = stablehlo.broadcast_in_dim %2631, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2634 = stablehlo.add %2632, %2633 : tensor<1x197x1024xf32>
-    %2635 = stablehlo.convert %2634 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2636 = stablehlo.reshape %2635 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2637 = stablehlo.convert %2636 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2638 = stablehlo.dot_general %2637, %arg306, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2639 = stablehlo.broadcast_in_dim %2638, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2640 = stablehlo.multiply %2639, %60 : tensor<197x1024xf32>
-    %2641 = stablehlo.broadcast_in_dim %2640, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2642 = stablehlo.broadcast_in_dim %arg307, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2643 = stablehlo.add %2641, %2642 : tensor<197x1024xf32>
-    %2644 = stablehlo.convert %2643 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2645 = stablehlo.reshape %2644 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2646 = stablehlo.dot_general %2636, %arg308, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %2647 = stablehlo.reshape %2646 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2648 = stablehlo.reshape %2647 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2649 = stablehlo.transpose %2648, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2650 = stablehlo.dot_general %2637, %arg309, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2651 = stablehlo.broadcast_in_dim %2650, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2652 = stablehlo.multiply %2651, %60 : tensor<197x1024xf32>
-    %2653 = stablehlo.broadcast_in_dim %2652, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2654 = stablehlo.broadcast_in_dim %arg310, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2655 = stablehlo.add %2653, %2654 : tensor<197x1024xf32>
-    %2656 = stablehlo.convert %2655 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2657 = stablehlo.reshape %2656 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2658 = stablehlo.reshape %2657 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2659 = stablehlo.transpose %2658, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2660 = stablehlo.reshape %2645 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2661 = stablehlo.transpose %2660, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2662 = stablehlo.transpose %2649, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %2663 = stablehlo.reshape %2661 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2664 = stablehlo.reshape %2662 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2665 = stablehlo.broadcast_in_dim %2664, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2666 = stablehlo.dot_general %2663, %2665, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %2667 = stablehlo.reshape %2666 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2668 = stablehlo.broadcast_in_dim %2667, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2669 = stablehlo.divide %2668, %92 : tensor<1x16x197x197xbf16>
-    %2670 = stablehlo.add %2669, %arg311 : tensor<1x16x197x197xbf16>
-    %2671 = stablehlo.convert %2670 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %2672 = stablehlo.reduce(%2671 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2673 = stablehlo.reshape %2672 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2674 = stablehlo.broadcast_in_dim %2671, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2675 = stablehlo.broadcast_in_dim %2673, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2676 = stablehlo.subtract %2674, %2675 : tensor<1x16x197x197xf32>
-    %2677 = stablehlo.exponential %2676 : tensor<1x16x197x197xf32>
-    %2678 = stablehlo.reduce(%2677 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2679 = stablehlo.reshape %2678 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2680 = stablehlo.broadcast_in_dim %2677, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2681 = stablehlo.broadcast_in_dim %2679, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2682 = stablehlo.divide %2680, %2681 : tensor<1x16x197x197xf32>
-    %2683 = stablehlo.convert %2682 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %2684 = stablehlo.reshape %2683 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %2685 = stablehlo.reshape %2659 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2686 = stablehlo.broadcast_in_dim %2685, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2687 = stablehlo.dot_general %2684, %2686, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2688 = stablehlo.reshape %2687 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2689 = stablehlo.transpose %2688, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %2690 = stablehlo.reshape %2689 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %2691 = stablehlo.reshape %2690 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2692 = stablehlo.convert %2691 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2693 = stablehlo.dot_general %2692, %arg312, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2694 = stablehlo.broadcast_in_dim %2693, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2695 = stablehlo.multiply %2694, %60 : tensor<197x1024xf32>
-    %2696 = stablehlo.broadcast_in_dim %2695, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2697 = stablehlo.broadcast_in_dim %arg313, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2698 = stablehlo.add %2696, %2697 : tensor<197x1024xf32>
-    %2699 = stablehlo.convert %2698 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2700 = stablehlo.reshape %2699 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2701 = stablehlo.broadcast_in_dim %arg83, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2702 = stablehlo.broadcast_in_dim %2700, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2703 = stablehlo.multiply %2701, %2702 : tensor<1x197x1024xbf16>
-    %2704 = stablehlo.add %2703, %2598 : tensor<1x197x1024xbf16>
-    %2705 = stablehlo.convert %2704 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2706 = stablehlo.convert %2705 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2707 = stablehlo.reduce(%2706 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2708 = stablehlo.reshape %2707 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2709 = stablehlo.broadcast_in_dim %2708, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2710 = stablehlo.divide %2709, %15 : tensor<1x197x1xf64>
-    %2711 = stablehlo.broadcast_in_dim %2706, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2712 = stablehlo.broadcast_in_dim %2710, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2713 = stablehlo.subtract %2711, %2712 : tensor<1x197x1024xf64>
-    %2714 = stablehlo.multiply %2713, %2713 : tensor<1x197x1024xf64>
-    %2715 = stablehlo.reduce(%2714 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2716 = stablehlo.reshape %2715 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2717 = stablehlo.broadcast_in_dim %2716, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2718 = stablehlo.divide %2717, %15 : tensor<1x197x1xf64>
-    %2719 = stablehlo.convert %2718 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2720 = stablehlo.reduce(%2705 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2721 = stablehlo.reshape %2720 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2722 = stablehlo.broadcast_in_dim %2721, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2723 = stablehlo.divide %2722, %31 : tensor<1x197x1xf32>
-    %2724 = stablehlo.broadcast_in_dim %2719, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2725 = stablehlo.add %2724, %36 : tensor<1x197x1xf32>
-    %2726 = stablehlo.rsqrt %2725 : tensor<1x197x1xf32>
-    %2727 = stablehlo.broadcast_in_dim %2705, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2728 = stablehlo.broadcast_in_dim %2723, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2729 = stablehlo.subtract %2727, %2728 : tensor<1x197x1024xf32>
-    %2730 = stablehlo.broadcast_in_dim %2729, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2731 = stablehlo.broadcast_in_dim %2726, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2732 = stablehlo.multiply %2730, %2731 : tensor<1x197x1024xf32>
-    %2733 = stablehlo.convert %arg84 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2734 = stablehlo.broadcast_in_dim %2732, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2735 = stablehlo.broadcast_in_dim %2733, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2736 = stablehlo.multiply %2734, %2735 : tensor<1x197x1024xf32>
-    %2737 = stablehlo.convert %arg85 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2738 = stablehlo.broadcast_in_dim %2736, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2739 = stablehlo.broadcast_in_dim %2737, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2740 = stablehlo.add %2738, %2739 : tensor<1x197x1024xf32>
-    %2741 = stablehlo.convert %2740 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2742 = stablehlo.reshape %2741 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2743 = stablehlo.convert %2742 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2744 = stablehlo.dot_general %2743, %arg314, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %2745 = stablehlo.broadcast_in_dim %2744, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2746 = stablehlo.multiply %2745, %170 : tensor<197x4096xf32>
-    %2747 = stablehlo.broadcast_in_dim %2746, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2748 = stablehlo.broadcast_in_dim %arg315, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %2749 = stablehlo.add %2747, %2748 : tensor<197x4096xf32>
-    %2750 = stablehlo.convert %2749 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %2751 = stablehlo.reshape %2750 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %2752 = stablehlo.multiply %2751, %cst_4 : tensor<1x197x4096xbf16>
-    %2753 = stablehlo.multiply %2751, %178 : tensor<1x197x4096xbf16>
-    %2754 = stablehlo.convert %2753 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %2755 = stablehlo.clamp %cst_5, %2754, %cst_6 : tensor<1x197x4096xf32>
-    %2756 = stablehlo.multiply %2755, %2755 : tensor<1x197x4096xf32>
-    %2757 = stablehlo.multiply %cst_7, %2756 : tensor<1x197x4096xf32>
-    %2758 = stablehlo.add %2757, %cst_8 : tensor<1x197x4096xf32>
-    %2759 = stablehlo.multiply %2758, %2756 : tensor<1x197x4096xf32>
-    %2760 = stablehlo.add %2759, %cst_9 : tensor<1x197x4096xf32>
-    %2761 = stablehlo.multiply %2760, %2756 : tensor<1x197x4096xf32>
-    %2762 = stablehlo.add %2761, %cst_10 : tensor<1x197x4096xf32>
-    %2763 = stablehlo.multiply %2762, %2756 : tensor<1x197x4096xf32>
-    %2764 = stablehlo.add %2763, %cst_11 : tensor<1x197x4096xf32>
-    %2765 = stablehlo.multiply %2764, %2756 : tensor<1x197x4096xf32>
-    %2766 = stablehlo.add %2765, %cst_12 : tensor<1x197x4096xf32>
-    %2767 = stablehlo.multiply %2766, %2756 : tensor<1x197x4096xf32>
-    %2768 = stablehlo.add %2767, %cst_13 : tensor<1x197x4096xf32>
-    %2769 = stablehlo.multiply %cst_14, %2756 : tensor<1x197x4096xf32>
-    %2770 = stablehlo.add %2769, %cst_15 : tensor<1x197x4096xf32>
-    %2771 = stablehlo.multiply %2770, %2756 : tensor<1x197x4096xf32>
-    %2772 = stablehlo.add %2771, %cst_16 : tensor<1x197x4096xf32>
-    %2773 = stablehlo.multiply %2772, %2756 : tensor<1x197x4096xf32>
-    %2774 = stablehlo.add %2773, %cst_17 : tensor<1x197x4096xf32>
-    %2775 = stablehlo.multiply %2774, %2756 : tensor<1x197x4096xf32>
-    %2776 = stablehlo.add %2775, %cst_18 : tensor<1x197x4096xf32>
-    %2777 = stablehlo.multiply %2755, %2768 : tensor<1x197x4096xf32>
-    %2778 = stablehlo.divide %2777, %2776 : tensor<1x197x4096xf32>
-    %2779 = stablehlo.clamp %cst_19, %2778, %cst_20 : tensor<1x197x4096xf32>
-    %2780 = stablehlo.convert %2779 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %2781 = stablehlo.add %2780, %cst_2 : tensor<1x197x4096xbf16>
-    %2782 = stablehlo.multiply %2781, %2752 : tensor<1x197x4096xbf16>
-    %2783 = stablehlo.reshape %2782 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %2784 = stablehlo.convert %2783 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %2785 = stablehlo.dot_general %2784, %arg316, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %2786 = stablehlo.broadcast_in_dim %2785, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2787 = stablehlo.multiply %2786, %60 : tensor<197x1024xf32>
-    %2788 = stablehlo.broadcast_in_dim %2787, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2789 = stablehlo.broadcast_in_dim %arg317, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2790 = stablehlo.add %2788, %2789 : tensor<197x1024xf32>
-    %2791 = stablehlo.convert %2790 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2792 = stablehlo.reshape %2791 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2793 = stablehlo.broadcast_in_dim %arg86, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2794 = stablehlo.broadcast_in_dim %2792, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2795 = stablehlo.multiply %2793, %2794 : tensor<1x197x1024xbf16>
-    %2796 = stablehlo.add %2795, %2704 : tensor<1x197x1024xbf16>
-    %2797 = stablehlo.convert %2796 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2798 = stablehlo.convert %2797 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2799 = stablehlo.reduce(%2798 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2800 = stablehlo.reshape %2799 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2801 = stablehlo.broadcast_in_dim %2800, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2802 = stablehlo.divide %2801, %15 : tensor<1x197x1xf64>
-    %2803 = stablehlo.broadcast_in_dim %2798, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2804 = stablehlo.broadcast_in_dim %2802, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2805 = stablehlo.subtract %2803, %2804 : tensor<1x197x1024xf64>
-    %2806 = stablehlo.multiply %2805, %2805 : tensor<1x197x1024xf64>
-    %2807 = stablehlo.reduce(%2806 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2808 = stablehlo.reshape %2807 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2809 = stablehlo.broadcast_in_dim %2808, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2810 = stablehlo.divide %2809, %15 : tensor<1x197x1xf64>
-    %2811 = stablehlo.convert %2810 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2812 = stablehlo.reduce(%2797 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2813 = stablehlo.reshape %2812 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2814 = stablehlo.broadcast_in_dim %2813, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2815 = stablehlo.divide %2814, %31 : tensor<1x197x1xf32>
-    %2816 = stablehlo.broadcast_in_dim %2811, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2817 = stablehlo.add %2816, %36 : tensor<1x197x1xf32>
-    %2818 = stablehlo.rsqrt %2817 : tensor<1x197x1xf32>
-    %2819 = stablehlo.broadcast_in_dim %2797, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2820 = stablehlo.broadcast_in_dim %2815, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2821 = stablehlo.subtract %2819, %2820 : tensor<1x197x1024xf32>
-    %2822 = stablehlo.broadcast_in_dim %2821, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2823 = stablehlo.broadcast_in_dim %2818, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2824 = stablehlo.multiply %2822, %2823 : tensor<1x197x1024xf32>
-    %2825 = stablehlo.convert %arg87 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2826 = stablehlo.broadcast_in_dim %2824, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2827 = stablehlo.broadcast_in_dim %2825, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2828 = stablehlo.multiply %2826, %2827 : tensor<1x197x1024xf32>
-    %2829 = stablehlo.convert %arg88 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2830 = stablehlo.broadcast_in_dim %2828, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2831 = stablehlo.broadcast_in_dim %2829, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2832 = stablehlo.add %2830, %2831 : tensor<1x197x1024xf32>
-    %2833 = stablehlo.convert %2832 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2834 = stablehlo.reshape %2833 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2835 = stablehlo.convert %2834 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2836 = stablehlo.dot_general %2835, %arg318, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2837 = stablehlo.broadcast_in_dim %2836, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2838 = stablehlo.multiply %2837, %60 : tensor<197x1024xf32>
-    %2839 = stablehlo.broadcast_in_dim %2838, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2840 = stablehlo.broadcast_in_dim %arg319, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2841 = stablehlo.add %2839, %2840 : tensor<197x1024xf32>
-    %2842 = stablehlo.convert %2841 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2843 = stablehlo.reshape %2842 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2844 = stablehlo.dot_general %2834, %arg320, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %2845 = stablehlo.reshape %2844 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2846 = stablehlo.reshape %2845 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2847 = stablehlo.transpose %2846, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2848 = stablehlo.dot_general %2835, %arg321, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2849 = stablehlo.broadcast_in_dim %2848, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2850 = stablehlo.multiply %2849, %60 : tensor<197x1024xf32>
-    %2851 = stablehlo.broadcast_in_dim %2850, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2852 = stablehlo.broadcast_in_dim %arg322, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2853 = stablehlo.add %2851, %2852 : tensor<197x1024xf32>
-    %2854 = stablehlo.convert %2853 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2855 = stablehlo.reshape %2854 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2856 = stablehlo.reshape %2855 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2857 = stablehlo.transpose %2856, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2858 = stablehlo.reshape %2843 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %2859 = stablehlo.transpose %2858, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2860 = stablehlo.transpose %2847, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %2861 = stablehlo.reshape %2859 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2862 = stablehlo.reshape %2860 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2863 = stablehlo.broadcast_in_dim %2862, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %2864 = stablehlo.dot_general %2861, %2863, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %2865 = stablehlo.reshape %2864 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2866 = stablehlo.broadcast_in_dim %2865, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %2867 = stablehlo.divide %2866, %92 : tensor<1x16x197x197xbf16>
-    %2868 = stablehlo.add %2867, %arg323 : tensor<1x16x197x197xbf16>
-    %2869 = stablehlo.convert %2868 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %2870 = stablehlo.reduce(%2869 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2871 = stablehlo.reshape %2870 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2872 = stablehlo.broadcast_in_dim %2869, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2873 = stablehlo.broadcast_in_dim %2871, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2874 = stablehlo.subtract %2872, %2873 : tensor<1x16x197x197xf32>
-    %2875 = stablehlo.exponential %2874 : tensor<1x16x197x197xf32>
-    %2876 = stablehlo.reduce(%2875 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %2877 = stablehlo.reshape %2876 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %2878 = stablehlo.broadcast_in_dim %2875, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %2879 = stablehlo.broadcast_in_dim %2877, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %2880 = stablehlo.divide %2878, %2879 : tensor<1x16x197x197xf32>
-    %2881 = stablehlo.convert %2880 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %2882 = stablehlo.reshape %2881 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %2883 = stablehlo.reshape %2857 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2884 = stablehlo.broadcast_in_dim %2883, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2885 = stablehlo.dot_general %2882, %2884, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %2886 = stablehlo.reshape %2885 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %2887 = stablehlo.transpose %2886, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %2888 = stablehlo.reshape %2887 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %2889 = stablehlo.reshape %2888 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2890 = stablehlo.convert %2889 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2891 = stablehlo.dot_general %2890, %arg324, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %2892 = stablehlo.broadcast_in_dim %2891, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2893 = stablehlo.multiply %2892, %60 : tensor<197x1024xf32>
-    %2894 = stablehlo.broadcast_in_dim %2893, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2895 = stablehlo.broadcast_in_dim %arg325, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2896 = stablehlo.add %2894, %2895 : tensor<197x1024xf32>
-    %2897 = stablehlo.convert %2896 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2898 = stablehlo.reshape %2897 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2899 = stablehlo.broadcast_in_dim %arg89, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2900 = stablehlo.broadcast_in_dim %2898, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2901 = stablehlo.multiply %2899, %2900 : tensor<1x197x1024xbf16>
-    %2902 = stablehlo.add %2901, %2796 : tensor<1x197x1024xbf16>
-    %2903 = stablehlo.convert %2902 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2904 = stablehlo.convert %2903 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2905 = stablehlo.reduce(%2904 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2906 = stablehlo.reshape %2905 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2907 = stablehlo.broadcast_in_dim %2906, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2908 = stablehlo.divide %2907, %15 : tensor<1x197x1xf64>
-    %2909 = stablehlo.broadcast_in_dim %2904, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %2910 = stablehlo.broadcast_in_dim %2908, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %2911 = stablehlo.subtract %2909, %2910 : tensor<1x197x1024xf64>
-    %2912 = stablehlo.multiply %2911, %2911 : tensor<1x197x1024xf64>
-    %2913 = stablehlo.reduce(%2912 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2914 = stablehlo.reshape %2913 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2915 = stablehlo.broadcast_in_dim %2914, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %2916 = stablehlo.divide %2915, %15 : tensor<1x197x1xf64>
-    %2917 = stablehlo.convert %2916 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %2918 = stablehlo.reduce(%2903 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %2919 = stablehlo.reshape %2918 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %2920 = stablehlo.broadcast_in_dim %2919, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2921 = stablehlo.divide %2920, %31 : tensor<1x197x1xf32>
-    %2922 = stablehlo.broadcast_in_dim %2917, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %2923 = stablehlo.add %2922, %36 : tensor<1x197x1xf32>
-    %2924 = stablehlo.rsqrt %2923 : tensor<1x197x1xf32>
-    %2925 = stablehlo.broadcast_in_dim %2903, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2926 = stablehlo.broadcast_in_dim %2921, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2927 = stablehlo.subtract %2925, %2926 : tensor<1x197x1024xf32>
-    %2928 = stablehlo.broadcast_in_dim %2927, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2929 = stablehlo.broadcast_in_dim %2924, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %2930 = stablehlo.multiply %2928, %2929 : tensor<1x197x1024xf32>
-    %2931 = stablehlo.convert %arg90 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2932 = stablehlo.broadcast_in_dim %2930, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2933 = stablehlo.broadcast_in_dim %2931, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2934 = stablehlo.multiply %2932, %2933 : tensor<1x197x1024xf32>
-    %2935 = stablehlo.convert %arg91 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %2936 = stablehlo.broadcast_in_dim %2934, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %2937 = stablehlo.broadcast_in_dim %2935, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %2938 = stablehlo.add %2936, %2937 : tensor<1x197x1024xf32>
-    %2939 = stablehlo.convert %2938 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %2940 = stablehlo.reshape %2939 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %2941 = stablehlo.convert %2940 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %2942 = stablehlo.dot_general %2941, %arg326, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %2943 = stablehlo.broadcast_in_dim %2942, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2944 = stablehlo.multiply %2943, %170 : tensor<197x4096xf32>
-    %2945 = stablehlo.broadcast_in_dim %2944, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %2946 = stablehlo.broadcast_in_dim %arg327, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %2947 = stablehlo.add %2945, %2946 : tensor<197x4096xf32>
-    %2948 = stablehlo.convert %2947 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %2949 = stablehlo.reshape %2948 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %2950 = stablehlo.multiply %2949, %cst_4 : tensor<1x197x4096xbf16>
-    %2951 = stablehlo.multiply %2949, %178 : tensor<1x197x4096xbf16>
-    %2952 = stablehlo.convert %2951 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %2953 = stablehlo.clamp %cst_5, %2952, %cst_6 : tensor<1x197x4096xf32>
-    %2954 = stablehlo.multiply %2953, %2953 : tensor<1x197x4096xf32>
-    %2955 = stablehlo.multiply %cst_7, %2954 : tensor<1x197x4096xf32>
-    %2956 = stablehlo.add %2955, %cst_8 : tensor<1x197x4096xf32>
-    %2957 = stablehlo.multiply %2956, %2954 : tensor<1x197x4096xf32>
-    %2958 = stablehlo.add %2957, %cst_9 : tensor<1x197x4096xf32>
-    %2959 = stablehlo.multiply %2958, %2954 : tensor<1x197x4096xf32>
-    %2960 = stablehlo.add %2959, %cst_10 : tensor<1x197x4096xf32>
-    %2961 = stablehlo.multiply %2960, %2954 : tensor<1x197x4096xf32>
-    %2962 = stablehlo.add %2961, %cst_11 : tensor<1x197x4096xf32>
-    %2963 = stablehlo.multiply %2962, %2954 : tensor<1x197x4096xf32>
-    %2964 = stablehlo.add %2963, %cst_12 : tensor<1x197x4096xf32>
-    %2965 = stablehlo.multiply %2964, %2954 : tensor<1x197x4096xf32>
-    %2966 = stablehlo.add %2965, %cst_13 : tensor<1x197x4096xf32>
-    %2967 = stablehlo.multiply %cst_14, %2954 : tensor<1x197x4096xf32>
-    %2968 = stablehlo.add %2967, %cst_15 : tensor<1x197x4096xf32>
-    %2969 = stablehlo.multiply %2968, %2954 : tensor<1x197x4096xf32>
-    %2970 = stablehlo.add %2969, %cst_16 : tensor<1x197x4096xf32>
-    %2971 = stablehlo.multiply %2970, %2954 : tensor<1x197x4096xf32>
-    %2972 = stablehlo.add %2971, %cst_17 : tensor<1x197x4096xf32>
-    %2973 = stablehlo.multiply %2972, %2954 : tensor<1x197x4096xf32>
-    %2974 = stablehlo.add %2973, %cst_18 : tensor<1x197x4096xf32>
-    %2975 = stablehlo.multiply %2953, %2966 : tensor<1x197x4096xf32>
-    %2976 = stablehlo.divide %2975, %2974 : tensor<1x197x4096xf32>
-    %2977 = stablehlo.clamp %cst_19, %2976, %cst_20 : tensor<1x197x4096xf32>
-    %2978 = stablehlo.convert %2977 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %2979 = stablehlo.add %2978, %cst_2 : tensor<1x197x4096xbf16>
-    %2980 = stablehlo.multiply %2979, %2950 : tensor<1x197x4096xbf16>
-    %2981 = stablehlo.reshape %2980 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %2982 = stablehlo.convert %2981 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %2983 = stablehlo.dot_general %2982, %arg328, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %2984 = stablehlo.broadcast_in_dim %2983, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2985 = stablehlo.multiply %2984, %60 : tensor<197x1024xf32>
-    %2986 = stablehlo.broadcast_in_dim %2985, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %2987 = stablehlo.broadcast_in_dim %arg329, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %2988 = stablehlo.add %2986, %2987 : tensor<197x1024xf32>
-    %2989 = stablehlo.convert %2988 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %2990 = stablehlo.reshape %2989 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2991 = stablehlo.broadcast_in_dim %arg92, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2992 = stablehlo.broadcast_in_dim %2990, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %2993 = stablehlo.multiply %2991, %2992 : tensor<1x197x1024xbf16>
-    %2994 = stablehlo.add %2993, %2902 : tensor<1x197x1024xbf16>
-    %2995 = stablehlo.convert %2994 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %2996 = stablehlo.convert %2995 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %2997 = stablehlo.reduce(%2996 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %2998 = stablehlo.reshape %2997 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %2999 = stablehlo.broadcast_in_dim %2998, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3000 = stablehlo.divide %2999, %15 : tensor<1x197x1xf64>
-    %3001 = stablehlo.broadcast_in_dim %2996, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3002 = stablehlo.broadcast_in_dim %3000, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3003 = stablehlo.subtract %3001, %3002 : tensor<1x197x1024xf64>
-    %3004 = stablehlo.multiply %3003, %3003 : tensor<1x197x1024xf64>
-    %3005 = stablehlo.reduce(%3004 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3006 = stablehlo.reshape %3005 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3007 = stablehlo.broadcast_in_dim %3006, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3008 = stablehlo.divide %3007, %15 : tensor<1x197x1xf64>
-    %3009 = stablehlo.convert %3008 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3010 = stablehlo.reduce(%2995 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3011 = stablehlo.reshape %3010 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3012 = stablehlo.broadcast_in_dim %3011, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3013 = stablehlo.divide %3012, %31 : tensor<1x197x1xf32>
-    %3014 = stablehlo.broadcast_in_dim %3009, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3015 = stablehlo.add %3014, %36 : tensor<1x197x1xf32>
-    %3016 = stablehlo.rsqrt %3015 : tensor<1x197x1xf32>
-    %3017 = stablehlo.broadcast_in_dim %2995, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3018 = stablehlo.broadcast_in_dim %3013, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3019 = stablehlo.subtract %3017, %3018 : tensor<1x197x1024xf32>
-    %3020 = stablehlo.broadcast_in_dim %3019, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3021 = stablehlo.broadcast_in_dim %3016, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3022 = stablehlo.multiply %3020, %3021 : tensor<1x197x1024xf32>
-    %3023 = stablehlo.convert %arg93 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3024 = stablehlo.broadcast_in_dim %3022, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3025 = stablehlo.broadcast_in_dim %3023, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3026 = stablehlo.multiply %3024, %3025 : tensor<1x197x1024xf32>
-    %3027 = stablehlo.convert %arg94 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3028 = stablehlo.broadcast_in_dim %3026, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3029 = stablehlo.broadcast_in_dim %3027, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3030 = stablehlo.add %3028, %3029 : tensor<1x197x1024xf32>
-    %3031 = stablehlo.convert %3030 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3032 = stablehlo.reshape %3031 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3033 = stablehlo.convert %3032 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3034 = stablehlo.dot_general %3033, %arg330, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3035 = stablehlo.broadcast_in_dim %3034, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3036 = stablehlo.multiply %3035, %60 : tensor<197x1024xf32>
-    %3037 = stablehlo.broadcast_in_dim %3036, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3038 = stablehlo.broadcast_in_dim %arg331, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3039 = stablehlo.add %3037, %3038 : tensor<197x1024xf32>
-    %3040 = stablehlo.convert %3039 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3041 = stablehlo.reshape %3040 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3042 = stablehlo.dot_general %3032, %arg332, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %3043 = stablehlo.reshape %3042 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3044 = stablehlo.reshape %3043 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3045 = stablehlo.transpose %3044, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3046 = stablehlo.dot_general %3033, %arg333, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3047 = stablehlo.broadcast_in_dim %3046, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3048 = stablehlo.multiply %3047, %60 : tensor<197x1024xf32>
-    %3049 = stablehlo.broadcast_in_dim %3048, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3050 = stablehlo.broadcast_in_dim %arg334, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3051 = stablehlo.add %3049, %3050 : tensor<197x1024xf32>
-    %3052 = stablehlo.convert %3051 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3053 = stablehlo.reshape %3052 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3054 = stablehlo.reshape %3053 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3055 = stablehlo.transpose %3054, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3056 = stablehlo.reshape %3041 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3057 = stablehlo.transpose %3056, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3058 = stablehlo.transpose %3045, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %3059 = stablehlo.reshape %3057 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3060 = stablehlo.reshape %3058 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3061 = stablehlo.broadcast_in_dim %3060, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3062 = stablehlo.dot_general %3059, %3061, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %3063 = stablehlo.reshape %3062 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3064 = stablehlo.broadcast_in_dim %3063, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3065 = stablehlo.divide %3064, %92 : tensor<1x16x197x197xbf16>
-    %3066 = stablehlo.add %3065, %arg335 : tensor<1x16x197x197xbf16>
-    %3067 = stablehlo.convert %3066 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %3068 = stablehlo.reduce(%3067 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3069 = stablehlo.reshape %3068 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3070 = stablehlo.broadcast_in_dim %3067, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3071 = stablehlo.broadcast_in_dim %3069, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3072 = stablehlo.subtract %3070, %3071 : tensor<1x16x197x197xf32>
-    %3073 = stablehlo.exponential %3072 : tensor<1x16x197x197xf32>
-    %3074 = stablehlo.reduce(%3073 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3075 = stablehlo.reshape %3074 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3076 = stablehlo.broadcast_in_dim %3073, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3077 = stablehlo.broadcast_in_dim %3075, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3078 = stablehlo.divide %3076, %3077 : tensor<1x16x197x197xf32>
-    %3079 = stablehlo.convert %3078 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %3080 = stablehlo.reshape %3079 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %3081 = stablehlo.reshape %3055 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3082 = stablehlo.broadcast_in_dim %3081, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3083 = stablehlo.dot_general %3080, %3082, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3084 = stablehlo.reshape %3083 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3085 = stablehlo.transpose %3084, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %3086 = stablehlo.reshape %3085 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %3087 = stablehlo.reshape %3086 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3088 = stablehlo.convert %3087 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3089 = stablehlo.dot_general %3088, %arg336, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3090 = stablehlo.broadcast_in_dim %3089, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3091 = stablehlo.multiply %3090, %60 : tensor<197x1024xf32>
-    %3092 = stablehlo.broadcast_in_dim %3091, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3093 = stablehlo.broadcast_in_dim %arg337, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3094 = stablehlo.add %3092, %3093 : tensor<197x1024xf32>
-    %3095 = stablehlo.convert %3094 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3096 = stablehlo.reshape %3095 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3097 = stablehlo.broadcast_in_dim %arg95, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3098 = stablehlo.broadcast_in_dim %3096, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3099 = stablehlo.multiply %3097, %3098 : tensor<1x197x1024xbf16>
-    %3100 = stablehlo.add %3099, %2994 : tensor<1x197x1024xbf16>
-    %3101 = stablehlo.convert %3100 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3102 = stablehlo.convert %3101 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3103 = stablehlo.reduce(%3102 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3104 = stablehlo.reshape %3103 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3105 = stablehlo.broadcast_in_dim %3104, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3106 = stablehlo.divide %3105, %15 : tensor<1x197x1xf64>
-    %3107 = stablehlo.broadcast_in_dim %3102, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3108 = stablehlo.broadcast_in_dim %3106, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3109 = stablehlo.subtract %3107, %3108 : tensor<1x197x1024xf64>
-    %3110 = stablehlo.multiply %3109, %3109 : tensor<1x197x1024xf64>
-    %3111 = stablehlo.reduce(%3110 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3112 = stablehlo.reshape %3111 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3113 = stablehlo.broadcast_in_dim %3112, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3114 = stablehlo.divide %3113, %15 : tensor<1x197x1xf64>
-    %3115 = stablehlo.convert %3114 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3116 = stablehlo.reduce(%3101 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3117 = stablehlo.reshape %3116 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3118 = stablehlo.broadcast_in_dim %3117, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3119 = stablehlo.divide %3118, %31 : tensor<1x197x1xf32>
-    %3120 = stablehlo.broadcast_in_dim %3115, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3121 = stablehlo.add %3120, %36 : tensor<1x197x1xf32>
-    %3122 = stablehlo.rsqrt %3121 : tensor<1x197x1xf32>
-    %3123 = stablehlo.broadcast_in_dim %3101, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3124 = stablehlo.broadcast_in_dim %3119, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3125 = stablehlo.subtract %3123, %3124 : tensor<1x197x1024xf32>
-    %3126 = stablehlo.broadcast_in_dim %3125, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3127 = stablehlo.broadcast_in_dim %3122, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3128 = stablehlo.multiply %3126, %3127 : tensor<1x197x1024xf32>
-    %3129 = stablehlo.convert %arg96 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3130 = stablehlo.broadcast_in_dim %3128, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3131 = stablehlo.broadcast_in_dim %3129, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3132 = stablehlo.multiply %3130, %3131 : tensor<1x197x1024xf32>
-    %3133 = stablehlo.convert %arg97 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3134 = stablehlo.broadcast_in_dim %3132, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3135 = stablehlo.broadcast_in_dim %3133, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3136 = stablehlo.add %3134, %3135 : tensor<1x197x1024xf32>
-    %3137 = stablehlo.convert %3136 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3138 = stablehlo.reshape %3137 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3139 = stablehlo.convert %3138 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3140 = stablehlo.dot_general %3139, %arg338, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %3141 = stablehlo.broadcast_in_dim %3140, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3142 = stablehlo.multiply %3141, %170 : tensor<197x4096xf32>
-    %3143 = stablehlo.broadcast_in_dim %3142, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3144 = stablehlo.broadcast_in_dim %arg339, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %3145 = stablehlo.add %3143, %3144 : tensor<197x4096xf32>
-    %3146 = stablehlo.convert %3145 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %3147 = stablehlo.reshape %3146 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %3148 = stablehlo.multiply %3147, %cst_4 : tensor<1x197x4096xbf16>
-    %3149 = stablehlo.multiply %3147, %178 : tensor<1x197x4096xbf16>
-    %3150 = stablehlo.convert %3149 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %3151 = stablehlo.clamp %cst_5, %3150, %cst_6 : tensor<1x197x4096xf32>
-    %3152 = stablehlo.multiply %3151, %3151 : tensor<1x197x4096xf32>
-    %3153 = stablehlo.multiply %cst_7, %3152 : tensor<1x197x4096xf32>
-    %3154 = stablehlo.add %3153, %cst_8 : tensor<1x197x4096xf32>
-    %3155 = stablehlo.multiply %3154, %3152 : tensor<1x197x4096xf32>
-    %3156 = stablehlo.add %3155, %cst_9 : tensor<1x197x4096xf32>
-    %3157 = stablehlo.multiply %3156, %3152 : tensor<1x197x4096xf32>
-    %3158 = stablehlo.add %3157, %cst_10 : tensor<1x197x4096xf32>
-    %3159 = stablehlo.multiply %3158, %3152 : tensor<1x197x4096xf32>
-    %3160 = stablehlo.add %3159, %cst_11 : tensor<1x197x4096xf32>
-    %3161 = stablehlo.multiply %3160, %3152 : tensor<1x197x4096xf32>
-    %3162 = stablehlo.add %3161, %cst_12 : tensor<1x197x4096xf32>
-    %3163 = stablehlo.multiply %3162, %3152 : tensor<1x197x4096xf32>
-    %3164 = stablehlo.add %3163, %cst_13 : tensor<1x197x4096xf32>
-    %3165 = stablehlo.multiply %cst_14, %3152 : tensor<1x197x4096xf32>
-    %3166 = stablehlo.add %3165, %cst_15 : tensor<1x197x4096xf32>
-    %3167 = stablehlo.multiply %3166, %3152 : tensor<1x197x4096xf32>
-    %3168 = stablehlo.add %3167, %cst_16 : tensor<1x197x4096xf32>
-    %3169 = stablehlo.multiply %3168, %3152 : tensor<1x197x4096xf32>
-    %3170 = stablehlo.add %3169, %cst_17 : tensor<1x197x4096xf32>
-    %3171 = stablehlo.multiply %3170, %3152 : tensor<1x197x4096xf32>
-    %3172 = stablehlo.add %3171, %cst_18 : tensor<1x197x4096xf32>
-    %3173 = stablehlo.multiply %3151, %3164 : tensor<1x197x4096xf32>
-    %3174 = stablehlo.divide %3173, %3172 : tensor<1x197x4096xf32>
-    %3175 = stablehlo.clamp %cst_19, %3174, %cst_20 : tensor<1x197x4096xf32>
-    %3176 = stablehlo.convert %3175 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %3177 = stablehlo.add %3176, %cst_2 : tensor<1x197x4096xbf16>
-    %3178 = stablehlo.multiply %3177, %3148 : tensor<1x197x4096xbf16>
-    %3179 = stablehlo.reshape %3178 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %3180 = stablehlo.convert %3179 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %3181 = stablehlo.dot_general %3180, %arg340, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %3182 = stablehlo.broadcast_in_dim %3181, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3183 = stablehlo.multiply %3182, %60 : tensor<197x1024xf32>
-    %3184 = stablehlo.broadcast_in_dim %3183, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3185 = stablehlo.broadcast_in_dim %arg341, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3186 = stablehlo.add %3184, %3185 : tensor<197x1024xf32>
-    %3187 = stablehlo.convert %3186 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3188 = stablehlo.reshape %3187 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3189 = stablehlo.broadcast_in_dim %arg98, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3190 = stablehlo.broadcast_in_dim %3188, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3191 = stablehlo.multiply %3189, %3190 : tensor<1x197x1024xbf16>
-    %3192 = stablehlo.add %3191, %3100 : tensor<1x197x1024xbf16>
-    %3193 = stablehlo.convert %3192 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3194 = stablehlo.convert %3193 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3195 = stablehlo.reduce(%3194 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3196 = stablehlo.reshape %3195 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3197 = stablehlo.broadcast_in_dim %3196, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3198 = stablehlo.divide %3197, %15 : tensor<1x197x1xf64>
-    %3199 = stablehlo.broadcast_in_dim %3194, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3200 = stablehlo.broadcast_in_dim %3198, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3201 = stablehlo.subtract %3199, %3200 : tensor<1x197x1024xf64>
-    %3202 = stablehlo.multiply %3201, %3201 : tensor<1x197x1024xf64>
-    %3203 = stablehlo.reduce(%3202 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3204 = stablehlo.reshape %3203 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3205 = stablehlo.broadcast_in_dim %3204, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3206 = stablehlo.divide %3205, %15 : tensor<1x197x1xf64>
-    %3207 = stablehlo.convert %3206 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3208 = stablehlo.reduce(%3193 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3209 = stablehlo.reshape %3208 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3210 = stablehlo.broadcast_in_dim %3209, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3211 = stablehlo.divide %3210, %31 : tensor<1x197x1xf32>
-    %3212 = stablehlo.broadcast_in_dim %3207, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3213 = stablehlo.add %3212, %36 : tensor<1x197x1xf32>
-    %3214 = stablehlo.rsqrt %3213 : tensor<1x197x1xf32>
-    %3215 = stablehlo.broadcast_in_dim %3193, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3216 = stablehlo.broadcast_in_dim %3211, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3217 = stablehlo.subtract %3215, %3216 : tensor<1x197x1024xf32>
-    %3218 = stablehlo.broadcast_in_dim %3217, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3219 = stablehlo.broadcast_in_dim %3214, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3220 = stablehlo.multiply %3218, %3219 : tensor<1x197x1024xf32>
-    %3221 = stablehlo.convert %arg99 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3222 = stablehlo.broadcast_in_dim %3220, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3223 = stablehlo.broadcast_in_dim %3221, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3224 = stablehlo.multiply %3222, %3223 : tensor<1x197x1024xf32>
-    %3225 = stablehlo.convert %arg100 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3226 = stablehlo.broadcast_in_dim %3224, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3227 = stablehlo.broadcast_in_dim %3225, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3228 = stablehlo.add %3226, %3227 : tensor<1x197x1024xf32>
-    %3229 = stablehlo.convert %3228 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3230 = stablehlo.reshape %3229 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3231 = stablehlo.convert %3230 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3232 = stablehlo.dot_general %3231, %arg342, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3233 = stablehlo.broadcast_in_dim %3232, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3234 = stablehlo.multiply %3233, %60 : tensor<197x1024xf32>
-    %3235 = stablehlo.broadcast_in_dim %3234, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3236 = stablehlo.broadcast_in_dim %arg343, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3237 = stablehlo.add %3235, %3236 : tensor<197x1024xf32>
-    %3238 = stablehlo.convert %3237 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3239 = stablehlo.reshape %3238 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3240 = stablehlo.dot_general %3230, %arg344, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %3241 = stablehlo.reshape %3240 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3242 = stablehlo.reshape %3241 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3243 = stablehlo.transpose %3242, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3244 = stablehlo.dot_general %3231, %arg345, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3245 = stablehlo.broadcast_in_dim %3244, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3246 = stablehlo.multiply %3245, %60 : tensor<197x1024xf32>
-    %3247 = stablehlo.broadcast_in_dim %3246, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3248 = stablehlo.broadcast_in_dim %arg346, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3249 = stablehlo.add %3247, %3248 : tensor<197x1024xf32>
-    %3250 = stablehlo.convert %3249 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3251 = stablehlo.reshape %3250 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3252 = stablehlo.reshape %3251 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3253 = stablehlo.transpose %3252, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3254 = stablehlo.reshape %3239 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3255 = stablehlo.transpose %3254, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3256 = stablehlo.transpose %3243, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %3257 = stablehlo.reshape %3255 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3258 = stablehlo.reshape %3256 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3259 = stablehlo.broadcast_in_dim %3258, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3260 = stablehlo.dot_general %3257, %3259, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %3261 = stablehlo.reshape %3260 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3262 = stablehlo.broadcast_in_dim %3261, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3263 = stablehlo.divide %3262, %92 : tensor<1x16x197x197xbf16>
-    %3264 = stablehlo.add %3263, %arg347 : tensor<1x16x197x197xbf16>
-    %3265 = stablehlo.convert %3264 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %3266 = stablehlo.reduce(%3265 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3267 = stablehlo.reshape %3266 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3268 = stablehlo.broadcast_in_dim %3265, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3269 = stablehlo.broadcast_in_dim %3267, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3270 = stablehlo.subtract %3268, %3269 : tensor<1x16x197x197xf32>
-    %3271 = stablehlo.exponential %3270 : tensor<1x16x197x197xf32>
-    %3272 = stablehlo.reduce(%3271 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3273 = stablehlo.reshape %3272 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3274 = stablehlo.broadcast_in_dim %3271, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3275 = stablehlo.broadcast_in_dim %3273, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3276 = stablehlo.divide %3274, %3275 : tensor<1x16x197x197xf32>
-    %3277 = stablehlo.convert %3276 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %3278 = stablehlo.reshape %3277 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %3279 = stablehlo.reshape %3253 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3280 = stablehlo.broadcast_in_dim %3279, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3281 = stablehlo.dot_general %3278, %3280, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3282 = stablehlo.reshape %3281 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3283 = stablehlo.transpose %3282, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %3284 = stablehlo.reshape %3283 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %3285 = stablehlo.reshape %3284 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3286 = stablehlo.convert %3285 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3287 = stablehlo.dot_general %3286, %arg348, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3288 = stablehlo.broadcast_in_dim %3287, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3289 = stablehlo.multiply %3288, %60 : tensor<197x1024xf32>
-    %3290 = stablehlo.broadcast_in_dim %3289, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3291 = stablehlo.broadcast_in_dim %arg349, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3292 = stablehlo.add %3290, %3291 : tensor<197x1024xf32>
-    %3293 = stablehlo.convert %3292 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3294 = stablehlo.reshape %3293 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3295 = stablehlo.broadcast_in_dim %arg101, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3296 = stablehlo.broadcast_in_dim %3294, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3297 = stablehlo.multiply %3295, %3296 : tensor<1x197x1024xbf16>
-    %3298 = stablehlo.add %3297, %3192 : tensor<1x197x1024xbf16>
-    %3299 = stablehlo.convert %3298 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3300 = stablehlo.convert %3299 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3301 = stablehlo.reduce(%3300 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3302 = stablehlo.reshape %3301 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3303 = stablehlo.broadcast_in_dim %3302, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3304 = stablehlo.divide %3303, %15 : tensor<1x197x1xf64>
-    %3305 = stablehlo.broadcast_in_dim %3300, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3306 = stablehlo.broadcast_in_dim %3304, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3307 = stablehlo.subtract %3305, %3306 : tensor<1x197x1024xf64>
-    %3308 = stablehlo.multiply %3307, %3307 : tensor<1x197x1024xf64>
-    %3309 = stablehlo.reduce(%3308 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3310 = stablehlo.reshape %3309 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3311 = stablehlo.broadcast_in_dim %3310, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3312 = stablehlo.divide %3311, %15 : tensor<1x197x1xf64>
-    %3313 = stablehlo.convert %3312 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3314 = stablehlo.reduce(%3299 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3315 = stablehlo.reshape %3314 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3316 = stablehlo.broadcast_in_dim %3315, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3317 = stablehlo.divide %3316, %31 : tensor<1x197x1xf32>
-    %3318 = stablehlo.broadcast_in_dim %3313, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3319 = stablehlo.add %3318, %36 : tensor<1x197x1xf32>
-    %3320 = stablehlo.rsqrt %3319 : tensor<1x197x1xf32>
-    %3321 = stablehlo.broadcast_in_dim %3299, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3322 = stablehlo.broadcast_in_dim %3317, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3323 = stablehlo.subtract %3321, %3322 : tensor<1x197x1024xf32>
-    %3324 = stablehlo.broadcast_in_dim %3323, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3325 = stablehlo.broadcast_in_dim %3320, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3326 = stablehlo.multiply %3324, %3325 : tensor<1x197x1024xf32>
-    %3327 = stablehlo.convert %arg102 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3328 = stablehlo.broadcast_in_dim %3326, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3329 = stablehlo.broadcast_in_dim %3327, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3330 = stablehlo.multiply %3328, %3329 : tensor<1x197x1024xf32>
-    %3331 = stablehlo.convert %arg103 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3332 = stablehlo.broadcast_in_dim %3330, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3333 = stablehlo.broadcast_in_dim %3331, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3334 = stablehlo.add %3332, %3333 : tensor<1x197x1024xf32>
-    %3335 = stablehlo.convert %3334 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3336 = stablehlo.reshape %3335 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3337 = stablehlo.convert %3336 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3338 = stablehlo.dot_general %3337, %arg350, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %3339 = stablehlo.broadcast_in_dim %3338, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3340 = stablehlo.multiply %3339, %170 : tensor<197x4096xf32>
-    %3341 = stablehlo.broadcast_in_dim %3340, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3342 = stablehlo.broadcast_in_dim %arg351, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %3343 = stablehlo.add %3341, %3342 : tensor<197x4096xf32>
-    %3344 = stablehlo.convert %3343 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %3345 = stablehlo.reshape %3344 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %3346 = stablehlo.multiply %3345, %cst_4 : tensor<1x197x4096xbf16>
-    %3347 = stablehlo.multiply %3345, %178 : tensor<1x197x4096xbf16>
-    %3348 = stablehlo.convert %3347 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %3349 = stablehlo.clamp %cst_5, %3348, %cst_6 : tensor<1x197x4096xf32>
-    %3350 = stablehlo.multiply %3349, %3349 : tensor<1x197x4096xf32>
-    %3351 = stablehlo.multiply %cst_7, %3350 : tensor<1x197x4096xf32>
-    %3352 = stablehlo.add %3351, %cst_8 : tensor<1x197x4096xf32>
-    %3353 = stablehlo.multiply %3352, %3350 : tensor<1x197x4096xf32>
-    %3354 = stablehlo.add %3353, %cst_9 : tensor<1x197x4096xf32>
-    %3355 = stablehlo.multiply %3354, %3350 : tensor<1x197x4096xf32>
-    %3356 = stablehlo.add %3355, %cst_10 : tensor<1x197x4096xf32>
-    %3357 = stablehlo.multiply %3356, %3350 : tensor<1x197x4096xf32>
-    %3358 = stablehlo.add %3357, %cst_11 : tensor<1x197x4096xf32>
-    %3359 = stablehlo.multiply %3358, %3350 : tensor<1x197x4096xf32>
-    %3360 = stablehlo.add %3359, %cst_12 : tensor<1x197x4096xf32>
-    %3361 = stablehlo.multiply %3360, %3350 : tensor<1x197x4096xf32>
-    %3362 = stablehlo.add %3361, %cst_13 : tensor<1x197x4096xf32>
-    %3363 = stablehlo.multiply %cst_14, %3350 : tensor<1x197x4096xf32>
-    %3364 = stablehlo.add %3363, %cst_15 : tensor<1x197x4096xf32>
-    %3365 = stablehlo.multiply %3364, %3350 : tensor<1x197x4096xf32>
-    %3366 = stablehlo.add %3365, %cst_16 : tensor<1x197x4096xf32>
-    %3367 = stablehlo.multiply %3366, %3350 : tensor<1x197x4096xf32>
-    %3368 = stablehlo.add %3367, %cst_17 : tensor<1x197x4096xf32>
-    %3369 = stablehlo.multiply %3368, %3350 : tensor<1x197x4096xf32>
-    %3370 = stablehlo.add %3369, %cst_18 : tensor<1x197x4096xf32>
-    %3371 = stablehlo.multiply %3349, %3362 : tensor<1x197x4096xf32>
-    %3372 = stablehlo.divide %3371, %3370 : tensor<1x197x4096xf32>
-    %3373 = stablehlo.clamp %cst_19, %3372, %cst_20 : tensor<1x197x4096xf32>
-    %3374 = stablehlo.convert %3373 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %3375 = stablehlo.add %3374, %cst_2 : tensor<1x197x4096xbf16>
-    %3376 = stablehlo.multiply %3375, %3346 : tensor<1x197x4096xbf16>
-    %3377 = stablehlo.reshape %3376 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %3378 = stablehlo.convert %3377 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %3379 = stablehlo.dot_general %3378, %arg352, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %3380 = stablehlo.broadcast_in_dim %3379, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3381 = stablehlo.multiply %3380, %60 : tensor<197x1024xf32>
-    %3382 = stablehlo.broadcast_in_dim %3381, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3383 = stablehlo.broadcast_in_dim %arg353, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3384 = stablehlo.add %3382, %3383 : tensor<197x1024xf32>
-    %3385 = stablehlo.convert %3384 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3386 = stablehlo.reshape %3385 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3387 = stablehlo.broadcast_in_dim %arg104, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3388 = stablehlo.broadcast_in_dim %3386, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3389 = stablehlo.multiply %3387, %3388 : tensor<1x197x1024xbf16>
-    %3390 = stablehlo.add %3389, %3298 : tensor<1x197x1024xbf16>
-    %3391 = stablehlo.convert %3390 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3392 = stablehlo.convert %3391 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3393 = stablehlo.reduce(%3392 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3394 = stablehlo.reshape %3393 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3395 = stablehlo.broadcast_in_dim %3394, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3396 = stablehlo.divide %3395, %15 : tensor<1x197x1xf64>
-    %3397 = stablehlo.broadcast_in_dim %3392, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3398 = stablehlo.broadcast_in_dim %3396, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3399 = stablehlo.subtract %3397, %3398 : tensor<1x197x1024xf64>
-    %3400 = stablehlo.multiply %3399, %3399 : tensor<1x197x1024xf64>
-    %3401 = stablehlo.reduce(%3400 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3402 = stablehlo.reshape %3401 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3403 = stablehlo.broadcast_in_dim %3402, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3404 = stablehlo.divide %3403, %15 : tensor<1x197x1xf64>
-    %3405 = stablehlo.convert %3404 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3406 = stablehlo.reduce(%3391 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3407 = stablehlo.reshape %3406 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3408 = stablehlo.broadcast_in_dim %3407, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3409 = stablehlo.divide %3408, %31 : tensor<1x197x1xf32>
-    %3410 = stablehlo.broadcast_in_dim %3405, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3411 = stablehlo.add %3410, %36 : tensor<1x197x1xf32>
-    %3412 = stablehlo.rsqrt %3411 : tensor<1x197x1xf32>
-    %3413 = stablehlo.broadcast_in_dim %3391, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3414 = stablehlo.broadcast_in_dim %3409, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3415 = stablehlo.subtract %3413, %3414 : tensor<1x197x1024xf32>
-    %3416 = stablehlo.broadcast_in_dim %3415, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3417 = stablehlo.broadcast_in_dim %3412, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3418 = stablehlo.multiply %3416, %3417 : tensor<1x197x1024xf32>
-    %3419 = stablehlo.convert %arg105 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3420 = stablehlo.broadcast_in_dim %3418, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3421 = stablehlo.broadcast_in_dim %3419, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3422 = stablehlo.multiply %3420, %3421 : tensor<1x197x1024xf32>
-    %3423 = stablehlo.convert %arg106 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3424 = stablehlo.broadcast_in_dim %3422, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3425 = stablehlo.broadcast_in_dim %3423, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3426 = stablehlo.add %3424, %3425 : tensor<1x197x1024xf32>
-    %3427 = stablehlo.convert %3426 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3428 = stablehlo.reshape %3427 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3429 = stablehlo.convert %3428 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3430 = stablehlo.dot_general %3429, %arg354, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3431 = stablehlo.broadcast_in_dim %3430, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3432 = stablehlo.multiply %3431, %60 : tensor<197x1024xf32>
-    %3433 = stablehlo.broadcast_in_dim %3432, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3434 = stablehlo.broadcast_in_dim %arg355, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3435 = stablehlo.add %3433, %3434 : tensor<197x1024xf32>
-    %3436 = stablehlo.convert %3435 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3437 = stablehlo.reshape %3436 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3438 = stablehlo.dot_general %3428, %arg356, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %3439 = stablehlo.reshape %3438 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3440 = stablehlo.reshape %3439 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3441 = stablehlo.transpose %3440, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3442 = stablehlo.dot_general %3429, %arg357, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3443 = stablehlo.broadcast_in_dim %3442, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3444 = stablehlo.multiply %3443, %60 : tensor<197x1024xf32>
-    %3445 = stablehlo.broadcast_in_dim %3444, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3446 = stablehlo.broadcast_in_dim %arg358, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3447 = stablehlo.add %3445, %3446 : tensor<197x1024xf32>
-    %3448 = stablehlo.convert %3447 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3449 = stablehlo.reshape %3448 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3450 = stablehlo.reshape %3449 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3451 = stablehlo.transpose %3450, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3452 = stablehlo.reshape %3437 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3453 = stablehlo.transpose %3452, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3454 = stablehlo.transpose %3441, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %3455 = stablehlo.reshape %3453 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3456 = stablehlo.reshape %3454 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3457 = stablehlo.broadcast_in_dim %3456, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3458 = stablehlo.dot_general %3455, %3457, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %3459 = stablehlo.reshape %3458 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3460 = stablehlo.broadcast_in_dim %3459, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3461 = stablehlo.divide %3460, %92 : tensor<1x16x197x197xbf16>
-    %3462 = stablehlo.add %3461, %arg359 : tensor<1x16x197x197xbf16>
-    %3463 = stablehlo.convert %3462 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %3464 = stablehlo.reduce(%3463 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3465 = stablehlo.reshape %3464 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3466 = stablehlo.broadcast_in_dim %3463, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3467 = stablehlo.broadcast_in_dim %3465, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3468 = stablehlo.subtract %3466, %3467 : tensor<1x16x197x197xf32>
-    %3469 = stablehlo.exponential %3468 : tensor<1x16x197x197xf32>
-    %3470 = stablehlo.reduce(%3469 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3471 = stablehlo.reshape %3470 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3472 = stablehlo.broadcast_in_dim %3469, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3473 = stablehlo.broadcast_in_dim %3471, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3474 = stablehlo.divide %3472, %3473 : tensor<1x16x197x197xf32>
-    %3475 = stablehlo.convert %3474 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %3476 = stablehlo.reshape %3475 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %3477 = stablehlo.reshape %3451 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3478 = stablehlo.broadcast_in_dim %3477, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3479 = stablehlo.dot_general %3476, %3478, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3480 = stablehlo.reshape %3479 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3481 = stablehlo.transpose %3480, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %3482 = stablehlo.reshape %3481 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %3483 = stablehlo.reshape %3482 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3484 = stablehlo.convert %3483 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3485 = stablehlo.dot_general %3484, %arg360, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3486 = stablehlo.broadcast_in_dim %3485, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3487 = stablehlo.multiply %3486, %60 : tensor<197x1024xf32>
-    %3488 = stablehlo.broadcast_in_dim %3487, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3489 = stablehlo.broadcast_in_dim %arg361, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3490 = stablehlo.add %3488, %3489 : tensor<197x1024xf32>
-    %3491 = stablehlo.convert %3490 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3492 = stablehlo.reshape %3491 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3493 = stablehlo.broadcast_in_dim %arg107, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3494 = stablehlo.broadcast_in_dim %3492, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3495 = stablehlo.multiply %3493, %3494 : tensor<1x197x1024xbf16>
-    %3496 = stablehlo.add %3495, %3390 : tensor<1x197x1024xbf16>
-    %3497 = stablehlo.convert %3496 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3498 = stablehlo.convert %3497 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3499 = stablehlo.reduce(%3498 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3500 = stablehlo.reshape %3499 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3501 = stablehlo.broadcast_in_dim %3500, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3502 = stablehlo.divide %3501, %15 : tensor<1x197x1xf64>
-    %3503 = stablehlo.broadcast_in_dim %3498, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3504 = stablehlo.broadcast_in_dim %3502, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3505 = stablehlo.subtract %3503, %3504 : tensor<1x197x1024xf64>
-    %3506 = stablehlo.multiply %3505, %3505 : tensor<1x197x1024xf64>
-    %3507 = stablehlo.reduce(%3506 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3508 = stablehlo.reshape %3507 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3509 = stablehlo.broadcast_in_dim %3508, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3510 = stablehlo.divide %3509, %15 : tensor<1x197x1xf64>
-    %3511 = stablehlo.convert %3510 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3512 = stablehlo.reduce(%3497 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3513 = stablehlo.reshape %3512 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3514 = stablehlo.broadcast_in_dim %3513, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3515 = stablehlo.divide %3514, %31 : tensor<1x197x1xf32>
-    %3516 = stablehlo.broadcast_in_dim %3511, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3517 = stablehlo.add %3516, %36 : tensor<1x197x1xf32>
-    %3518 = stablehlo.rsqrt %3517 : tensor<1x197x1xf32>
-    %3519 = stablehlo.broadcast_in_dim %3497, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3520 = stablehlo.broadcast_in_dim %3515, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3521 = stablehlo.subtract %3519, %3520 : tensor<1x197x1024xf32>
-    %3522 = stablehlo.broadcast_in_dim %3521, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3523 = stablehlo.broadcast_in_dim %3518, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3524 = stablehlo.multiply %3522, %3523 : tensor<1x197x1024xf32>
-    %3525 = stablehlo.convert %arg108 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3526 = stablehlo.broadcast_in_dim %3524, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3527 = stablehlo.broadcast_in_dim %3525, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3528 = stablehlo.multiply %3526, %3527 : tensor<1x197x1024xf32>
-    %3529 = stablehlo.convert %arg109 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3530 = stablehlo.broadcast_in_dim %3528, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3531 = stablehlo.broadcast_in_dim %3529, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3532 = stablehlo.add %3530, %3531 : tensor<1x197x1024xf32>
-    %3533 = stablehlo.convert %3532 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3534 = stablehlo.reshape %3533 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3535 = stablehlo.convert %3534 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3536 = stablehlo.dot_general %3535, %arg362, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %3537 = stablehlo.broadcast_in_dim %3536, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3538 = stablehlo.multiply %3537, %170 : tensor<197x4096xf32>
-    %3539 = stablehlo.broadcast_in_dim %3538, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3540 = stablehlo.broadcast_in_dim %arg363, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %3541 = stablehlo.add %3539, %3540 : tensor<197x4096xf32>
-    %3542 = stablehlo.convert %3541 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %3543 = stablehlo.reshape %3542 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %3544 = stablehlo.multiply %3543, %cst_4 : tensor<1x197x4096xbf16>
-    %3545 = stablehlo.multiply %3543, %178 : tensor<1x197x4096xbf16>
-    %3546 = stablehlo.convert %3545 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %3547 = stablehlo.clamp %cst_5, %3546, %cst_6 : tensor<1x197x4096xf32>
-    %3548 = stablehlo.multiply %3547, %3547 : tensor<1x197x4096xf32>
-    %3549 = stablehlo.multiply %cst_7, %3548 : tensor<1x197x4096xf32>
-    %3550 = stablehlo.add %3549, %cst_8 : tensor<1x197x4096xf32>
-    %3551 = stablehlo.multiply %3550, %3548 : tensor<1x197x4096xf32>
-    %3552 = stablehlo.add %3551, %cst_9 : tensor<1x197x4096xf32>
-    %3553 = stablehlo.multiply %3552, %3548 : tensor<1x197x4096xf32>
-    %3554 = stablehlo.add %3553, %cst_10 : tensor<1x197x4096xf32>
-    %3555 = stablehlo.multiply %3554, %3548 : tensor<1x197x4096xf32>
-    %3556 = stablehlo.add %3555, %cst_11 : tensor<1x197x4096xf32>
-    %3557 = stablehlo.multiply %3556, %3548 : tensor<1x197x4096xf32>
-    %3558 = stablehlo.add %3557, %cst_12 : tensor<1x197x4096xf32>
-    %3559 = stablehlo.multiply %3558, %3548 : tensor<1x197x4096xf32>
-    %3560 = stablehlo.add %3559, %cst_13 : tensor<1x197x4096xf32>
-    %3561 = stablehlo.multiply %cst_14, %3548 : tensor<1x197x4096xf32>
-    %3562 = stablehlo.add %3561, %cst_15 : tensor<1x197x4096xf32>
-    %3563 = stablehlo.multiply %3562, %3548 : tensor<1x197x4096xf32>
-    %3564 = stablehlo.add %3563, %cst_16 : tensor<1x197x4096xf32>
-    %3565 = stablehlo.multiply %3564, %3548 : tensor<1x197x4096xf32>
-    %3566 = stablehlo.add %3565, %cst_17 : tensor<1x197x4096xf32>
-    %3567 = stablehlo.multiply %3566, %3548 : tensor<1x197x4096xf32>
-    %3568 = stablehlo.add %3567, %cst_18 : tensor<1x197x4096xf32>
-    %3569 = stablehlo.multiply %3547, %3560 : tensor<1x197x4096xf32>
-    %3570 = stablehlo.divide %3569, %3568 : tensor<1x197x4096xf32>
-    %3571 = stablehlo.clamp %cst_19, %3570, %cst_20 : tensor<1x197x4096xf32>
-    %3572 = stablehlo.convert %3571 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %3573 = stablehlo.add %3572, %cst_2 : tensor<1x197x4096xbf16>
-    %3574 = stablehlo.multiply %3573, %3544 : tensor<1x197x4096xbf16>
-    %3575 = stablehlo.reshape %3574 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %3576 = stablehlo.convert %3575 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %3577 = stablehlo.dot_general %3576, %arg364, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %3578 = stablehlo.broadcast_in_dim %3577, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3579 = stablehlo.multiply %3578, %60 : tensor<197x1024xf32>
-    %3580 = stablehlo.broadcast_in_dim %3579, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3581 = stablehlo.broadcast_in_dim %arg365, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3582 = stablehlo.add %3580, %3581 : tensor<197x1024xf32>
-    %3583 = stablehlo.convert %3582 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3584 = stablehlo.reshape %3583 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3585 = stablehlo.broadcast_in_dim %arg110, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3586 = stablehlo.broadcast_in_dim %3584, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3587 = stablehlo.multiply %3585, %3586 : tensor<1x197x1024xbf16>
-    %3588 = stablehlo.add %3587, %3496 : tensor<1x197x1024xbf16>
-    %3589 = stablehlo.convert %3588 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3590 = stablehlo.convert %3589 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3591 = stablehlo.reduce(%3590 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3592 = stablehlo.reshape %3591 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3593 = stablehlo.broadcast_in_dim %3592, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3594 = stablehlo.divide %3593, %15 : tensor<1x197x1xf64>
-    %3595 = stablehlo.broadcast_in_dim %3590, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3596 = stablehlo.broadcast_in_dim %3594, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3597 = stablehlo.subtract %3595, %3596 : tensor<1x197x1024xf64>
-    %3598 = stablehlo.multiply %3597, %3597 : tensor<1x197x1024xf64>
-    %3599 = stablehlo.reduce(%3598 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3600 = stablehlo.reshape %3599 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3601 = stablehlo.broadcast_in_dim %3600, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3602 = stablehlo.divide %3601, %15 : tensor<1x197x1xf64>
-    %3603 = stablehlo.convert %3602 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3604 = stablehlo.reduce(%3589 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3605 = stablehlo.reshape %3604 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3606 = stablehlo.broadcast_in_dim %3605, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3607 = stablehlo.divide %3606, %31 : tensor<1x197x1xf32>
-    %3608 = stablehlo.broadcast_in_dim %3603, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3609 = stablehlo.add %3608, %36 : tensor<1x197x1xf32>
-    %3610 = stablehlo.rsqrt %3609 : tensor<1x197x1xf32>
-    %3611 = stablehlo.broadcast_in_dim %3589, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3612 = stablehlo.broadcast_in_dim %3607, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3613 = stablehlo.subtract %3611, %3612 : tensor<1x197x1024xf32>
-    %3614 = stablehlo.broadcast_in_dim %3613, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3615 = stablehlo.broadcast_in_dim %3610, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3616 = stablehlo.multiply %3614, %3615 : tensor<1x197x1024xf32>
-    %3617 = stablehlo.convert %arg111 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3618 = stablehlo.broadcast_in_dim %3616, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3619 = stablehlo.broadcast_in_dim %3617, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3620 = stablehlo.multiply %3618, %3619 : tensor<1x197x1024xf32>
-    %3621 = stablehlo.convert %arg112 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3622 = stablehlo.broadcast_in_dim %3620, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3623 = stablehlo.broadcast_in_dim %3621, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3624 = stablehlo.add %3622, %3623 : tensor<1x197x1024xf32>
-    %3625 = stablehlo.convert %3624 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3626 = stablehlo.reshape %3625 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3627 = stablehlo.convert %3626 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3628 = stablehlo.dot_general %3627, %arg366, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3629 = stablehlo.broadcast_in_dim %3628, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3630 = stablehlo.multiply %3629, %60 : tensor<197x1024xf32>
-    %3631 = stablehlo.broadcast_in_dim %3630, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3632 = stablehlo.broadcast_in_dim %arg367, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3633 = stablehlo.add %3631, %3632 : tensor<197x1024xf32>
-    %3634 = stablehlo.convert %3633 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3635 = stablehlo.reshape %3634 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3636 = stablehlo.dot_general %3626, %arg368, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %3637 = stablehlo.reshape %3636 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3638 = stablehlo.reshape %3637 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3639 = stablehlo.transpose %3638, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3640 = stablehlo.dot_general %3627, %arg369, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3641 = stablehlo.broadcast_in_dim %3640, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3642 = stablehlo.multiply %3641, %60 : tensor<197x1024xf32>
-    %3643 = stablehlo.broadcast_in_dim %3642, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3644 = stablehlo.broadcast_in_dim %arg370, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3645 = stablehlo.add %3643, %3644 : tensor<197x1024xf32>
-    %3646 = stablehlo.convert %3645 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3647 = stablehlo.reshape %3646 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3648 = stablehlo.reshape %3647 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3649 = stablehlo.transpose %3648, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3650 = stablehlo.reshape %3635 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3651 = stablehlo.transpose %3650, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3652 = stablehlo.transpose %3639, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %3653 = stablehlo.reshape %3651 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3654 = stablehlo.reshape %3652 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3655 = stablehlo.broadcast_in_dim %3654, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3656 = stablehlo.dot_general %3653, %3655, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %3657 = stablehlo.reshape %3656 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3658 = stablehlo.broadcast_in_dim %3657, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3659 = stablehlo.divide %3658, %92 : tensor<1x16x197x197xbf16>
-    %3660 = stablehlo.add %3659, %arg371 : tensor<1x16x197x197xbf16>
-    %3661 = stablehlo.convert %3660 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %3662 = stablehlo.reduce(%3661 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3663 = stablehlo.reshape %3662 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3664 = stablehlo.broadcast_in_dim %3661, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3665 = stablehlo.broadcast_in_dim %3663, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3666 = stablehlo.subtract %3664, %3665 : tensor<1x16x197x197xf32>
-    %3667 = stablehlo.exponential %3666 : tensor<1x16x197x197xf32>
-    %3668 = stablehlo.reduce(%3667 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3669 = stablehlo.reshape %3668 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3670 = stablehlo.broadcast_in_dim %3667, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3671 = stablehlo.broadcast_in_dim %3669, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3672 = stablehlo.divide %3670, %3671 : tensor<1x16x197x197xf32>
-    %3673 = stablehlo.convert %3672 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %3674 = stablehlo.reshape %3673 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %3675 = stablehlo.reshape %3649 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3676 = stablehlo.broadcast_in_dim %3675, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3677 = stablehlo.dot_general %3674, %3676, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3678 = stablehlo.reshape %3677 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3679 = stablehlo.transpose %3678, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %3680 = stablehlo.reshape %3679 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %3681 = stablehlo.reshape %3680 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3682 = stablehlo.convert %3681 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3683 = stablehlo.dot_general %3682, %arg372, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3684 = stablehlo.broadcast_in_dim %3683, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3685 = stablehlo.multiply %3684, %60 : tensor<197x1024xf32>
-    %3686 = stablehlo.broadcast_in_dim %3685, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3687 = stablehlo.broadcast_in_dim %arg373, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3688 = stablehlo.add %3686, %3687 : tensor<197x1024xf32>
-    %3689 = stablehlo.convert %3688 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3690 = stablehlo.reshape %3689 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3691 = stablehlo.broadcast_in_dim %arg113, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3692 = stablehlo.broadcast_in_dim %3690, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3693 = stablehlo.multiply %3691, %3692 : tensor<1x197x1024xbf16>
-    %3694 = stablehlo.add %3693, %3588 : tensor<1x197x1024xbf16>
-    %3695 = stablehlo.convert %3694 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3696 = stablehlo.convert %3695 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3697 = stablehlo.reduce(%3696 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3698 = stablehlo.reshape %3697 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3699 = stablehlo.broadcast_in_dim %3698, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3700 = stablehlo.divide %3699, %15 : tensor<1x197x1xf64>
-    %3701 = stablehlo.broadcast_in_dim %3696, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3702 = stablehlo.broadcast_in_dim %3700, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3703 = stablehlo.subtract %3701, %3702 : tensor<1x197x1024xf64>
-    %3704 = stablehlo.multiply %3703, %3703 : tensor<1x197x1024xf64>
-    %3705 = stablehlo.reduce(%3704 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3706 = stablehlo.reshape %3705 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3707 = stablehlo.broadcast_in_dim %3706, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3708 = stablehlo.divide %3707, %15 : tensor<1x197x1xf64>
-    %3709 = stablehlo.convert %3708 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3710 = stablehlo.reduce(%3695 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3711 = stablehlo.reshape %3710 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3712 = stablehlo.broadcast_in_dim %3711, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3713 = stablehlo.divide %3712, %31 : tensor<1x197x1xf32>
-    %3714 = stablehlo.broadcast_in_dim %3709, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3715 = stablehlo.add %3714, %36 : tensor<1x197x1xf32>
-    %3716 = stablehlo.rsqrt %3715 : tensor<1x197x1xf32>
-    %3717 = stablehlo.broadcast_in_dim %3695, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3718 = stablehlo.broadcast_in_dim %3713, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3719 = stablehlo.subtract %3717, %3718 : tensor<1x197x1024xf32>
-    %3720 = stablehlo.broadcast_in_dim %3719, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3721 = stablehlo.broadcast_in_dim %3716, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3722 = stablehlo.multiply %3720, %3721 : tensor<1x197x1024xf32>
-    %3723 = stablehlo.convert %arg114 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3724 = stablehlo.broadcast_in_dim %3722, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3725 = stablehlo.broadcast_in_dim %3723, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3726 = stablehlo.multiply %3724, %3725 : tensor<1x197x1024xf32>
-    %3727 = stablehlo.convert %arg115 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3728 = stablehlo.broadcast_in_dim %3726, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3729 = stablehlo.broadcast_in_dim %3727, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3730 = stablehlo.add %3728, %3729 : tensor<1x197x1024xf32>
-    %3731 = stablehlo.convert %3730 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3732 = stablehlo.reshape %3731 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3733 = stablehlo.convert %3732 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3734 = stablehlo.dot_general %3733, %arg374, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %3735 = stablehlo.broadcast_in_dim %3734, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3736 = stablehlo.multiply %3735, %170 : tensor<197x4096xf32>
-    %3737 = stablehlo.broadcast_in_dim %3736, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3738 = stablehlo.broadcast_in_dim %arg375, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %3739 = stablehlo.add %3737, %3738 : tensor<197x4096xf32>
-    %3740 = stablehlo.convert %3739 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %3741 = stablehlo.reshape %3740 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %3742 = stablehlo.multiply %3741, %cst_4 : tensor<1x197x4096xbf16>
-    %3743 = stablehlo.multiply %3741, %178 : tensor<1x197x4096xbf16>
-    %3744 = stablehlo.convert %3743 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %3745 = stablehlo.clamp %cst_5, %3744, %cst_6 : tensor<1x197x4096xf32>
-    %3746 = stablehlo.multiply %3745, %3745 : tensor<1x197x4096xf32>
-    %3747 = stablehlo.multiply %cst_7, %3746 : tensor<1x197x4096xf32>
-    %3748 = stablehlo.add %3747, %cst_8 : tensor<1x197x4096xf32>
-    %3749 = stablehlo.multiply %3748, %3746 : tensor<1x197x4096xf32>
-    %3750 = stablehlo.add %3749, %cst_9 : tensor<1x197x4096xf32>
-    %3751 = stablehlo.multiply %3750, %3746 : tensor<1x197x4096xf32>
-    %3752 = stablehlo.add %3751, %cst_10 : tensor<1x197x4096xf32>
-    %3753 = stablehlo.multiply %3752, %3746 : tensor<1x197x4096xf32>
-    %3754 = stablehlo.add %3753, %cst_11 : tensor<1x197x4096xf32>
-    %3755 = stablehlo.multiply %3754, %3746 : tensor<1x197x4096xf32>
-    %3756 = stablehlo.add %3755, %cst_12 : tensor<1x197x4096xf32>
-    %3757 = stablehlo.multiply %3756, %3746 : tensor<1x197x4096xf32>
-    %3758 = stablehlo.add %3757, %cst_13 : tensor<1x197x4096xf32>
-    %3759 = stablehlo.multiply %cst_14, %3746 : tensor<1x197x4096xf32>
-    %3760 = stablehlo.add %3759, %cst_15 : tensor<1x197x4096xf32>
-    %3761 = stablehlo.multiply %3760, %3746 : tensor<1x197x4096xf32>
-    %3762 = stablehlo.add %3761, %cst_16 : tensor<1x197x4096xf32>
-    %3763 = stablehlo.multiply %3762, %3746 : tensor<1x197x4096xf32>
-    %3764 = stablehlo.add %3763, %cst_17 : tensor<1x197x4096xf32>
-    %3765 = stablehlo.multiply %3764, %3746 : tensor<1x197x4096xf32>
-    %3766 = stablehlo.add %3765, %cst_18 : tensor<1x197x4096xf32>
-    %3767 = stablehlo.multiply %3745, %3758 : tensor<1x197x4096xf32>
-    %3768 = stablehlo.divide %3767, %3766 : tensor<1x197x4096xf32>
-    %3769 = stablehlo.clamp %cst_19, %3768, %cst_20 : tensor<1x197x4096xf32>
-    %3770 = stablehlo.convert %3769 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %3771 = stablehlo.add %3770, %cst_2 : tensor<1x197x4096xbf16>
-    %3772 = stablehlo.multiply %3771, %3742 : tensor<1x197x4096xbf16>
-    %3773 = stablehlo.reshape %3772 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %3774 = stablehlo.convert %3773 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %3775 = stablehlo.dot_general %3774, %arg376, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %3776 = stablehlo.broadcast_in_dim %3775, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3777 = stablehlo.multiply %3776, %60 : tensor<197x1024xf32>
-    %3778 = stablehlo.broadcast_in_dim %3777, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3779 = stablehlo.broadcast_in_dim %arg377, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3780 = stablehlo.add %3778, %3779 : tensor<197x1024xf32>
-    %3781 = stablehlo.convert %3780 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3782 = stablehlo.reshape %3781 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3783 = stablehlo.broadcast_in_dim %arg116, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3784 = stablehlo.broadcast_in_dim %3782, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3785 = stablehlo.multiply %3783, %3784 : tensor<1x197x1024xbf16>
-    %3786 = stablehlo.add %3785, %3694 : tensor<1x197x1024xbf16>
-    %3787 = stablehlo.convert %3786 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3788 = stablehlo.convert %3787 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3789 = stablehlo.reduce(%3788 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3790 = stablehlo.reshape %3789 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3791 = stablehlo.broadcast_in_dim %3790, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3792 = stablehlo.divide %3791, %15 : tensor<1x197x1xf64>
-    %3793 = stablehlo.broadcast_in_dim %3788, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3794 = stablehlo.broadcast_in_dim %3792, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3795 = stablehlo.subtract %3793, %3794 : tensor<1x197x1024xf64>
-    %3796 = stablehlo.multiply %3795, %3795 : tensor<1x197x1024xf64>
-    %3797 = stablehlo.reduce(%3796 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3798 = stablehlo.reshape %3797 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3799 = stablehlo.broadcast_in_dim %3798, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3800 = stablehlo.divide %3799, %15 : tensor<1x197x1xf64>
-    %3801 = stablehlo.convert %3800 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3802 = stablehlo.reduce(%3787 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3803 = stablehlo.reshape %3802 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3804 = stablehlo.broadcast_in_dim %3803, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3805 = stablehlo.divide %3804, %31 : tensor<1x197x1xf32>
-    %3806 = stablehlo.broadcast_in_dim %3801, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3807 = stablehlo.add %3806, %36 : tensor<1x197x1xf32>
-    %3808 = stablehlo.rsqrt %3807 : tensor<1x197x1xf32>
-    %3809 = stablehlo.broadcast_in_dim %3787, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3810 = stablehlo.broadcast_in_dim %3805, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3811 = stablehlo.subtract %3809, %3810 : tensor<1x197x1024xf32>
-    %3812 = stablehlo.broadcast_in_dim %3811, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3813 = stablehlo.broadcast_in_dim %3808, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3814 = stablehlo.multiply %3812, %3813 : tensor<1x197x1024xf32>
-    %3815 = stablehlo.convert %arg117 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3816 = stablehlo.broadcast_in_dim %3814, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3817 = stablehlo.broadcast_in_dim %3815, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3818 = stablehlo.multiply %3816, %3817 : tensor<1x197x1024xf32>
-    %3819 = stablehlo.convert %arg118 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3820 = stablehlo.broadcast_in_dim %3818, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3821 = stablehlo.broadcast_in_dim %3819, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3822 = stablehlo.add %3820, %3821 : tensor<1x197x1024xf32>
-    %3823 = stablehlo.convert %3822 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3824 = stablehlo.reshape %3823 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3825 = stablehlo.convert %3824 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3826 = stablehlo.dot_general %3825, %arg378, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3827 = stablehlo.broadcast_in_dim %3826, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3828 = stablehlo.multiply %3827, %60 : tensor<197x1024xf32>
-    %3829 = stablehlo.broadcast_in_dim %3828, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3830 = stablehlo.broadcast_in_dim %arg379, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3831 = stablehlo.add %3829, %3830 : tensor<197x1024xf32>
-    %3832 = stablehlo.convert %3831 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3833 = stablehlo.reshape %3832 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3834 = stablehlo.dot_general %3824, %arg380, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %3835 = stablehlo.reshape %3834 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3836 = stablehlo.reshape %3835 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3837 = stablehlo.transpose %3836, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3838 = stablehlo.dot_general %3825, %arg381, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3839 = stablehlo.broadcast_in_dim %3838, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3840 = stablehlo.multiply %3839, %60 : tensor<197x1024xf32>
-    %3841 = stablehlo.broadcast_in_dim %3840, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3842 = stablehlo.broadcast_in_dim %arg382, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3843 = stablehlo.add %3841, %3842 : tensor<197x1024xf32>
-    %3844 = stablehlo.convert %3843 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3845 = stablehlo.reshape %3844 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3846 = stablehlo.reshape %3845 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3847 = stablehlo.transpose %3846, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3848 = stablehlo.reshape %3833 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %3849 = stablehlo.transpose %3848, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3850 = stablehlo.transpose %3837, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %3851 = stablehlo.reshape %3849 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3852 = stablehlo.reshape %3850 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3853 = stablehlo.broadcast_in_dim %3852, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %3854 = stablehlo.dot_general %3851, %3853, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %3855 = stablehlo.reshape %3854 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3856 = stablehlo.broadcast_in_dim %3855, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %3857 = stablehlo.divide %3856, %92 : tensor<1x16x197x197xbf16>
-    %3858 = stablehlo.add %3857, %arg383 : tensor<1x16x197x197xbf16>
-    %3859 = stablehlo.convert %3858 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %3860 = stablehlo.reduce(%3859 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3861 = stablehlo.reshape %3860 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3862 = stablehlo.broadcast_in_dim %3859, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3863 = stablehlo.broadcast_in_dim %3861, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3864 = stablehlo.subtract %3862, %3863 : tensor<1x16x197x197xf32>
-    %3865 = stablehlo.exponential %3864 : tensor<1x16x197x197xf32>
-    %3866 = stablehlo.reduce(%3865 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %3867 = stablehlo.reshape %3866 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %3868 = stablehlo.broadcast_in_dim %3865, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %3869 = stablehlo.broadcast_in_dim %3867, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %3870 = stablehlo.divide %3868, %3869 : tensor<1x16x197x197xf32>
-    %3871 = stablehlo.convert %3870 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %3872 = stablehlo.reshape %3871 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %3873 = stablehlo.reshape %3847 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3874 = stablehlo.broadcast_in_dim %3873, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3875 = stablehlo.dot_general %3872, %3874, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %3876 = stablehlo.reshape %3875 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %3877 = stablehlo.transpose %3876, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %3878 = stablehlo.reshape %3877 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %3879 = stablehlo.reshape %3878 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3880 = stablehlo.convert %3879 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3881 = stablehlo.dot_general %3880, %arg384, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %3882 = stablehlo.broadcast_in_dim %3881, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3883 = stablehlo.multiply %3882, %60 : tensor<197x1024xf32>
-    %3884 = stablehlo.broadcast_in_dim %3883, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3885 = stablehlo.broadcast_in_dim %arg385, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3886 = stablehlo.add %3884, %3885 : tensor<197x1024xf32>
-    %3887 = stablehlo.convert %3886 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3888 = stablehlo.reshape %3887 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3889 = stablehlo.broadcast_in_dim %arg119, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3890 = stablehlo.broadcast_in_dim %3888, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3891 = stablehlo.multiply %3889, %3890 : tensor<1x197x1024xbf16>
-    %3892 = stablehlo.add %3891, %3786 : tensor<1x197x1024xbf16>
-    %3893 = stablehlo.convert %3892 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3894 = stablehlo.convert %3893 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3895 = stablehlo.reduce(%3894 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3896 = stablehlo.reshape %3895 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3897 = stablehlo.broadcast_in_dim %3896, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3898 = stablehlo.divide %3897, %15 : tensor<1x197x1xf64>
-    %3899 = stablehlo.broadcast_in_dim %3894, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3900 = stablehlo.broadcast_in_dim %3898, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3901 = stablehlo.subtract %3899, %3900 : tensor<1x197x1024xf64>
-    %3902 = stablehlo.multiply %3901, %3901 : tensor<1x197x1024xf64>
-    %3903 = stablehlo.reduce(%3902 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3904 = stablehlo.reshape %3903 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3905 = stablehlo.broadcast_in_dim %3904, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3906 = stablehlo.divide %3905, %15 : tensor<1x197x1xf64>
-    %3907 = stablehlo.convert %3906 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %3908 = stablehlo.reduce(%3893 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %3909 = stablehlo.reshape %3908 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %3910 = stablehlo.broadcast_in_dim %3909, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3911 = stablehlo.divide %3910, %31 : tensor<1x197x1xf32>
-    %3912 = stablehlo.broadcast_in_dim %3907, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %3913 = stablehlo.add %3912, %36 : tensor<1x197x1xf32>
-    %3914 = stablehlo.rsqrt %3913 : tensor<1x197x1xf32>
-    %3915 = stablehlo.broadcast_in_dim %3893, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3916 = stablehlo.broadcast_in_dim %3911, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3917 = stablehlo.subtract %3915, %3916 : tensor<1x197x1024xf32>
-    %3918 = stablehlo.broadcast_in_dim %3917, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3919 = stablehlo.broadcast_in_dim %3914, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %3920 = stablehlo.multiply %3918, %3919 : tensor<1x197x1024xf32>
-    %3921 = stablehlo.convert %arg120 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3922 = stablehlo.broadcast_in_dim %3920, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3923 = stablehlo.broadcast_in_dim %3921, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3924 = stablehlo.multiply %3922, %3923 : tensor<1x197x1024xf32>
-    %3925 = stablehlo.convert %arg121 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %3926 = stablehlo.broadcast_in_dim %3924, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %3927 = stablehlo.broadcast_in_dim %3925, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %3928 = stablehlo.add %3926, %3927 : tensor<1x197x1024xf32>
-    %3929 = stablehlo.convert %3928 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %3930 = stablehlo.reshape %3929 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %3931 = stablehlo.convert %3930 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %3932 = stablehlo.dot_general %3931, %arg386, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %3933 = stablehlo.broadcast_in_dim %3932, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3934 = stablehlo.multiply %3933, %170 : tensor<197x4096xf32>
-    %3935 = stablehlo.broadcast_in_dim %3934, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %3936 = stablehlo.broadcast_in_dim %arg387, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %3937 = stablehlo.add %3935, %3936 : tensor<197x4096xf32>
-    %3938 = stablehlo.convert %3937 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %3939 = stablehlo.reshape %3938 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %3940 = stablehlo.multiply %3939, %cst_4 : tensor<1x197x4096xbf16>
-    %3941 = stablehlo.multiply %3939, %178 : tensor<1x197x4096xbf16>
-    %3942 = stablehlo.convert %3941 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %3943 = stablehlo.clamp %cst_5, %3942, %cst_6 : tensor<1x197x4096xf32>
-    %3944 = stablehlo.multiply %3943, %3943 : tensor<1x197x4096xf32>
-    %3945 = stablehlo.multiply %cst_7, %3944 : tensor<1x197x4096xf32>
-    %3946 = stablehlo.add %3945, %cst_8 : tensor<1x197x4096xf32>
-    %3947 = stablehlo.multiply %3946, %3944 : tensor<1x197x4096xf32>
-    %3948 = stablehlo.add %3947, %cst_9 : tensor<1x197x4096xf32>
-    %3949 = stablehlo.multiply %3948, %3944 : tensor<1x197x4096xf32>
-    %3950 = stablehlo.add %3949, %cst_10 : tensor<1x197x4096xf32>
-    %3951 = stablehlo.multiply %3950, %3944 : tensor<1x197x4096xf32>
-    %3952 = stablehlo.add %3951, %cst_11 : tensor<1x197x4096xf32>
-    %3953 = stablehlo.multiply %3952, %3944 : tensor<1x197x4096xf32>
-    %3954 = stablehlo.add %3953, %cst_12 : tensor<1x197x4096xf32>
-    %3955 = stablehlo.multiply %3954, %3944 : tensor<1x197x4096xf32>
-    %3956 = stablehlo.add %3955, %cst_13 : tensor<1x197x4096xf32>
-    %3957 = stablehlo.multiply %cst_14, %3944 : tensor<1x197x4096xf32>
-    %3958 = stablehlo.add %3957, %cst_15 : tensor<1x197x4096xf32>
-    %3959 = stablehlo.multiply %3958, %3944 : tensor<1x197x4096xf32>
-    %3960 = stablehlo.add %3959, %cst_16 : tensor<1x197x4096xf32>
-    %3961 = stablehlo.multiply %3960, %3944 : tensor<1x197x4096xf32>
-    %3962 = stablehlo.add %3961, %cst_17 : tensor<1x197x4096xf32>
-    %3963 = stablehlo.multiply %3962, %3944 : tensor<1x197x4096xf32>
-    %3964 = stablehlo.add %3963, %cst_18 : tensor<1x197x4096xf32>
-    %3965 = stablehlo.multiply %3943, %3956 : tensor<1x197x4096xf32>
-    %3966 = stablehlo.divide %3965, %3964 : tensor<1x197x4096xf32>
-    %3967 = stablehlo.clamp %cst_19, %3966, %cst_20 : tensor<1x197x4096xf32>
-    %3968 = stablehlo.convert %3967 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %3969 = stablehlo.add %3968, %cst_2 : tensor<1x197x4096xbf16>
-    %3970 = stablehlo.multiply %3969, %3940 : tensor<1x197x4096xbf16>
-    %3971 = stablehlo.reshape %3970 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %3972 = stablehlo.convert %3971 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %3973 = stablehlo.dot_general %3972, %arg388, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %3974 = stablehlo.broadcast_in_dim %3973, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3975 = stablehlo.multiply %3974, %60 : tensor<197x1024xf32>
-    %3976 = stablehlo.broadcast_in_dim %3975, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %3977 = stablehlo.broadcast_in_dim %arg389, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %3978 = stablehlo.add %3976, %3977 : tensor<197x1024xf32>
-    %3979 = stablehlo.convert %3978 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %3980 = stablehlo.reshape %3979 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3981 = stablehlo.broadcast_in_dim %arg122, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3982 = stablehlo.broadcast_in_dim %3980, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %3983 = stablehlo.multiply %3981, %3982 : tensor<1x197x1024xbf16>
-    %3984 = stablehlo.add %3983, %3892 : tensor<1x197x1024xbf16>
-    %3985 = stablehlo.convert %3984 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %3986 = stablehlo.convert %3985 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %3987 = stablehlo.reduce(%3986 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3988 = stablehlo.reshape %3987 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3989 = stablehlo.broadcast_in_dim %3988, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3990 = stablehlo.divide %3989, %15 : tensor<1x197x1xf64>
-    %3991 = stablehlo.broadcast_in_dim %3986, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %3992 = stablehlo.broadcast_in_dim %3990, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %3993 = stablehlo.subtract %3991, %3992 : tensor<1x197x1024xf64>
-    %3994 = stablehlo.multiply %3993, %3993 : tensor<1x197x1024xf64>
-    %3995 = stablehlo.reduce(%3994 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %3996 = stablehlo.reshape %3995 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %3997 = stablehlo.broadcast_in_dim %3996, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %3998 = stablehlo.divide %3997, %15 : tensor<1x197x1xf64>
-    %3999 = stablehlo.convert %3998 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4000 = stablehlo.reduce(%3985 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4001 = stablehlo.reshape %4000 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4002 = stablehlo.broadcast_in_dim %4001, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4003 = stablehlo.divide %4002, %31 : tensor<1x197x1xf32>
-    %4004 = stablehlo.broadcast_in_dim %3999, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4005 = stablehlo.add %4004, %36 : tensor<1x197x1xf32>
-    %4006 = stablehlo.rsqrt %4005 : tensor<1x197x1xf32>
-    %4007 = stablehlo.broadcast_in_dim %3985, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4008 = stablehlo.broadcast_in_dim %4003, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4009 = stablehlo.subtract %4007, %4008 : tensor<1x197x1024xf32>
-    %4010 = stablehlo.broadcast_in_dim %4009, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4011 = stablehlo.broadcast_in_dim %4006, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4012 = stablehlo.multiply %4010, %4011 : tensor<1x197x1024xf32>
-    %4013 = stablehlo.convert %arg123 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4014 = stablehlo.broadcast_in_dim %4012, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4015 = stablehlo.broadcast_in_dim %4013, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4016 = stablehlo.multiply %4014, %4015 : tensor<1x197x1024xf32>
-    %4017 = stablehlo.convert %arg124 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4018 = stablehlo.broadcast_in_dim %4016, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4019 = stablehlo.broadcast_in_dim %4017, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4020 = stablehlo.add %4018, %4019 : tensor<1x197x1024xf32>
-    %4021 = stablehlo.convert %4020 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4022 = stablehlo.reshape %4021 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4023 = stablehlo.convert %4022 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4024 = stablehlo.dot_general %4023, %arg390, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4025 = stablehlo.broadcast_in_dim %4024, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4026 = stablehlo.multiply %4025, %60 : tensor<197x1024xf32>
-    %4027 = stablehlo.broadcast_in_dim %4026, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4028 = stablehlo.broadcast_in_dim %arg391, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4029 = stablehlo.add %4027, %4028 : tensor<197x1024xf32>
-    %4030 = stablehlo.convert %4029 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4031 = stablehlo.reshape %4030 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4032 = stablehlo.dot_general %4022, %arg392, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %4033 = stablehlo.reshape %4032 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4034 = stablehlo.reshape %4033 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4035 = stablehlo.transpose %4034, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4036 = stablehlo.dot_general %4023, %arg393, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4037 = stablehlo.broadcast_in_dim %4036, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4038 = stablehlo.multiply %4037, %60 : tensor<197x1024xf32>
-    %4039 = stablehlo.broadcast_in_dim %4038, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4040 = stablehlo.broadcast_in_dim %arg394, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4041 = stablehlo.add %4039, %4040 : tensor<197x1024xf32>
-    %4042 = stablehlo.convert %4041 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4043 = stablehlo.reshape %4042 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4044 = stablehlo.reshape %4043 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4045 = stablehlo.transpose %4044, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4046 = stablehlo.reshape %4031 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4047 = stablehlo.transpose %4046, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4048 = stablehlo.transpose %4035, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %4049 = stablehlo.reshape %4047 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4050 = stablehlo.reshape %4048 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4051 = stablehlo.broadcast_in_dim %4050, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4052 = stablehlo.dot_general %4049, %4051, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %4053 = stablehlo.reshape %4052 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4054 = stablehlo.broadcast_in_dim %4053, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4055 = stablehlo.divide %4054, %92 : tensor<1x16x197x197xbf16>
-    %4056 = stablehlo.add %4055, %arg395 : tensor<1x16x197x197xbf16>
-    %4057 = stablehlo.convert %4056 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %4058 = stablehlo.reduce(%4057 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4059 = stablehlo.reshape %4058 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4060 = stablehlo.broadcast_in_dim %4057, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4061 = stablehlo.broadcast_in_dim %4059, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4062 = stablehlo.subtract %4060, %4061 : tensor<1x16x197x197xf32>
-    %4063 = stablehlo.exponential %4062 : tensor<1x16x197x197xf32>
-    %4064 = stablehlo.reduce(%4063 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4065 = stablehlo.reshape %4064 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4066 = stablehlo.broadcast_in_dim %4063, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4067 = stablehlo.broadcast_in_dim %4065, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4068 = stablehlo.divide %4066, %4067 : tensor<1x16x197x197xf32>
-    %4069 = stablehlo.convert %4068 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %4070 = stablehlo.reshape %4069 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %4071 = stablehlo.reshape %4045 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4072 = stablehlo.broadcast_in_dim %4071, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4073 = stablehlo.dot_general %4070, %4072, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4074 = stablehlo.reshape %4073 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4075 = stablehlo.transpose %4074, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %4076 = stablehlo.reshape %4075 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %4077 = stablehlo.reshape %4076 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4078 = stablehlo.convert %4077 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4079 = stablehlo.dot_general %4078, %arg396, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4080 = stablehlo.broadcast_in_dim %4079, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4081 = stablehlo.multiply %4080, %60 : tensor<197x1024xf32>
-    %4082 = stablehlo.broadcast_in_dim %4081, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4083 = stablehlo.broadcast_in_dim %arg397, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4084 = stablehlo.add %4082, %4083 : tensor<197x1024xf32>
-    %4085 = stablehlo.convert %4084 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4086 = stablehlo.reshape %4085 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4087 = stablehlo.broadcast_in_dim %arg125, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4088 = stablehlo.broadcast_in_dim %4086, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4089 = stablehlo.multiply %4087, %4088 : tensor<1x197x1024xbf16>
-    %4090 = stablehlo.add %4089, %3984 : tensor<1x197x1024xbf16>
-    %4091 = stablehlo.convert %4090 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %4092 = stablehlo.convert %4091 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %4093 = stablehlo.reduce(%4092 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4094 = stablehlo.reshape %4093 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4095 = stablehlo.broadcast_in_dim %4094, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4096 = stablehlo.divide %4095, %15 : tensor<1x197x1xf64>
-    %4097 = stablehlo.broadcast_in_dim %4092, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %4098 = stablehlo.broadcast_in_dim %4096, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %4099 = stablehlo.subtract %4097, %4098 : tensor<1x197x1024xf64>
-    %4100 = stablehlo.multiply %4099, %4099 : tensor<1x197x1024xf64>
-    %4101 = stablehlo.reduce(%4100 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4102 = stablehlo.reshape %4101 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4103 = stablehlo.broadcast_in_dim %4102, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4104 = stablehlo.divide %4103, %15 : tensor<1x197x1xf64>
-    %4105 = stablehlo.convert %4104 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4106 = stablehlo.reduce(%4091 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4107 = stablehlo.reshape %4106 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4108 = stablehlo.broadcast_in_dim %4107, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4109 = stablehlo.divide %4108, %31 : tensor<1x197x1xf32>
-    %4110 = stablehlo.broadcast_in_dim %4105, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4111 = stablehlo.add %4110, %36 : tensor<1x197x1xf32>
-    %4112 = stablehlo.rsqrt %4111 : tensor<1x197x1xf32>
-    %4113 = stablehlo.broadcast_in_dim %4091, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4114 = stablehlo.broadcast_in_dim %4109, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4115 = stablehlo.subtract %4113, %4114 : tensor<1x197x1024xf32>
-    %4116 = stablehlo.broadcast_in_dim %4115, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4117 = stablehlo.broadcast_in_dim %4112, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4118 = stablehlo.multiply %4116, %4117 : tensor<1x197x1024xf32>
-    %4119 = stablehlo.convert %arg126 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4120 = stablehlo.broadcast_in_dim %4118, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4121 = stablehlo.broadcast_in_dim %4119, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4122 = stablehlo.multiply %4120, %4121 : tensor<1x197x1024xf32>
-    %4123 = stablehlo.convert %arg127 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4124 = stablehlo.broadcast_in_dim %4122, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4125 = stablehlo.broadcast_in_dim %4123, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4126 = stablehlo.add %4124, %4125 : tensor<1x197x1024xf32>
-    %4127 = stablehlo.convert %4126 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4128 = stablehlo.reshape %4127 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4129 = stablehlo.convert %4128 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4130 = stablehlo.dot_general %4129, %arg398, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %4131 = stablehlo.broadcast_in_dim %4130, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4132 = stablehlo.multiply %4131, %170 : tensor<197x4096xf32>
-    %4133 = stablehlo.broadcast_in_dim %4132, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4134 = stablehlo.broadcast_in_dim %arg399, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %4135 = stablehlo.add %4133, %4134 : tensor<197x4096xf32>
-    %4136 = stablehlo.convert %4135 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %4137 = stablehlo.reshape %4136 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %4138 = stablehlo.multiply %4137, %cst_4 : tensor<1x197x4096xbf16>
-    %4139 = stablehlo.multiply %4137, %178 : tensor<1x197x4096xbf16>
-    %4140 = stablehlo.convert %4139 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %4141 = stablehlo.clamp %cst_5, %4140, %cst_6 : tensor<1x197x4096xf32>
-    %4142 = stablehlo.multiply %4141, %4141 : tensor<1x197x4096xf32>
-    %4143 = stablehlo.multiply %cst_7, %4142 : tensor<1x197x4096xf32>
-    %4144 = stablehlo.add %4143, %cst_8 : tensor<1x197x4096xf32>
-    %4145 = stablehlo.multiply %4144, %4142 : tensor<1x197x4096xf32>
-    %4146 = stablehlo.add %4145, %cst_9 : tensor<1x197x4096xf32>
-    %4147 = stablehlo.multiply %4146, %4142 : tensor<1x197x4096xf32>
-    %4148 = stablehlo.add %4147, %cst_10 : tensor<1x197x4096xf32>
-    %4149 = stablehlo.multiply %4148, %4142 : tensor<1x197x4096xf32>
-    %4150 = stablehlo.add %4149, %cst_11 : tensor<1x197x4096xf32>
-    %4151 = stablehlo.multiply %4150, %4142 : tensor<1x197x4096xf32>
-    %4152 = stablehlo.add %4151, %cst_12 : tensor<1x197x4096xf32>
-    %4153 = stablehlo.multiply %4152, %4142 : tensor<1x197x4096xf32>
-    %4154 = stablehlo.add %4153, %cst_13 : tensor<1x197x4096xf32>
-    %4155 = stablehlo.multiply %cst_14, %4142 : tensor<1x197x4096xf32>
-    %4156 = stablehlo.add %4155, %cst_15 : tensor<1x197x4096xf32>
-    %4157 = stablehlo.multiply %4156, %4142 : tensor<1x197x4096xf32>
-    %4158 = stablehlo.add %4157, %cst_16 : tensor<1x197x4096xf32>
-    %4159 = stablehlo.multiply %4158, %4142 : tensor<1x197x4096xf32>
-    %4160 = stablehlo.add %4159, %cst_17 : tensor<1x197x4096xf32>
-    %4161 = stablehlo.multiply %4160, %4142 : tensor<1x197x4096xf32>
-    %4162 = stablehlo.add %4161, %cst_18 : tensor<1x197x4096xf32>
-    %4163 = stablehlo.multiply %4141, %4154 : tensor<1x197x4096xf32>
-    %4164 = stablehlo.divide %4163, %4162 : tensor<1x197x4096xf32>
-    %4165 = stablehlo.clamp %cst_19, %4164, %cst_20 : tensor<1x197x4096xf32>
-    %4166 = stablehlo.convert %4165 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %4167 = stablehlo.add %4166, %cst_2 : tensor<1x197x4096xbf16>
-    %4168 = stablehlo.multiply %4167, %4138 : tensor<1x197x4096xbf16>
-    %4169 = stablehlo.reshape %4168 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %4170 = stablehlo.convert %4169 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %4171 = stablehlo.dot_general %4170, %arg400, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %4172 = stablehlo.broadcast_in_dim %4171, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4173 = stablehlo.multiply %4172, %60 : tensor<197x1024xf32>
-    %4174 = stablehlo.broadcast_in_dim %4173, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4175 = stablehlo.broadcast_in_dim %arg401, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4176 = stablehlo.add %4174, %4175 : tensor<197x1024xf32>
-    %4177 = stablehlo.convert %4176 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4178 = stablehlo.reshape %4177 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4179 = stablehlo.broadcast_in_dim %arg128, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4180 = stablehlo.broadcast_in_dim %4178, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4181 = stablehlo.multiply %4179, %4180 : tensor<1x197x1024xbf16>
-    %4182 = stablehlo.add %4181, %4090 : tensor<1x197x1024xbf16>
-    %4183 = stablehlo.convert %4182 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %4184 = stablehlo.convert %4183 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %4185 = stablehlo.reduce(%4184 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4186 = stablehlo.reshape %4185 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4187 = stablehlo.broadcast_in_dim %4186, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4188 = stablehlo.divide %4187, %15 : tensor<1x197x1xf64>
-    %4189 = stablehlo.broadcast_in_dim %4184, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %4190 = stablehlo.broadcast_in_dim %4188, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %4191 = stablehlo.subtract %4189, %4190 : tensor<1x197x1024xf64>
-    %4192 = stablehlo.multiply %4191, %4191 : tensor<1x197x1024xf64>
-    %4193 = stablehlo.reduce(%4192 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4194 = stablehlo.reshape %4193 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4195 = stablehlo.broadcast_in_dim %4194, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4196 = stablehlo.divide %4195, %15 : tensor<1x197x1xf64>
-    %4197 = stablehlo.convert %4196 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4198 = stablehlo.reduce(%4183 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4199 = stablehlo.reshape %4198 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4200 = stablehlo.broadcast_in_dim %4199, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4201 = stablehlo.divide %4200, %31 : tensor<1x197x1xf32>
-    %4202 = stablehlo.broadcast_in_dim %4197, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4203 = stablehlo.add %4202, %36 : tensor<1x197x1xf32>
-    %4204 = stablehlo.rsqrt %4203 : tensor<1x197x1xf32>
-    %4205 = stablehlo.broadcast_in_dim %4183, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4206 = stablehlo.broadcast_in_dim %4201, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4207 = stablehlo.subtract %4205, %4206 : tensor<1x197x1024xf32>
-    %4208 = stablehlo.broadcast_in_dim %4207, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4209 = stablehlo.broadcast_in_dim %4204, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4210 = stablehlo.multiply %4208, %4209 : tensor<1x197x1024xf32>
-    %4211 = stablehlo.convert %arg129 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4212 = stablehlo.broadcast_in_dim %4210, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4213 = stablehlo.broadcast_in_dim %4211, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4214 = stablehlo.multiply %4212, %4213 : tensor<1x197x1024xf32>
-    %4215 = stablehlo.convert %arg130 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4216 = stablehlo.broadcast_in_dim %4214, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4217 = stablehlo.broadcast_in_dim %4215, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4218 = stablehlo.add %4216, %4217 : tensor<1x197x1024xf32>
-    %4219 = stablehlo.convert %4218 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4220 = stablehlo.reshape %4219 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4221 = stablehlo.convert %4220 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4222 = stablehlo.dot_general %4221, %arg402, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4223 = stablehlo.broadcast_in_dim %4222, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4224 = stablehlo.multiply %4223, %60 : tensor<197x1024xf32>
-    %4225 = stablehlo.broadcast_in_dim %4224, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4226 = stablehlo.broadcast_in_dim %arg403, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4227 = stablehlo.add %4225, %4226 : tensor<197x1024xf32>
-    %4228 = stablehlo.convert %4227 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4229 = stablehlo.reshape %4228 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4230 = stablehlo.dot_general %4220, %arg404, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %4231 = stablehlo.reshape %4230 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4232 = stablehlo.reshape %4231 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4233 = stablehlo.transpose %4232, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4234 = stablehlo.dot_general %4221, %arg405, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4235 = stablehlo.broadcast_in_dim %4234, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4236 = stablehlo.multiply %4235, %60 : tensor<197x1024xf32>
-    %4237 = stablehlo.broadcast_in_dim %4236, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4238 = stablehlo.broadcast_in_dim %arg406, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4239 = stablehlo.add %4237, %4238 : tensor<197x1024xf32>
-    %4240 = stablehlo.convert %4239 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4241 = stablehlo.reshape %4240 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4242 = stablehlo.reshape %4241 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4243 = stablehlo.transpose %4242, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4244 = stablehlo.reshape %4229 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4245 = stablehlo.transpose %4244, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4246 = stablehlo.transpose %4233, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %4247 = stablehlo.reshape %4245 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4248 = stablehlo.reshape %4246 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4249 = stablehlo.broadcast_in_dim %4248, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4250 = stablehlo.dot_general %4247, %4249, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %4251 = stablehlo.reshape %4250 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4252 = stablehlo.broadcast_in_dim %4251, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4253 = stablehlo.divide %4252, %92 : tensor<1x16x197x197xbf16>
-    %4254 = stablehlo.add %4253, %arg407 : tensor<1x16x197x197xbf16>
-    %4255 = stablehlo.convert %4254 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %4256 = stablehlo.reduce(%4255 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4257 = stablehlo.reshape %4256 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4258 = stablehlo.broadcast_in_dim %4255, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4259 = stablehlo.broadcast_in_dim %4257, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4260 = stablehlo.subtract %4258, %4259 : tensor<1x16x197x197xf32>
-    %4261 = stablehlo.exponential %4260 : tensor<1x16x197x197xf32>
-    %4262 = stablehlo.reduce(%4261 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4263 = stablehlo.reshape %4262 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4264 = stablehlo.broadcast_in_dim %4261, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4265 = stablehlo.broadcast_in_dim %4263, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4266 = stablehlo.divide %4264, %4265 : tensor<1x16x197x197xf32>
-    %4267 = stablehlo.convert %4266 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %4268 = stablehlo.reshape %4267 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %4269 = stablehlo.reshape %4243 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4270 = stablehlo.broadcast_in_dim %4269, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4271 = stablehlo.dot_general %4268, %4270, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4272 = stablehlo.reshape %4271 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4273 = stablehlo.transpose %4272, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %4274 = stablehlo.reshape %4273 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %4275 = stablehlo.reshape %4274 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4276 = stablehlo.convert %4275 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4277 = stablehlo.dot_general %4276, %arg408, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4278 = stablehlo.broadcast_in_dim %4277, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4279 = stablehlo.multiply %4278, %60 : tensor<197x1024xf32>
-    %4280 = stablehlo.broadcast_in_dim %4279, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4281 = stablehlo.broadcast_in_dim %arg409, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4282 = stablehlo.add %4280, %4281 : tensor<197x1024xf32>
-    %4283 = stablehlo.convert %4282 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4284 = stablehlo.reshape %4283 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4285 = stablehlo.broadcast_in_dim %arg131, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4286 = stablehlo.broadcast_in_dim %4284, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4287 = stablehlo.multiply %4285, %4286 : tensor<1x197x1024xbf16>
-    %4288 = stablehlo.add %4287, %4182 : tensor<1x197x1024xbf16>
-    %4289 = stablehlo.convert %4288 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %4290 = stablehlo.convert %4289 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %4291 = stablehlo.reduce(%4290 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4292 = stablehlo.reshape %4291 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4293 = stablehlo.broadcast_in_dim %4292, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4294 = stablehlo.divide %4293, %15 : tensor<1x197x1xf64>
-    %4295 = stablehlo.broadcast_in_dim %4290, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %4296 = stablehlo.broadcast_in_dim %4294, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %4297 = stablehlo.subtract %4295, %4296 : tensor<1x197x1024xf64>
-    %4298 = stablehlo.multiply %4297, %4297 : tensor<1x197x1024xf64>
-    %4299 = stablehlo.reduce(%4298 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4300 = stablehlo.reshape %4299 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4301 = stablehlo.broadcast_in_dim %4300, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4302 = stablehlo.divide %4301, %15 : tensor<1x197x1xf64>
-    %4303 = stablehlo.convert %4302 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4304 = stablehlo.reduce(%4289 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4305 = stablehlo.reshape %4304 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4306 = stablehlo.broadcast_in_dim %4305, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4307 = stablehlo.divide %4306, %31 : tensor<1x197x1xf32>
-    %4308 = stablehlo.broadcast_in_dim %4303, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4309 = stablehlo.add %4308, %36 : tensor<1x197x1xf32>
-    %4310 = stablehlo.rsqrt %4309 : tensor<1x197x1xf32>
-    %4311 = stablehlo.broadcast_in_dim %4289, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4312 = stablehlo.broadcast_in_dim %4307, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4313 = stablehlo.subtract %4311, %4312 : tensor<1x197x1024xf32>
-    %4314 = stablehlo.broadcast_in_dim %4313, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4315 = stablehlo.broadcast_in_dim %4310, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4316 = stablehlo.multiply %4314, %4315 : tensor<1x197x1024xf32>
-    %4317 = stablehlo.convert %arg132 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4318 = stablehlo.broadcast_in_dim %4316, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4319 = stablehlo.broadcast_in_dim %4317, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4320 = stablehlo.multiply %4318, %4319 : tensor<1x197x1024xf32>
-    %4321 = stablehlo.convert %arg133 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4322 = stablehlo.broadcast_in_dim %4320, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4323 = stablehlo.broadcast_in_dim %4321, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4324 = stablehlo.add %4322, %4323 : tensor<1x197x1024xf32>
-    %4325 = stablehlo.convert %4324 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4326 = stablehlo.reshape %4325 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4327 = stablehlo.convert %4326 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4328 = stablehlo.dot_general %4327, %arg410, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %4329 = stablehlo.broadcast_in_dim %4328, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4330 = stablehlo.multiply %4329, %170 : tensor<197x4096xf32>
-    %4331 = stablehlo.broadcast_in_dim %4330, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4332 = stablehlo.broadcast_in_dim %arg411, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %4333 = stablehlo.add %4331, %4332 : tensor<197x4096xf32>
-    %4334 = stablehlo.convert %4333 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %4335 = stablehlo.reshape %4334 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %4336 = stablehlo.multiply %4335, %cst_4 : tensor<1x197x4096xbf16>
-    %4337 = stablehlo.multiply %4335, %178 : tensor<1x197x4096xbf16>
-    %4338 = stablehlo.convert %4337 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %4339 = stablehlo.clamp %cst_5, %4338, %cst_6 : tensor<1x197x4096xf32>
-    %4340 = stablehlo.multiply %4339, %4339 : tensor<1x197x4096xf32>
-    %4341 = stablehlo.multiply %cst_7, %4340 : tensor<1x197x4096xf32>
-    %4342 = stablehlo.add %4341, %cst_8 : tensor<1x197x4096xf32>
-    %4343 = stablehlo.multiply %4342, %4340 : tensor<1x197x4096xf32>
-    %4344 = stablehlo.add %4343, %cst_9 : tensor<1x197x4096xf32>
-    %4345 = stablehlo.multiply %4344, %4340 : tensor<1x197x4096xf32>
-    %4346 = stablehlo.add %4345, %cst_10 : tensor<1x197x4096xf32>
-    %4347 = stablehlo.multiply %4346, %4340 : tensor<1x197x4096xf32>
-    %4348 = stablehlo.add %4347, %cst_11 : tensor<1x197x4096xf32>
-    %4349 = stablehlo.multiply %4348, %4340 : tensor<1x197x4096xf32>
-    %4350 = stablehlo.add %4349, %cst_12 : tensor<1x197x4096xf32>
-    %4351 = stablehlo.multiply %4350, %4340 : tensor<1x197x4096xf32>
-    %4352 = stablehlo.add %4351, %cst_13 : tensor<1x197x4096xf32>
-    %4353 = stablehlo.multiply %cst_14, %4340 : tensor<1x197x4096xf32>
-    %4354 = stablehlo.add %4353, %cst_15 : tensor<1x197x4096xf32>
-    %4355 = stablehlo.multiply %4354, %4340 : tensor<1x197x4096xf32>
-    %4356 = stablehlo.add %4355, %cst_16 : tensor<1x197x4096xf32>
-    %4357 = stablehlo.multiply %4356, %4340 : tensor<1x197x4096xf32>
-    %4358 = stablehlo.add %4357, %cst_17 : tensor<1x197x4096xf32>
-    %4359 = stablehlo.multiply %4358, %4340 : tensor<1x197x4096xf32>
-    %4360 = stablehlo.add %4359, %cst_18 : tensor<1x197x4096xf32>
-    %4361 = stablehlo.multiply %4339, %4352 : tensor<1x197x4096xf32>
-    %4362 = stablehlo.divide %4361, %4360 : tensor<1x197x4096xf32>
-    %4363 = stablehlo.clamp %cst_19, %4362, %cst_20 : tensor<1x197x4096xf32>
-    %4364 = stablehlo.convert %4363 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %4365 = stablehlo.add %4364, %cst_2 : tensor<1x197x4096xbf16>
-    %4366 = stablehlo.multiply %4365, %4336 : tensor<1x197x4096xbf16>
-    %4367 = stablehlo.reshape %4366 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %4368 = stablehlo.convert %4367 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %4369 = stablehlo.dot_general %4368, %arg412, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %4370 = stablehlo.broadcast_in_dim %4369, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4371 = stablehlo.multiply %4370, %60 : tensor<197x1024xf32>
-    %4372 = stablehlo.broadcast_in_dim %4371, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4373 = stablehlo.broadcast_in_dim %arg413, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4374 = stablehlo.add %4372, %4373 : tensor<197x1024xf32>
-    %4375 = stablehlo.convert %4374 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4376 = stablehlo.reshape %4375 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4377 = stablehlo.broadcast_in_dim %arg134, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4378 = stablehlo.broadcast_in_dim %4376, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4379 = stablehlo.multiply %4377, %4378 : tensor<1x197x1024xbf16>
-    %4380 = stablehlo.add %4379, %4288 : tensor<1x197x1024xbf16>
-    %4381 = stablehlo.convert %4380 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %4382 = stablehlo.convert %4381 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %4383 = stablehlo.reduce(%4382 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4384 = stablehlo.reshape %4383 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4385 = stablehlo.broadcast_in_dim %4384, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4386 = stablehlo.divide %4385, %15 : tensor<1x197x1xf64>
-    %4387 = stablehlo.broadcast_in_dim %4382, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %4388 = stablehlo.broadcast_in_dim %4386, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %4389 = stablehlo.subtract %4387, %4388 : tensor<1x197x1024xf64>
-    %4390 = stablehlo.multiply %4389, %4389 : tensor<1x197x1024xf64>
-    %4391 = stablehlo.reduce(%4390 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4392 = stablehlo.reshape %4391 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4393 = stablehlo.broadcast_in_dim %4392, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4394 = stablehlo.divide %4393, %15 : tensor<1x197x1xf64>
-    %4395 = stablehlo.convert %4394 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4396 = stablehlo.reduce(%4381 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4397 = stablehlo.reshape %4396 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4398 = stablehlo.broadcast_in_dim %4397, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4399 = stablehlo.divide %4398, %31 : tensor<1x197x1xf32>
-    %4400 = stablehlo.broadcast_in_dim %4395, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4401 = stablehlo.add %4400, %36 : tensor<1x197x1xf32>
-    %4402 = stablehlo.rsqrt %4401 : tensor<1x197x1xf32>
-    %4403 = stablehlo.broadcast_in_dim %4381, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4404 = stablehlo.broadcast_in_dim %4399, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4405 = stablehlo.subtract %4403, %4404 : tensor<1x197x1024xf32>
-    %4406 = stablehlo.broadcast_in_dim %4405, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4407 = stablehlo.broadcast_in_dim %4402, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4408 = stablehlo.multiply %4406, %4407 : tensor<1x197x1024xf32>
-    %4409 = stablehlo.convert %arg135 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4410 = stablehlo.broadcast_in_dim %4408, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4411 = stablehlo.broadcast_in_dim %4409, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4412 = stablehlo.multiply %4410, %4411 : tensor<1x197x1024xf32>
-    %4413 = stablehlo.convert %arg136 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4414 = stablehlo.broadcast_in_dim %4412, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4415 = stablehlo.broadcast_in_dim %4413, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4416 = stablehlo.add %4414, %4415 : tensor<1x197x1024xf32>
-    %4417 = stablehlo.convert %4416 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4418 = stablehlo.reshape %4417 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4419 = stablehlo.convert %4418 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4420 = stablehlo.dot_general %4419, %arg414, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4421 = stablehlo.broadcast_in_dim %4420, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4422 = stablehlo.multiply %4421, %60 : tensor<197x1024xf32>
-    %4423 = stablehlo.broadcast_in_dim %4422, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4424 = stablehlo.broadcast_in_dim %arg415, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4425 = stablehlo.add %4423, %4424 : tensor<197x1024xf32>
-    %4426 = stablehlo.convert %4425 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4427 = stablehlo.reshape %4426 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4428 = stablehlo.dot_general %4418, %arg416, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %4429 = stablehlo.reshape %4428 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4430 = stablehlo.reshape %4429 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4431 = stablehlo.transpose %4430, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4432 = stablehlo.dot_general %4419, %arg417, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4433 = stablehlo.broadcast_in_dim %4432, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4434 = stablehlo.multiply %4433, %60 : tensor<197x1024xf32>
-    %4435 = stablehlo.broadcast_in_dim %4434, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4436 = stablehlo.broadcast_in_dim %arg418, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4437 = stablehlo.add %4435, %4436 : tensor<197x1024xf32>
-    %4438 = stablehlo.convert %4437 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4439 = stablehlo.reshape %4438 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4440 = stablehlo.reshape %4439 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4441 = stablehlo.transpose %4440, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4442 = stablehlo.reshape %4427 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4443 = stablehlo.transpose %4442, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4444 = stablehlo.transpose %4431, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %4445 = stablehlo.reshape %4443 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4446 = stablehlo.reshape %4444 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4447 = stablehlo.broadcast_in_dim %4446, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4448 = stablehlo.dot_general %4445, %4447, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %4449 = stablehlo.reshape %4448 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4450 = stablehlo.broadcast_in_dim %4449, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4451 = stablehlo.divide %4450, %92 : tensor<1x16x197x197xbf16>
-    %4452 = stablehlo.add %4451, %arg419 : tensor<1x16x197x197xbf16>
-    %4453 = stablehlo.convert %4452 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %4454 = stablehlo.reduce(%4453 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4455 = stablehlo.reshape %4454 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4456 = stablehlo.broadcast_in_dim %4453, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4457 = stablehlo.broadcast_in_dim %4455, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4458 = stablehlo.subtract %4456, %4457 : tensor<1x16x197x197xf32>
-    %4459 = stablehlo.exponential %4458 : tensor<1x16x197x197xf32>
-    %4460 = stablehlo.reduce(%4459 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4461 = stablehlo.reshape %4460 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4462 = stablehlo.broadcast_in_dim %4459, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4463 = stablehlo.broadcast_in_dim %4461, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4464 = stablehlo.divide %4462, %4463 : tensor<1x16x197x197xf32>
-    %4465 = stablehlo.convert %4464 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %4466 = stablehlo.reshape %4465 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %4467 = stablehlo.reshape %4441 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4468 = stablehlo.broadcast_in_dim %4467, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4469 = stablehlo.dot_general %4466, %4468, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4470 = stablehlo.reshape %4469 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4471 = stablehlo.transpose %4470, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %4472 = stablehlo.reshape %4471 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %4473 = stablehlo.reshape %4472 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4474 = stablehlo.convert %4473 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4475 = stablehlo.dot_general %4474, %arg420, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4476 = stablehlo.broadcast_in_dim %4475, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4477 = stablehlo.multiply %4476, %60 : tensor<197x1024xf32>
-    %4478 = stablehlo.broadcast_in_dim %4477, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4479 = stablehlo.broadcast_in_dim %arg421, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4480 = stablehlo.add %4478, %4479 : tensor<197x1024xf32>
-    %4481 = stablehlo.convert %4480 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4482 = stablehlo.reshape %4481 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4483 = stablehlo.broadcast_in_dim %arg137, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4484 = stablehlo.broadcast_in_dim %4482, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4485 = stablehlo.multiply %4483, %4484 : tensor<1x197x1024xbf16>
-    %4486 = stablehlo.add %4485, %4380 : tensor<1x197x1024xbf16>
-    %4487 = stablehlo.convert %4486 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %4488 = stablehlo.convert %4487 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %4489 = stablehlo.reduce(%4488 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4490 = stablehlo.reshape %4489 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4491 = stablehlo.broadcast_in_dim %4490, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4492 = stablehlo.divide %4491, %15 : tensor<1x197x1xf64>
-    %4493 = stablehlo.broadcast_in_dim %4488, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %4494 = stablehlo.broadcast_in_dim %4492, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %4495 = stablehlo.subtract %4493, %4494 : tensor<1x197x1024xf64>
-    %4496 = stablehlo.multiply %4495, %4495 : tensor<1x197x1024xf64>
-    %4497 = stablehlo.reduce(%4496 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4498 = stablehlo.reshape %4497 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4499 = stablehlo.broadcast_in_dim %4498, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4500 = stablehlo.divide %4499, %15 : tensor<1x197x1xf64>
-    %4501 = stablehlo.convert %4500 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4502 = stablehlo.reduce(%4487 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4503 = stablehlo.reshape %4502 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4504 = stablehlo.broadcast_in_dim %4503, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4505 = stablehlo.divide %4504, %31 : tensor<1x197x1xf32>
-    %4506 = stablehlo.broadcast_in_dim %4501, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4507 = stablehlo.add %4506, %36 : tensor<1x197x1xf32>
-    %4508 = stablehlo.rsqrt %4507 : tensor<1x197x1xf32>
-    %4509 = stablehlo.broadcast_in_dim %4487, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4510 = stablehlo.broadcast_in_dim %4505, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4511 = stablehlo.subtract %4509, %4510 : tensor<1x197x1024xf32>
-    %4512 = stablehlo.broadcast_in_dim %4511, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4513 = stablehlo.broadcast_in_dim %4508, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4514 = stablehlo.multiply %4512, %4513 : tensor<1x197x1024xf32>
-    %4515 = stablehlo.convert %arg138 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4516 = stablehlo.broadcast_in_dim %4514, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4517 = stablehlo.broadcast_in_dim %4515, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4518 = stablehlo.multiply %4516, %4517 : tensor<1x197x1024xf32>
-    %4519 = stablehlo.convert %arg139 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4520 = stablehlo.broadcast_in_dim %4518, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4521 = stablehlo.broadcast_in_dim %4519, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4522 = stablehlo.add %4520, %4521 : tensor<1x197x1024xf32>
-    %4523 = stablehlo.convert %4522 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4524 = stablehlo.reshape %4523 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4525 = stablehlo.convert %4524 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4526 = stablehlo.dot_general %4525, %arg422, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %4527 = stablehlo.broadcast_in_dim %4526, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4528 = stablehlo.multiply %4527, %170 : tensor<197x4096xf32>
-    %4529 = stablehlo.broadcast_in_dim %4528, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4530 = stablehlo.broadcast_in_dim %arg423, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %4531 = stablehlo.add %4529, %4530 : tensor<197x4096xf32>
-    %4532 = stablehlo.convert %4531 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %4533 = stablehlo.reshape %4532 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %4534 = stablehlo.multiply %4533, %cst_4 : tensor<1x197x4096xbf16>
-    %4535 = stablehlo.multiply %4533, %178 : tensor<1x197x4096xbf16>
-    %4536 = stablehlo.convert %4535 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %4537 = stablehlo.clamp %cst_5, %4536, %cst_6 : tensor<1x197x4096xf32>
-    %4538 = stablehlo.multiply %4537, %4537 : tensor<1x197x4096xf32>
-    %4539 = stablehlo.multiply %cst_7, %4538 : tensor<1x197x4096xf32>
-    %4540 = stablehlo.add %4539, %cst_8 : tensor<1x197x4096xf32>
-    %4541 = stablehlo.multiply %4540, %4538 : tensor<1x197x4096xf32>
-    %4542 = stablehlo.add %4541, %cst_9 : tensor<1x197x4096xf32>
-    %4543 = stablehlo.multiply %4542, %4538 : tensor<1x197x4096xf32>
-    %4544 = stablehlo.add %4543, %cst_10 : tensor<1x197x4096xf32>
-    %4545 = stablehlo.multiply %4544, %4538 : tensor<1x197x4096xf32>
-    %4546 = stablehlo.add %4545, %cst_11 : tensor<1x197x4096xf32>
-    %4547 = stablehlo.multiply %4546, %4538 : tensor<1x197x4096xf32>
-    %4548 = stablehlo.add %4547, %cst_12 : tensor<1x197x4096xf32>
-    %4549 = stablehlo.multiply %4548, %4538 : tensor<1x197x4096xf32>
-    %4550 = stablehlo.add %4549, %cst_13 : tensor<1x197x4096xf32>
-    %4551 = stablehlo.multiply %cst_14, %4538 : tensor<1x197x4096xf32>
-    %4552 = stablehlo.add %4551, %cst_15 : tensor<1x197x4096xf32>
-    %4553 = stablehlo.multiply %4552, %4538 : tensor<1x197x4096xf32>
-    %4554 = stablehlo.add %4553, %cst_16 : tensor<1x197x4096xf32>
-    %4555 = stablehlo.multiply %4554, %4538 : tensor<1x197x4096xf32>
-    %4556 = stablehlo.add %4555, %cst_17 : tensor<1x197x4096xf32>
-    %4557 = stablehlo.multiply %4556, %4538 : tensor<1x197x4096xf32>
-    %4558 = stablehlo.add %4557, %cst_18 : tensor<1x197x4096xf32>
-    %4559 = stablehlo.multiply %4537, %4550 : tensor<1x197x4096xf32>
-    %4560 = stablehlo.divide %4559, %4558 : tensor<1x197x4096xf32>
-    %4561 = stablehlo.clamp %cst_19, %4560, %cst_20 : tensor<1x197x4096xf32>
-    %4562 = stablehlo.convert %4561 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %4563 = stablehlo.add %4562, %cst_2 : tensor<1x197x4096xbf16>
-    %4564 = stablehlo.multiply %4563, %4534 : tensor<1x197x4096xbf16>
-    %4565 = stablehlo.reshape %4564 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %4566 = stablehlo.convert %4565 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %4567 = stablehlo.dot_general %4566, %arg424, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %4568 = stablehlo.broadcast_in_dim %4567, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4569 = stablehlo.multiply %4568, %60 : tensor<197x1024xf32>
-    %4570 = stablehlo.broadcast_in_dim %4569, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4571 = stablehlo.broadcast_in_dim %arg425, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4572 = stablehlo.add %4570, %4571 : tensor<197x1024xf32>
-    %4573 = stablehlo.convert %4572 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4574 = stablehlo.reshape %4573 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4575 = stablehlo.broadcast_in_dim %arg140, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4576 = stablehlo.broadcast_in_dim %4574, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4577 = stablehlo.multiply %4575, %4576 : tensor<1x197x1024xbf16>
-    %4578 = stablehlo.add %4577, %4486 : tensor<1x197x1024xbf16>
-    %4579 = stablehlo.convert %4578 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %4580 = stablehlo.convert %4579 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %4581 = stablehlo.reduce(%4580 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4582 = stablehlo.reshape %4581 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4583 = stablehlo.broadcast_in_dim %4582, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4584 = stablehlo.divide %4583, %15 : tensor<1x197x1xf64>
-    %4585 = stablehlo.broadcast_in_dim %4580, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %4586 = stablehlo.broadcast_in_dim %4584, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %4587 = stablehlo.subtract %4585, %4586 : tensor<1x197x1024xf64>
-    %4588 = stablehlo.multiply %4587, %4587 : tensor<1x197x1024xf64>
-    %4589 = stablehlo.reduce(%4588 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4590 = stablehlo.reshape %4589 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4591 = stablehlo.broadcast_in_dim %4590, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4592 = stablehlo.divide %4591, %15 : tensor<1x197x1xf64>
-    %4593 = stablehlo.convert %4592 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4594 = stablehlo.reduce(%4579 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4595 = stablehlo.reshape %4594 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4596 = stablehlo.broadcast_in_dim %4595, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4597 = stablehlo.divide %4596, %31 : tensor<1x197x1xf32>
-    %4598 = stablehlo.broadcast_in_dim %4593, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4599 = stablehlo.add %4598, %36 : tensor<1x197x1xf32>
-    %4600 = stablehlo.rsqrt %4599 : tensor<1x197x1xf32>
-    %4601 = stablehlo.broadcast_in_dim %4579, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4602 = stablehlo.broadcast_in_dim %4597, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4603 = stablehlo.subtract %4601, %4602 : tensor<1x197x1024xf32>
-    %4604 = stablehlo.broadcast_in_dim %4603, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4605 = stablehlo.broadcast_in_dim %4600, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4606 = stablehlo.multiply %4604, %4605 : tensor<1x197x1024xf32>
-    %4607 = stablehlo.convert %arg141 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4608 = stablehlo.broadcast_in_dim %4606, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4609 = stablehlo.broadcast_in_dim %4607, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4610 = stablehlo.multiply %4608, %4609 : tensor<1x197x1024xf32>
-    %4611 = stablehlo.convert %arg142 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4612 = stablehlo.broadcast_in_dim %4610, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4613 = stablehlo.broadcast_in_dim %4611, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4614 = stablehlo.add %4612, %4613 : tensor<1x197x1024xf32>
-    %4615 = stablehlo.convert %4614 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4616 = stablehlo.reshape %4615 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4617 = stablehlo.convert %4616 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4618 = stablehlo.dot_general %4617, %arg426, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4619 = stablehlo.broadcast_in_dim %4618, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4620 = stablehlo.multiply %4619, %60 : tensor<197x1024xf32>
-    %4621 = stablehlo.broadcast_in_dim %4620, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4622 = stablehlo.broadcast_in_dim %arg427, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4623 = stablehlo.add %4621, %4622 : tensor<197x1024xf32>
-    %4624 = stablehlo.convert %4623 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4625 = stablehlo.reshape %4624 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4626 = stablehlo.dot_general %4616, %arg428, contracting_dims = [1] x [0] : (tensor<197x1024xbf16>, tensor<1024x1024xbf16>) -> tensor<197x1024xbf16>
-    %4627 = stablehlo.reshape %4626 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4628 = stablehlo.reshape %4627 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4629 = stablehlo.transpose %4628, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4630 = stablehlo.dot_general %4617, %arg429, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4631 = stablehlo.broadcast_in_dim %4630, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4632 = stablehlo.multiply %4631, %60 : tensor<197x1024xf32>
-    %4633 = stablehlo.broadcast_in_dim %4632, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4634 = stablehlo.broadcast_in_dim %arg430, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4635 = stablehlo.add %4633, %4634 : tensor<197x1024xf32>
-    %4636 = stablehlo.convert %4635 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4637 = stablehlo.reshape %4636 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4638 = stablehlo.reshape %4637 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4639 = stablehlo.transpose %4638, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4640 = stablehlo.reshape %4625 : (tensor<1x197x1024xbf16>) -> tensor<1x197x16x64xbf16>
-    %4641 = stablehlo.transpose %4640, dims = [0, 2, 1, 3] : (tensor<1x197x16x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4642 = stablehlo.transpose %4629, dims = [0, 1, 3, 2] : (tensor<1x16x197x64xbf16>) -> tensor<1x16x64x197xbf16>
-    %4643 = stablehlo.reshape %4641 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4644 = stablehlo.reshape %4642 : (tensor<1x16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4645 = stablehlo.broadcast_in_dim %4644, dims = [0, 1, 2] : (tensor<16x64x197xbf16>) -> tensor<16x64x197xbf16>
-    %4646 = stablehlo.dot_general %4643, %4645, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x64xbf16>, tensor<16x64x197xbf16>) -> tensor<16x197x197xbf16>
-    %4647 = stablehlo.reshape %4646 : (tensor<16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4648 = stablehlo.broadcast_in_dim %4647, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xbf16>
-    %4649 = stablehlo.divide %4648, %92 : tensor<1x16x197x197xbf16>
-    %4650 = stablehlo.add %4649, %arg431 : tensor<1x16x197x197xbf16>
-    %4651 = stablehlo.convert %4650 : (tensor<1x16x197x197xbf16>) -> tensor<1x16x197x197xf32>
-    %4652 = stablehlo.reduce(%4651 init: %cst_1) applies stablehlo.maximum across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4653 = stablehlo.reshape %4652 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4654 = stablehlo.broadcast_in_dim %4651, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4655 = stablehlo.broadcast_in_dim %4653, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4656 = stablehlo.subtract %4654, %4655 : tensor<1x16x197x197xf32>
-    %4657 = stablehlo.exponential %4656 : tensor<1x16x197x197xf32>
-    %4658 = stablehlo.reduce(%4657 init: %cst_0) applies stablehlo.add across dimensions = [3] : (tensor<1x16x197x197xf32>, tensor<f32>) -> tensor<1x16x197xf32>
-    %4659 = stablehlo.reshape %4658 : (tensor<1x16x197xf32>) -> tensor<1x16x197x1xf32>
-    %4660 = stablehlo.broadcast_in_dim %4657, dims = [0, 1, 2, 3] : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xf32>
-    %4661 = stablehlo.broadcast_in_dim %4659, dims = [0, 1, 2, 3] : (tensor<1x16x197x1xf32>) -> tensor<1x16x197x197xf32>
-    %4662 = stablehlo.divide %4660, %4661 : tensor<1x16x197x197xf32>
-    %4663 = stablehlo.convert %4662 : (tensor<1x16x197x197xf32>) -> tensor<1x16x197x197xbf16>
-    %4664 = stablehlo.reshape %4663 : (tensor<1x16x197x197xbf16>) -> tensor<16x197x197xbf16>
-    %4665 = stablehlo.reshape %4639 : (tensor<1x16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4666 = stablehlo.broadcast_in_dim %4665, dims = [0, 1, 2] : (tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4667 = stablehlo.dot_general %4664, %4666, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<16x197x197xbf16>, tensor<16x197x64xbf16>) -> tensor<16x197x64xbf16>
-    %4668 = stablehlo.reshape %4667 : (tensor<16x197x64xbf16>) -> tensor<1x16x197x64xbf16>
-    %4669 = stablehlo.transpose %4668, dims = [0, 2, 1, 3] : (tensor<1x16x197x64xbf16>) -> tensor<1x197x16x64xbf16>
-    %4670 = stablehlo.reshape %4669 : (tensor<1x197x16x64xbf16>) -> tensor<1x197x1024xbf16>
-    %4671 = stablehlo.reshape %4670 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4672 = stablehlo.convert %4671 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4673 = stablehlo.dot_general %4672, %arg432, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x1024xf32>) -> tensor<197x1024xf32>
-    %4674 = stablehlo.broadcast_in_dim %4673, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4675 = stablehlo.multiply %4674, %60 : tensor<197x1024xf32>
-    %4676 = stablehlo.broadcast_in_dim %4675, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4677 = stablehlo.broadcast_in_dim %arg433, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4678 = stablehlo.add %4676, %4677 : tensor<197x1024xf32>
-    %4679 = stablehlo.convert %4678 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4680 = stablehlo.reshape %4679 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4681 = stablehlo.broadcast_in_dim %arg143, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4682 = stablehlo.broadcast_in_dim %4680, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4683 = stablehlo.multiply %4681, %4682 : tensor<1x197x1024xbf16>
-    %4684 = stablehlo.add %4683, %4578 : tensor<1x197x1024xbf16>
-    %4685 = stablehlo.convert %4684 : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xf32>
-    %4686 = stablehlo.convert %4685 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf64>
-    %4687 = stablehlo.reduce(%4686 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4688 = stablehlo.reshape %4687 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4689 = stablehlo.broadcast_in_dim %4688, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4690 = stablehlo.divide %4689, %15 : tensor<1x197x1xf64>
-    %4691 = stablehlo.broadcast_in_dim %4686, dims = [0, 1, 2] : (tensor<1x197x1024xf64>) -> tensor<1x197x1024xf64>
-    %4692 = stablehlo.broadcast_in_dim %4690, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1024xf64>
-    %4693 = stablehlo.subtract %4691, %4692 : tensor<1x197x1024xf64>
-    %4694 = stablehlo.multiply %4693, %4693 : tensor<1x197x1024xf64>
-    %4695 = stablehlo.reduce(%4694 init: %cst) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf64>, tensor<f64>) -> tensor<1x197xf64>
-    %4696 = stablehlo.reshape %4695 : (tensor<1x197xf64>) -> tensor<1x197x1xf64>
-    %4697 = stablehlo.broadcast_in_dim %4696, dims = [0, 1, 2] : (tensor<1x197x1xf64>) -> tensor<1x197x1xf64>
-    %4698 = stablehlo.divide %4697, %15 : tensor<1x197x1xf64>
-    %4699 = stablehlo.convert %4698 : (tensor<1x197x1xf64>) -> tensor<1x197x1xf32>
-    %4700 = stablehlo.reduce(%4685 init: %cst_0) applies stablehlo.add across dimensions = [2] : (tensor<1x197x1024xf32>, tensor<f32>) -> tensor<1x197xf32>
-    %4701 = stablehlo.reshape %4700 : (tensor<1x197xf32>) -> tensor<1x197x1xf32>
-    %4702 = stablehlo.broadcast_in_dim %4701, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4703 = stablehlo.divide %4702, %31 : tensor<1x197x1xf32>
-    %4704 = stablehlo.broadcast_in_dim %4699, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1xf32>
-    %4705 = stablehlo.add %4704, %36 : tensor<1x197x1xf32>
-    %4706 = stablehlo.rsqrt %4705 : tensor<1x197x1xf32>
-    %4707 = stablehlo.broadcast_in_dim %4685, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4708 = stablehlo.broadcast_in_dim %4703, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4709 = stablehlo.subtract %4707, %4708 : tensor<1x197x1024xf32>
-    %4710 = stablehlo.broadcast_in_dim %4709, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4711 = stablehlo.broadcast_in_dim %4706, dims = [0, 1, 2] : (tensor<1x197x1xf32>) -> tensor<1x197x1024xf32>
-    %4712 = stablehlo.multiply %4710, %4711 : tensor<1x197x1024xf32>
-    %4713 = stablehlo.convert %arg144 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4714 = stablehlo.broadcast_in_dim %4712, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4715 = stablehlo.broadcast_in_dim %4713, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4716 = stablehlo.multiply %4714, %4715 : tensor<1x197x1024xf32>
-    %4717 = stablehlo.convert %arg145 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4718 = stablehlo.broadcast_in_dim %4716, dims = [0, 1, 2] : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xf32>
-    %4719 = stablehlo.broadcast_in_dim %4717, dims = [2] : (tensor<1024xf32>) -> tensor<1x197x1024xf32>
-    %4720 = stablehlo.add %4718, %4719 : tensor<1x197x1024xf32>
-    %4721 = stablehlo.convert %4720 : (tensor<1x197x1024xf32>) -> tensor<1x197x1024xbf16>
-    %4722 = stablehlo.reshape %4721 : (tensor<1x197x1024xbf16>) -> tensor<197x1024xbf16>
-    %4723 = stablehlo.convert %4722 : (tensor<197x1024xbf16>) -> tensor<197x1024xf32>
-    %4724 = stablehlo.dot_general %4723, %arg434, contracting_dims = [1] x [0] : (tensor<197x1024xf32>, tensor<1024x4096xf32>) -> tensor<197x4096xf32>
-    %4725 = stablehlo.broadcast_in_dim %4724, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4726 = stablehlo.multiply %4725, %170 : tensor<197x4096xf32>
-    %4727 = stablehlo.broadcast_in_dim %4726, dims = [0, 1] : (tensor<197x4096xf32>) -> tensor<197x4096xf32>
-    %4728 = stablehlo.broadcast_in_dim %arg435, dims = [1] : (tensor<4096xf32>) -> tensor<197x4096xf32>
-    %4729 = stablehlo.add %4727, %4728 : tensor<197x4096xf32>
-    %4730 = stablehlo.convert %4729 : (tensor<197x4096xf32>) -> tensor<197x4096xbf16>
-    %4731 = stablehlo.reshape %4730 : (tensor<197x4096xbf16>) -> tensor<1x197x4096xbf16>
-    %4732 = stablehlo.multiply %4731, %cst_4 : tensor<1x197x4096xbf16>
-    %4733 = stablehlo.multiply %4731, %178 : tensor<1x197x4096xbf16>
-    %4734 = stablehlo.convert %4733 : (tensor<1x197x4096xbf16>) -> tensor<1x197x4096xf32>
-    %4735 = stablehlo.clamp %cst_5, %4734, %cst_6 : tensor<1x197x4096xf32>
-    %4736 = stablehlo.multiply %4735, %4735 : tensor<1x197x4096xf32>
-    %4737 = stablehlo.multiply %cst_7, %4736 : tensor<1x197x4096xf32>
-    %4738 = stablehlo.add %4737, %cst_8 : tensor<1x197x4096xf32>
-    %4739 = stablehlo.multiply %4738, %4736 : tensor<1x197x4096xf32>
-    %4740 = stablehlo.add %4739, %cst_9 : tensor<1x197x4096xf32>
-    %4741 = stablehlo.multiply %4740, %4736 : tensor<1x197x4096xf32>
-    %4742 = stablehlo.add %4741, %cst_10 : tensor<1x197x4096xf32>
-    %4743 = stablehlo.multiply %4742, %4736 : tensor<1x197x4096xf32>
-    %4744 = stablehlo.add %4743, %cst_11 : tensor<1x197x4096xf32>
-    %4745 = stablehlo.multiply %4744, %4736 : tensor<1x197x4096xf32>
-    %4746 = stablehlo.add %4745, %cst_12 : tensor<1x197x4096xf32>
-    %4747 = stablehlo.multiply %4746, %4736 : tensor<1x197x4096xf32>
-    %4748 = stablehlo.add %4747, %cst_13 : tensor<1x197x4096xf32>
-    %4749 = stablehlo.multiply %cst_14, %4736 : tensor<1x197x4096xf32>
-    %4750 = stablehlo.add %4749, %cst_15 : tensor<1x197x4096xf32>
-    %4751 = stablehlo.multiply %4750, %4736 : tensor<1x197x4096xf32>
-    %4752 = stablehlo.add %4751, %cst_16 : tensor<1x197x4096xf32>
-    %4753 = stablehlo.multiply %4752, %4736 : tensor<1x197x4096xf32>
-    %4754 = stablehlo.add %4753, %cst_17 : tensor<1x197x4096xf32>
-    %4755 = stablehlo.multiply %4754, %4736 : tensor<1x197x4096xf32>
-    %4756 = stablehlo.add %4755, %cst_18 : tensor<1x197x4096xf32>
-    %4757 = stablehlo.multiply %4735, %4748 : tensor<1x197x4096xf32>
-    %4758 = stablehlo.divide %4757, %4756 : tensor<1x197x4096xf32>
-    %4759 = stablehlo.clamp %cst_19, %4758, %cst_20 : tensor<1x197x4096xf32>
-    %4760 = stablehlo.convert %4759 : (tensor<1x197x4096xf32>) -> tensor<1x197x4096xbf16>
-    %4761 = stablehlo.add %4760, %cst_2 : tensor<1x197x4096xbf16>
-    %4762 = stablehlo.multiply %4761, %4732 : tensor<1x197x4096xbf16>
-    %4763 = stablehlo.reshape %4762 : (tensor<1x197x4096xbf16>) -> tensor<197x4096xbf16>
-    %4764 = stablehlo.convert %4763 : (tensor<197x4096xbf16>) -> tensor<197x4096xf32>
-    %4765 = stablehlo.dot_general %4764, %arg436, contracting_dims = [1] x [0] : (tensor<197x4096xf32>, tensor<4096x1024xf32>) -> tensor<197x1024xf32>
-    %4766 = stablehlo.broadcast_in_dim %4765, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4767 = stablehlo.multiply %4766, %60 : tensor<197x1024xf32>
-    %4768 = stablehlo.broadcast_in_dim %4767, dims = [0, 1] : (tensor<197x1024xf32>) -> tensor<197x1024xf32>
-    %4769 = stablehlo.broadcast_in_dim %arg437, dims = [1] : (tensor<1024xf32>) -> tensor<197x1024xf32>
-    %4770 = stablehlo.add %4768, %4769 : tensor<197x1024xf32>
-    %4771 = stablehlo.convert %4770 : (tensor<197x1024xf32>) -> tensor<197x1024xbf16>
-    %4772 = stablehlo.reshape %4771 : (tensor<197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4773 = stablehlo.broadcast_in_dim %arg146, dims = [2] : (tensor<1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4774 = stablehlo.broadcast_in_dim %4772, dims = [0, 1, 2] : (tensor<1x197x1024xbf16>) -> tensor<1x197x1024xbf16>
-    %4775 = stablehlo.multiply %4773, %4774 : tensor<1x197x1024xbf16>
-    %4776 = stablehlo.add %4775, %4684 : tensor<1x197x1024xbf16>
-    %4777 = stablehlo.slice %4776 [0:1, 1:197, 0:1024] : (tensor<1x197x1024xbf16>) -> tensor<1x196x1024xbf16>
-    %4778 = stablehlo.reduce(%4777 init: %cst_21) applies stablehlo.add across dimensions = [1] : (tensor<1x196x1024xbf16>, tensor<bf16>) -> tensor<1x1024xbf16>
-    %4779 = stablehlo.convert %cst_26 : (tensor<1xi64>) -> tensor<1xbf16>
-    %4780 = stablehlo.reshape %4779 : (tensor<1xbf16>) -> tensor<bf16>
-    %4781 = stablehlo.broadcast_in_dim %4778, dims = [0, 1] : (tensor<1x1024xbf16>) -> tensor<1x1024xbf16>
-    %4782 = stablehlo.broadcast_in_dim %4780, dims = [] : (tensor<bf16>) -> tensor<1x1024xbf16>
-    %4783 = stablehlo.divide %4781, %4782 : tensor<1x1024xbf16>
-    %4784 = stablehlo.convert %4783 : (tensor<1x1024xbf16>) -> tensor<1x1024xf32>
-    %4785 = stablehlo.convert %4784 : (tensor<1x1024xf32>) -> tensor<1x1024xf64>
-    %4786 = stablehlo.reduce(%4785 init: %cst) applies stablehlo.add across dimensions = [1] : (tensor<1x1024xf64>, tensor<f64>) -> tensor<1xf64>
-    %4787 = stablehlo.reshape %4786 : (tensor<1xf64>) -> tensor<1x1xf64>
-    %4788 = stablehlo.broadcast_in_dim %4787, dims = [0, 1] : (tensor<1x1xf64>) -> tensor<1x1xf64>
-    %4789 = stablehlo.broadcast_in_dim %13, dims = [] : (tensor<f64>) -> tensor<1x1xf64>
-    %4790 = stablehlo.divide %4788, %4789 : tensor<1x1xf64>
-    %4791 = stablehlo.broadcast_in_dim %4785, dims = [0, 1] : (tensor<1x1024xf64>) -> tensor<1x1024xf64>
-    %4792 = stablehlo.broadcast_in_dim %4790, dims = [0, 1] : (tensor<1x1xf64>) -> tensor<1x1024xf64>
-    %4793 = stablehlo.subtract %4791, %4792 : tensor<1x1024xf64>
-    %4794 = stablehlo.multiply %4793, %4793 : tensor<1x1024xf64>
-    %4795 = stablehlo.reduce(%4794 init: %cst) applies stablehlo.add across dimensions = [1] : (tensor<1x1024xf64>, tensor<f64>) -> tensor<1xf64>
-    %4796 = stablehlo.reshape %4795 : (tensor<1xf64>) -> tensor<1x1xf64>
-    %4797 = stablehlo.broadcast_in_dim %4796, dims = [0, 1] : (tensor<1x1xf64>) -> tensor<1x1xf64>
-    %4798 = stablehlo.divide %4797, %4789 : tensor<1x1xf64>
-    %4799 = stablehlo.convert %4798 : (tensor<1x1xf64>) -> tensor<1x1xf32>
-    %4800 = stablehlo.reduce(%4784 init: %cst_0) applies stablehlo.add across dimensions = [1] : (tensor<1x1024xf32>, tensor<f32>) -> tensor<1xf32>
-    %4801 = stablehlo.reshape %4800 : (tensor<1xf32>) -> tensor<1x1xf32>
-    %4802 = stablehlo.broadcast_in_dim %4801, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x1xf32>
-    %4803 = stablehlo.broadcast_in_dim %29, dims = [] : (tensor<f32>) -> tensor<1x1xf32>
-    %4804 = stablehlo.divide %4802, %4803 : tensor<1x1xf32>
-    %4805 = stablehlo.broadcast_in_dim %4799, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x1xf32>
-    %4806 = stablehlo.broadcast_in_dim %34, dims = [] : (tensor<f32>) -> tensor<1x1xf32>
-    %4807 = stablehlo.add %4805, %4806 : tensor<1x1xf32>
-    %4808 = stablehlo.rsqrt %4807 : tensor<1x1xf32>
-    %4809 = stablehlo.broadcast_in_dim %4784, dims = [0, 1] : (tensor<1x1024xf32>) -> tensor<1x1024xf32>
-    %4810 = stablehlo.broadcast_in_dim %4804, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x1024xf32>
-    %4811 = stablehlo.subtract %4809, %4810 : tensor<1x1024xf32>
-    %4812 = stablehlo.broadcast_in_dim %4811, dims = [0, 1] : (tensor<1x1024xf32>) -> tensor<1x1024xf32>
-    %4813 = stablehlo.broadcast_in_dim %4808, dims = [0, 1] : (tensor<1x1xf32>) -> tensor<1x1024xf32>
-    %4814 = stablehlo.multiply %4812, %4813 : tensor<1x1024xf32>
-    %4815 = stablehlo.convert %arg147 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4816 = stablehlo.broadcast_in_dim %4814, dims = [0, 1] : (tensor<1x1024xf32>) -> tensor<1x1024xf32>
-    %4817 = stablehlo.broadcast_in_dim %4815, dims = [1] : (tensor<1024xf32>) -> tensor<1x1024xf32>
-    %4818 = stablehlo.multiply %4816, %4817 : tensor<1x1024xf32>
-    %4819 = stablehlo.convert %arg148 : (tensor<1024xbf16>) -> tensor<1024xf32>
-    %4820 = stablehlo.broadcast_in_dim %4818, dims = [0, 1] : (tensor<1x1024xf32>) -> tensor<1x1024xf32>
-    %4821 = stablehlo.broadcast_in_dim %4819, dims = [1] : (tensor<1024xf32>) -> tensor<1x1024xf32>
-    %4822 = stablehlo.add %4820, %4821 : tensor<1x1024xf32>
-    %4823 = stablehlo.convert %4822 : (tensor<1x1024xf32>) -> tensor<1x1024xbf16>
-    %4824 = stablehlo.convert %4823 : (tensor<1x1024xbf16>) -> tensor<1x1024xf32>
-    %4825 = stablehlo.dot_general %4824, %arg438, contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x1000xf32>) -> tensor<1x1000xf32>
-    %4826 = stablehlo.broadcast_in_dim %4825, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %4827 = stablehlo.broadcast_in_dim %58, dims = [] : (tensor<f32>) -> tensor<1x1000xf32>
-    %4828 = stablehlo.multiply %4826, %4827 : tensor<1x1000xf32>
-    %4829 = stablehlo.broadcast_in_dim %4828, dims = [0, 1] : (tensor<1x1000xf32>) -> tensor<1x1000xf32>
-    %4830 = stablehlo.broadcast_in_dim %arg439, dims = [1] : (tensor<1000xf32>) -> tensor<1x1000xf32>
-    %4831 = stablehlo.add %4829, %4830 : tensor<1x1000xf32>
-    %4832 = stablehlo.convert %4831 : (tensor<1x1000xf32>) -> tensor<1x1000xbf16>
-    return %4832 : tensor<1x1000xbf16>
-  }
-}
diff --git a/mlir_tests/pytests/test_autoencoder_linear.py b/mlir_tests/pytests/test_autoencoder_linear.py
deleted file mode 100644
index 23d567e7..00000000
--- a/mlir_tests/pytests/test_autoencoder_linear.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_autoencoder_linear():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/Autoencoder (linear).mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "Autoencoder (linear)"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_beit_base_patch16_224.py b/mlir_tests/pytests/test_beit_base_patch16_224.py
deleted file mode 100644
index 336a1a98..00000000
--- a/mlir_tests/pytests/test_beit_base_patch16_224.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_beit_base_patch16_224():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open(
-        "mlir_tests/microsoftbeit-base-patch16-224.mlir", "r", encoding="utf-8"
-    ) as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "microsoft/beit-base-patch16-224"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_beit_large_patch16_224.py b/mlir_tests/pytests/test_beit_large_patch16_224.py
deleted file mode 100644
index 5e59292e..00000000
--- a/mlir_tests/pytests/test_beit_large_patch16_224.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_beit_large_patch16_224():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open(
-        "mlir_tests/microsoftbeit-large-patch16-224.mlir", "r", encoding="utf-8"
-    ) as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "microsoft/beit-large-patch16-224"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_detr.py b/mlir_tests/pytests/test_detr.py
deleted file mode 100644
index a3035cd6..00000000
--- a/mlir_tests/pytests/test_detr.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_detr():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/DETR.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "DETR"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_distilbert_base_uncased.py b/mlir_tests/pytests/test_distilbert_base_uncased.py
deleted file mode 100644
index 3da2419d..00000000
--- a/mlir_tests/pytests/test_distilbert_base_uncased.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_distilbert_base_uncased():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/distilbert-base-uncased.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "distilbert-base-uncased"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_glpn_kitti.py b/mlir_tests/pytests/test_glpn_kitti.py
deleted file mode 100644
index e840f2d3..00000000
--- a/mlir_tests/pytests/test_glpn_kitti.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_glpn_kitti():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/GLPN-KITTI.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "GLPN-KITTI"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_mgp_str_base.py b/mlir_tests/pytests/test_mgp_str_base.py
deleted file mode 100644
index 6b9e3634..00000000
--- a/mlir_tests/pytests/test_mgp_str_base.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_mgp_str_base():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open(
-        "mlir_tests/alibaba-damomgp-str-base.mlir", "r", encoding="utf-8"
-    ) as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "alibaba-damo/mgp-str-base"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_mlpmixer.py b/mlir_tests/pytests/test_mlpmixer.py
deleted file mode 100644
index 12cc5b3e..00000000
--- a/mlir_tests/pytests/test_mlpmixer.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_mlpmixer():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/MLPMixer.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "MLPMixer"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_mnist.py b/mlir_tests/pytests/test_mnist.py
deleted file mode 100644
index 0503d401..00000000
--- a/mlir_tests/pytests/test_mnist.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_mnist():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/MNIST.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "Mnist"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_mobilenetssd.py b/mlir_tests/pytests/test_mobilenetssd.py
deleted file mode 100644
index 7bbd47af..00000000
--- a/mlir_tests/pytests/test_mobilenetssd.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_mobilenetssd():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/MobileNetSSD.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "MobileNetSSD"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_mobilenetv2.py b/mlir_tests/pytests/test_mobilenetv2.py
deleted file mode 100644
index f284d7f7..00000000
--- a/mlir_tests/pytests/test_mobilenetv2.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_mobilenetv2():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/MobileNetV2.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "MobileNetV2"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_openposev2.py b/mlir_tests/pytests/test_openposev2.py
deleted file mode 100644
index 3a0c79a4..00000000
--- a/mlir_tests/pytests/test_openposev2.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_openposev2():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/OpenPose V2.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "OpenPose V2"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_perceiverio.py b/mlir_tests/pytests/test_perceiverio.py
deleted file mode 100644
index 3c63e837..00000000
--- a/mlir_tests/pytests/test_perceiverio.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_perceiverio():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/Perceiver IO.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "Perceiver IO"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_resnet18.py b/mlir_tests/pytests/test_resnet18.py
deleted file mode 100644
index 8d11d7d9..00000000
--- a/mlir_tests/pytests/test_resnet18.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_resnet18():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/ResNet18.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "ResNet18"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_resnet50.py b/mlir_tests/pytests/test_resnet50.py
deleted file mode 100644
index 6f40dea2..00000000
--- a/mlir_tests/pytests/test_resnet50.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_resnet50():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/ResNet50.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "ResNet50"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_segformer.py b/mlir_tests/pytests/test_segformer.py
deleted file mode 100644
index 0464aabe..00000000
--- a/mlir_tests/pytests/test_segformer.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_segformer():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/SegFormer.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "SegFormer"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_squeezebert.py b/mlir_tests/pytests/test_squeezebert.py
deleted file mode 100644
index 305328ed..00000000
--- a/mlir_tests/pytests/test_squeezebert.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_squeezebert():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/SqueezeBERT.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "SqueezeBERT"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_vilt.py b/mlir_tests/pytests/test_vilt.py
deleted file mode 100644
index a76f3082..00000000
--- a/mlir_tests/pytests/test_vilt.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_vilt():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/ViLT.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "ViLT"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/mlir_tests/pytests/test_yolov3.py b/mlir_tests/pytests/test_yolov3.py
deleted file mode 100644
index 4759625c..00000000
--- a/mlir_tests/pytests/test_yolov3.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-def test_yolov3():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("mlir_tests/YOLOV3.mlir", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "YOLOV3"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {test_name} - {compile_depth}")
-
-    clear_cache()
diff --git a/tests/models/resnet/test_resnet.py b/tests/models/resnet/test_resnet.py
index c51c5718..6dd402af 100644
--- a/tests/models/resnet/test_resnet.py
+++ b/tests/models/resnet/test_resnet.py
@@ -5,7 +5,7 @@
 import torchvision
 import pytest
 from tests.utils import ModelTester
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
+from tt_torch.tools.utils import CompilerConfig, CompileDepth
 
 
 class ThisTester(ModelTester):
@@ -35,7 +35,7 @@ def test_resnet(record_property, mode, op_by_op):
     cc.consteval_parameters = True
     if op_by_op:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
-    cc.op_by_op_backend = OpByOpBackend.STABLEHLO
+
     tester = ThisTester(
         model_name,
         mode,
diff --git a/tt_torch/dynamo/executor.py b/tt_torch/dynamo/executor.py
index ed6d6a65..07c69ed9 100644
--- a/tt_torch/dynamo/executor.py
+++ b/tt_torch/dynamo/executor.py
@@ -66,6 +66,7 @@ def __init__(
         required_atol=1e-2,
     ):
         self.gm = gm
+        self.binary = None
         if graph_constants is not None:
             self.graph_constants = (
                 (graph_constants,)
@@ -79,7 +80,6 @@ def __init__(
         self.compiler_config = compiler_config
         self.required_atol = required_atol
         self.required_pcc = required_pcc
-        self.binary = None
 
         # Dictionary to keep track of the type conversion for unsupported hardware
         # types and use it to convert the input arguments to supported types.
@@ -89,6 +89,13 @@ def __init__(
             torch.float64: torch.float32,
         }
 
+    def register_intermediate_callback(self, callback):
+        if not is_runtime_debug_enabled():
+            raise RuntimeError(
+                "Runtime debug is required to use intermediate callbacks. Please recompile this project with -DTT_RUNTIME_DEBUG=ON."
+            )
+        tt_mlir.DebugHooks.get_debug_hooks(callback)
+
     def typecast_inputs(self, inputs):
         new_inputs = ()
         for input in inputs:
@@ -112,6 +119,9 @@ def typecast_inputs(self, inputs):
             new_inputs = new_inputs + ((input),)
         return new_inputs
 
+    def set_binary(self, binary):
+        self.binary = binary
+
     def __call__(self, *inputs):
         if self.compiler_config.compile_depth != CompileDepth.EXECUTE:
             assert (
@@ -125,9 +135,6 @@ def __call__(self, *inputs):
             inputs = inputs + self.graph_constants
         return tt_mlir.run(inputs, self.binary)
 
-    def set_binary(self, binary):
-        self.binary = binary
-
 
 class OpByOpExecutor(Executor):
     def __init__(
diff --git a/tt_torch/dynamo/test_mlir.py b/tt_torch/dynamo/test_mlir.py
deleted file mode 100644
index 0d644396..00000000
--- a/tt_torch/dynamo/test_mlir.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
-#
-# SPDX-License-Identifier: Apache-2.0
-import os
-import glob
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import (
-    CompilerConfig,
-    CompileDepth,
-    OpByOpBackend,
-)
-import re
-import torch
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {file}")
-        except Exception as e:
-            print(f"Error removing {file}: {e}")
-
-
-# The tests dictionary with model names and mlir paths
-tests = {
-    "mgp_str_base": {
-        "model_name": "alibaba-damo/mgp-str-base",
-        "mlir_path": "mlir_tests/alibaba-damomgp-str-base.mlir",
-    },
-    "autoencoder_linear": {
-        "model_name": "Autoencoder (linear)",
-        "mlir_path": "mlir_tests/Autoencoder (linear).mlir",
-    },
-    "detr": {
-        "model_name": "DETR",
-        "mlir_path": "mlir_tests/DETR.mlir",
-    },
-    "distilbert-base-uncased": {
-        "model_name": "distilbert-base-uncased",
-        "mlir_path": "mlir_tests/distilbert-base-uncased.mlir",
-    },
-    "glpn-kitti": {
-        "model_name": "GLPN-KITTI",
-        "mlir_path": "mlir_tests/GLPN-KITTI.mlir",
-    },
-    "beit-base-patch16-224": {
-        "model_name": "microsoft/beit-base-patch16-224",
-        "mlir_path": "mlir_tests/microsoftbeit-base-patch16-224.mlir",
-    },
-    "beit-large-patch16-224": {
-        "model_name": "microsoft/beit-large-patch16-224",
-        "mlir_path": "mlir_tests/microsoftbeit-large-patch16-224.mlir",
-    },
-    "MLPMixer": {
-        "model_name": "MLPMixer",
-        "mlir_path": "mlir_tests/MLPMixer.mlir",
-    },
-    "MNIST": {
-        "model_name": "Mnist",
-        "mlir_path": "mlir_tests/MNIST.mlir",
-    },
-    "MobileNetSSD": {
-        "model_name": "MobileNetSSD",
-        "mlir_path": "mlir_tests/MobileNetSSD.mlir",
-    },
-    "MobileNetV2": {
-        "model_name": "MobileNetV2",
-        "mlir_path": "mlir_tests/MobileNetV2.mlir",
-    },
-    "OpenPoseV2": {
-        "model_name": "OpenPose V2",
-        "mlir_path": "mlir_tests/OpenPose V2.mlir",
-    },
-    "PerceiverIO": {
-        "model_name": "Perceiver IO",
-        "mlir_path": "mlir_tests/Perceiver IO.mlir",
-    },
-    "ResNet18": {
-        "model_name": "ResNet18",
-        "mlir_path": "mlir_tests/ResNet18.mlir",
-    },
-    "ResNet50": {
-        "model_name": "ResNet50",
-        "mlir_path": "mlir_tests/ResNet50.mlir",
-    },
-    "SegFormer": {
-        "model_name": "SegFormer",
-        "mlir_path": "mlir_tests/SegFormer.mlir",
-    },
-    "SqueezeBERT": {
-        "model_name": "SqueezeBERT",
-        "mlir_path": "mlir_tests/SqueezeBERT.mlir",
-    },
-    "ViLT": {
-        "model_name": "ViLT",
-        "mlir_path": "mlir_tests/ViLT.mlir",
-    },
-    "YOLOV3": {
-        "model_name": "YOLOV3",
-        "mlir_path": "mlir_tests/YOLOV3.mlir",
-    },
-}
-
-
-def generate_test_file(model_name, mlir_path, test_name):
-    test_name_new = (
-        test_name.replace("/", "_").replace(" ", "_").replace("-", "_").lower()
-    )
-    test_file_name = f"mlir_tests/pytests/test_{test_name_new}.py"
-    with open(test_file_name, "w") as f:
-        f.write(
-            f"""import pytest
-import os
-from tt_torch.dynamo.backend import backend
-from tt_torch.dynamo.shlo_backend import generate_random_inputs_for_shlo
-from tt_torch.tools.utils import CompilerConfig, CompileDepth, OpByOpBackend
-
-os.environ["HF_HOME"] = "/localdev/ddilbaz/cache"
-
-def clear_cache():
-    cache_path = "/localdev/ddilbaz/cache/*"
-    files = glob.glob(cache_path)
-    for file in files:
-        try:
-            os.remove(file)
-            print(f"Removed cache file: {{file}}")
-        except Exception as e:
-            print(f"Error removing {{file}}: {{e}}")
-
-def test_{test_name_new}():
-    mlir_code = ""
-    compile_depths = [
-        CompileDepth.COMPILE_OP_BY_OP,
-        CompileDepth.EXECUTE,
-        CompileDepth.EXECUTE_OP_BY_OP,
-    ]
-
-    # Read the MLIR file content
-    with open("{mlir_path}", "r", encoding="utf-8") as file:
-        mlir_code = file.read()
-
-    for compile_depth in compile_depths:
-        compiler_config = CompilerConfig()
-        compiler_config.compile_depth = compile_depth
-        compiler_config.op_by_op_backend = OpByOpBackend.STABLEHLO
-        compiler_config.model_name = "{model_name}"
-
-        inputs = generate_random_inputs_for_shlo(mlir_code)
-        executor = backend(mlir_code, inputs, compiler_config)
-        result = executor(*inputs)
-        print(f"SUCCESS: {{test_name}} - {{compile_depth}}")
-
-    clear_cache()
-"""
-        )
-    print(f"Generated test file: {test_file_name}")
-
-
-if __name__ == "__main__":
-    # Iterate over the tests dictionary and generate a Pytest file for each model
-    for test_name, test_info in tests.items():
-        model_name = test_info["model_name"]
-        mlir_path = test_info["mlir_path"]
-        generate_test_file(model_name, mlir_path, test_name)
diff --git a/tt_torch/dynamo/torch_backend.py b/tt_torch/dynamo/torch_backend.py
index 41c0f6b5..bf49b90f 100644
--- a/tt_torch/dynamo/torch_backend.py
+++ b/tt_torch/dynamo/torch_backend.py
@@ -32,6 +32,15 @@
 #########################################################
 
 
+def verify_ir(module):
+    def verify_op(op):
+        if hasattr(op, "verify"):
+            op.verify()
+        return torch_mlir.ir.WalkResult.ADVANCE
+
+    module.operation.walk(verify_op)
+
+
 class TTContextCache(ContextCache):
     def get_node_location(self, node: torch.fx.Node) -> Optional[Location]:
         return Location.name(node.name, context=self._c)
@@ -48,15 +57,6 @@ def import_graph(graph: torch.fx.GraphModule):
     return importer.module
 
 
-def verify_ir(module):
-    def verify_op(op):
-        if hasattr(op, "verify"):
-            op.verify()
-        return torch_mlir.ir.WalkResult.ADVANCE
-
-    module.operation.walk(verify_op)
-
-
 def lower_to_stable_hlo(module, op=None, enable_ir_printing=False):
     run_pipeline_with_repro_report(
         module,
@@ -101,13 +101,6 @@ def __init__(
             compiler_config = CompilerConfig()
         self.compiler_config = compiler_config
 
-    def register_intermediate_callback(self, callback):
-        if not is_runtime_debug_enabled():
-            raise RuntimeError(
-                "Runtime debug is required to use intermediate callbacks. Please recompile this project with -DTT_RUNTIME_DEBUG=ON."
-            )
-        tt_mlir.DebugHooks.get_debug_hooks(callback)
-
     def is_node_valid(self, node):
         if not isinstance(node.target, torch._ops.OpOverload):
             if "getitem" not in name:
@@ -119,107 +112,10 @@ def get_node_name(self, node):
         name = node.target.name() if hasattr(node.target, "name") else node.name
         return name
 
-    def run_gm_op_by_op(self, *inputs):
-        node_to_tensor = {}
-        input_index = 0
-        outputs = []
-        num_nodes = len(self.gm.graph.nodes)
-        out_degree = {}
-        for idx, node in enumerate(self.gm.graph.nodes):
-            print(f"Compiling {idx}/{num_nodes}: {node.target}")
-            out_degree[node] = len(node.users)
-            if node.op == "placeholder":
-                node_to_tensor[node] = inputs[input_index]
-                input_index += 1
-            elif node.op == "get_attr":
-                for buffer in self.gm.named_buffers():
-                    if buffer[0] == node.target:
-                        node_to_tensor[node] = buffer[1]
-                        break
-            elif node.op == "call_function":
-                args = []
-                for arg in node.args:
-                    if isinstance(arg, torch.fx.node.Node):
-                        args.append(node_to_tensor[arg])
-                    elif isinstance(arg, list):
-                        args.append(
-                            [
-                                node_to_tensor[a]
-                                if isinstance(a, torch.fx.node.Node)
-                                else a
-                                for a in arg
-                            ]
-                        )
-                    else:
-                        args.append(arg)
-                try:
-                    binary, op = self.compile_op(node, *args, **node.kwargs)
-                except Exception as e:
-                    binary = None
-                    print(f"Failed to compile {idx}/{num_nodes}: {node.target}: {e}")
-
-                if (
-                    self.compiler_config.compile_depth == CompileDepth.EXECUTE_OP_BY_OP
-                    and binary is not None
-                ):
-                    try:
-                        calculated, runtime_stack_dump = self.run_op(binary, *args)
-                        self.compiler_config.unique_ops[
-                            op.unique_key()
-                        ].runtime_stack_dump = runtime_stack_dump
-
-                        print(f"Ran: {idx}/{num_nodes}: {node.target}")
-                        if calculated is None:
-                            raise ValueError("Failed to execute")
-                        op.compilation_status = OpCompilationStatus.EXECUTED
-                        tensor = node.target(*args, **node.kwargs)
-                        if self.compiler_config.verify_op_by_op:
-                            atol = calculate_atol(calculated, tensor)
-                            op.atol = atol
-                            if atol > self.required_atol:
-                                print(f"atol too high for {idx}: {atol}")
-                            pcc = calculate_pcc(calculated, tensor)
-                            op.pcc = pcc
-                            if pcc < self.required_pcc:
-                                print(f"pcc too low for {idx}: {pcc}")
-                    except Exception as e:
-                        print(
-                            f"Failed to execute {idx}/{num_nodes}: {node.target}: {e}"
-                        )
-                        tensor = node.target(*args, **node.kwargs)
-                else:
-                    tensor = node.target(*args, **node.kwargs)
-                node_to_tensor[node] = tensor
-            elif node.op == "output":
-                args = node.args[0]
-                output_tensors = [node_to_tensor[arg] for arg in args]
-                outputs = output_tensors
-            args_set = set()
-            for arg in node.args:
-                if arg in args_set:
-                    continue
-                args_set.add(arg)
-                if isinstance(arg, torch.fx.node.Node):
-                    out_degree[arg] -= 1
-                    if out_degree[arg] == 0 and arg.op != "output":
-                        del node_to_tensor[arg]
-                        out_degree.pop(arg)
-
-        self.compiler_config.save_unique_ops()
-        if self.execute_process is not None:
-            self.execute_process.terminate()
-            self.execute_process = None
-        if self.stderror_redirected:
-            os.unlink(self.file_stderr.name)
-            self.stderror_redirected = False
-
-        return outputs
-
     def get_stable_hlo_graph(self, node, inputs, **kwargs):
 
         input_shapes_and_constants = self.get_input_shapes_and_constants(inputs)
-        if not self.is_node_valid(node):
-            return None, None
+
         name = node.target.name() if hasattr(node.target, "name") else node.name
         if not isinstance(node.target, torch._ops.OpOverload):
             if "getitem" not in name:
@@ -311,13 +207,108 @@ def get_stable_hlo_graph(self, node, inputs, **kwargs):
             op.output_shapes.append([dim for dim in out.shape])
 
         module = import_graph(graph)
-        verify_ir(module)
         op.compilation_status = OpCompilationStatus.CONVERTED_TO_TORCH_IR
         op.add_torch_ir_graph(module.operation.get_asm())
         lower_to_stable_hlo(module, op=op)
         op.add_stable_hlo_graph(module.operation.get_asm())
         return module, op
 
+    def run_gm_op_by_op(self, *inputs):
+        node_to_tensor = {}
+        input_index = 0
+        outputs = []
+        num_nodes = len(self.gm.graph.nodes)
+        out_degree = {}
+        for idx, node in enumerate(self.gm.graph.nodes):
+            print(f"Compiling {idx}/{num_nodes}: {node.target}")
+            out_degree[node] = len(node.users)
+            if node.op == "placeholder":
+                node_to_tensor[node] = inputs[input_index]
+                input_index += 1
+            elif node.op == "get_attr":
+                for buffer in self.gm.named_buffers():
+                    if buffer[0] == node.target:
+                        node_to_tensor[node] = buffer[1]
+                        break
+            elif node.op == "call_function":
+                args = []
+                for arg in node.args:
+                    if isinstance(arg, torch.fx.node.Node):
+                        args.append(node_to_tensor[arg])
+                    elif isinstance(arg, list):
+                        args.append(
+                            [
+                                node_to_tensor[a]
+                                if isinstance(a, torch.fx.node.Node)
+                                else a
+                                for a in arg
+                            ]
+                        )
+                    else:
+                        args.append(arg)
+                try:
+                    binary, op = self.compile_op(node, *args, **node.kwargs)
+                except Exception as e:
+                    binary = None
+                    print(f"Failed to compile {idx}/{num_nodes}: {node.target}: {e}")
+
+                if (
+                    self.compiler_config.compile_depth == CompileDepth.EXECUTE_OP_BY_OP
+                    and binary is not None
+                ):
+                    try:
+                        calculated, runtime_stack_dump = self.run_op(binary, *args)
+                        self.compiler_config.unique_ops[
+                            op.unique_key()
+                        ].runtime_stack_dump = runtime_stack_dump
+
+                        print(f"Ran: {idx}/{num_nodes}: {node.target}")
+                        if calculated is None:
+                            raise ValueError("Failed to execute")
+                        op.compilation_status = OpCompilationStatus.EXECUTED
+                        tensor = node.target(*args, **node.kwargs)
+                        if self.compiler_config.verify_op_by_op:
+                            atol = calculate_atol(calculated, tensor)
+                            op.atol = atol
+                            if atol > self.required_atol:
+                                print(f"atol too high for {idx}: {atol}")
+                            pcc = calculate_pcc(calculated, tensor)
+                            op.pcc = pcc
+                            if pcc < self.required_pcc:
+                                print(f"pcc too low for {idx}: {pcc}")
+                    except Exception as e:
+                        print(
+                            f"Failed to execute {idx}/{num_nodes}: {node.target}: {e}"
+                        )
+                        tensor = node.target(*args, **node.kwargs)
+                else:
+                    tensor = node.target(*args, **node.kwargs)
+                node_to_tensor[node] = tensor
+            elif node.op == "output":
+                args = node.args[0]
+                output_tensors = [node_to_tensor[arg] for arg in args]
+                outputs = output_tensors
+            args_set = set()
+            for arg in node.args:
+                if arg in args_set:
+                    continue
+                args_set.add(arg)
+                if isinstance(arg, torch.fx.node.Node):
+                    out_degree[arg] -= 1
+                    if out_degree[arg] == 0 and arg.op != "output":
+                        del node_to_tensor[arg]
+                        out_degree.pop(arg)
+
+        self.compiler_config.save_unique_ops()
+        if self.execute_process is not None:
+            self.execute_process.terminate()
+            self.execute_process = None
+        if self.stderror_redirected:
+            os.unlink(self.file_stderr.name)
+            self.stderror_redirected = False
+
+        return outputs
+
     def __call__(self, *inputs):
         inputs = self.typecast_inputs(inputs)
         if self.compiler_config.compile_depth in (